def test_small_input():
    """Smoke-test read simulation on a reference of only ten bases.

    The test contains no assertion: it passes as long as
    ``generator.simulate_read`` returns without raising.
    """
    error_model = kde.KDErrorModel('data/ecoli.npz')

    # Build the ten-nucleotide reference record step by step.
    tiny_sequence = Seq(str('AAAAACCCCC'), IUPAC.unambiguous_dna)
    tiny_genome = SeqRecord(
        tiny_sequence,
        id='my_genome',
        description='test genome',
    )

    generator.simulate_read(tiny_genome, error_model, 1, 0)
def test_kde_short():
    """Check the read pair simulated from a 500-base genome with the KDE model.

    Both the ``random`` and ``numpy.random`` generators are seeded with 42,
    then the forward and reverse read sequences are concatenated and
    compared against a fixed expected string.
    """
    # NOTE(review): the original indentation was lost; this assumes the whole
    # test body sat under the Python 3 check (the expected string depends on
    # the seeded generators) - confirm against the upstream file.
    if sys.version_info <= (3, ):
        return

    random.seed(42)
    np.random.seed(42)

    error_model = kde.KDErrorModel('data/ecoli.npz')
    repeated_sequence = Seq(str('AAACC' * 100), IUPAC.unambiguous_dna)
    genome_record = SeqRecord(
        repeated_sequence,
        id='my_genome',
        description='test genome',
    )

    read_pair = generator.simulate_read(genome_record, error_model, 1)
    forward_bases = str(read_pair[0].seq)
    reverse_bases = str(read_pair[1].seq)
    big_read = ''.join((forward_bases, reverse_bases))

    assert big_read == 'ACCAAACCAAACCAAACCAAGGTTTGGTTTGGTTTGGTGT'
def test_basic():
    """Check the tail of the read pair simulated with the basic error model.

    Both the ``random`` and ``numpy.random`` generators are seeded with 42,
    a read pair is simulated from a 1000-base genome, and the last 15
    characters of the concatenated forward and reverse sequences are
    compared against a fixed expected string.
    """
    # NOTE(review): the original indentation was lost; this assumes the whole
    # test body sat under the Python 3 check (the expected string depends on
    # the seeded generators) - confirm against the upstream file.
    if sys.version_info <= (3, ):
        return

    random.seed(42)
    np.random.seed(42)

    error_model = basic.BasicErrorModel()
    repeated_sequence = Seq(str('AAAAACCCCC' * 100), IUPAC.unambiguous_dna)
    genome_record = SeqRecord(
        repeated_sequence,
        id='my_genome',
        description='test genome',
    )

    read_pair = generator.simulate_read(genome_record, error_model, 1)
    forward_bases = str(read_pair[0].seq)
    reverse_bases = str(read_pair[1].seq)
    big_read = ''.join((forward_bases, reverse_bases))

    read_tail = big_read[-15:]
    assert read_tail == 'TTTTGGGGGTTTTTG'
sigma=community['stdev_log_abund']) community['sim_abund'].to_csv(abund_path, sep='\t') replicon['sim_copies'] = (replicon['genome_id'].map(community['sim_abund']) * replicon['replicon_type'] .map({'chromosome': 1, 'plasmid': PLASMID_MULTI}) .fillna(1)) replicon['sim_nucleotides'] = replicon['sim_copies'] * replicon['size'] replicon['sim_pread'] = (replicon['sim_nucleotides'] / replicon['sim_nucleotides'].sum()) replicon['sim_nreads'] = np.random.multinomial(nreads, replicon['sim_pread']) print(replicon[['genbank_id', 'size', 'sim_nreads']], file=sys.stderr) print('Simulating metagenome reads.', file=sys.stderr) model_path = os.path.join(iss.__path__[0], 'profiles', ERR_MODEL_NAME) err_model = KDErrorModel(model_path) with open(r1_path, 'w') as handle1, open(r2_path, 'w') as handle2: reads_so_far = 0 for genome_id, g in replicon.groupby('genome_id'): print(f'Simulating reads for genome {genome_id}', file=sys.stderr) genome_path = os.path.join(genome_dir, genome_id + '.fn') seqs = seq_file_index(genome_path, 'fasta') for replicon_id, r in g.iterrows(): print(f'Simulating reads for replicon {replicon_id}', file=sys.stderr) record = seqs[r.genbank_id] for i in tqdm(range(r.sim_nreads), initial=reads_so_far, total=nreads): paired_reads = simulate_read(record, err_model, i) write_seq(paired_reads[0], handle1, 'fastq-sanger') write_seq(paired_reads[1], handle2, 'fastq-sanger') reads_so_far += r.sim_nreads