def test_run(capsys=None):
    """Smoke-test ``wgsim.core`` using the ``lambda.fa`` reference under ``TEST_DIR``.

    The test passes as long as the simulator call returns without raising.
    ``capsys`` is accepted but not used by the body.
    """
    reference, first_mate, second_mate = (
        os.path.join(TEST_DIR, filename)
        for filename in ("lambda.fa", "read1.fq", "read2.fq")
    )
    wgsim.core(
        r1=first_mate,
        r2=second_mate,
        ref=reference,
        N=1000,
        indel_frac=0.5,
        seed=1,
    )
    # Reaching this line means the call above did not raise.
    assert True
def cmd(genome, read1="read1.fq", read2="read2.fq", err=0.02, dist=500, stdev=50, num=1000, L1=70, L2=70,
        mut=0.001, frac=0.15, ext=0.25, seed=0, amb=0.05, fixed=False, version=False):
    """
    Short read simulator for paired end reads based on wgsim.
    """
    # Any falsy seed (0, None, "") collapses to 0; everything else is coerced
    # to int. The fixed flag is likewise handed on as an integer.
    seed = int(seed) if seed else 0
    fixed = int(fixed)

    # err, mut and frac must each lie in [0, 1]. Every offending value is
    # reported through error(), in this order. The test is written as
    # "not (0 <= v <= 1)" so that non-comparable-as-true values such as NaN
    # are reported as well.
    fractions = (("err", err), ("mut", mut), ("frac", frac))
    for label, value in fractions:
        if not (0 <= value <= 1):
            error("%s=%s, it must be in the range [0, 1]" % (label, value))

    # Translate the command-line names into the keyword names wgsim.core uses.
    # ``version`` is accepted by the signature but not consumed here.
    wgsim.core(
        ref=genome,
        r1=read1,
        r2=read2,
        err_rate=err,
        mut_rate=mut,
        indel_frac=frac,
        indel_ext=ext,
        max_n=amb,
        N=num,
        dist=dist,
        stdev=stdev,
        size_l=L1,
        size_r=L2,
        seed=seed,
        is_fixed=fixed,
    )
def _strand_log(level, text):
    '''
    Print text to stdout prefixed with a [dd/mm/YYYY HH:MM:SS][level] stamp
    '''
    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
    print('[' + now + '][' + level + '] ' + text)


def _strand_align_sorted(sam_cmd, bam_path, threads):
    '''
    Run sam_cmd and pipe its standard output into samtools sort, which writes bam_path
    '''
    bam_cmd = ['samtools', 'sort', '-@', str(round(threads / 2)), '-o', bam_path]
    # subprocess.DEVNULL replaces the open(os.devnull, 'wb') handles that were
    # never closed.
    aligner = subprocess.Popen(sam_cmd,
                               stderr=subprocess.DEVNULL,
                               stdout=subprocess.PIPE)
    try:
        # samtools is told to write bam_path itself (-o); the handle passed as
        # stdout is kept from the original code and is closed by the with block.
        with open(bam_path, 'wb') as bout:
            subprocess.run(bam_cmd,
                           stdin=aligner.stdout,
                           stderr=subprocess.DEVNULL,
                           stdout=bout)
    finally:
        # Close our copy of the pipe so the aligner is not left blocked on it,
        # then reap the child so it does not linger as a zombie.
        aligner.stdout.close()
        aligner.wait()


def _strand_collect(bam_path, extractor, ivf):
    '''
    Open bam_path with pysam (index not required) and return list(extractor(handle, ivf))
    '''
    save = pysam.set_verbosity(0)  #silence pysam while opening, then restore
    try:
        bamstrand = pysam.AlignmentFile(bam_path, 'rb', require_index=False)
    finally:
        pysam.set_verbosity(save)
    try:
        return list(extractor(bamstrand, ivf))
    finally:
        bamstrand.close()


def _strand_write_fastq(pairs, path1, path2, own_tag, length):
    '''
    Write (r1, r2) alignment pairs to the two FASTQ files of the own_tag strand ('W' or 'C')
    '''
    qual = '2' * length  #constant quality string, as in the original output
    with open(path1, 'w') as out1, open(path2, 'w') as out2:
        for r1, r2 in pairs:
            is_own = r1.get_tag('OS') == own_tag
            # Pairs tagged with the file's own strand keep that strand's layout;
            # every other pair gets the opposite layout.
            # Watson layout: R1 as stored, R2 reverse-complemented.
            # Crick layout: R1 reverse-complemented, R2 as stored.
            as_watson = is_own if own_tag == 'W' else not is_own
            if as_watson:
                seq1 = r1.query_sequence
                seq2 = mp.revcomp(r2.query_sequence)
            else:
                seq1 = mp.revcomp(r1.query_sequence)
                seq2 = r2.query_sequence
            out1.write('\n'.join(['@' + r1.query_name, seq1, '+', qual]) + '\n')
            out2.write('\n'.join(['@' + r2.query_name, seq2, '+', qual]) + '\n')


def StrandSim(w, c):
    '''
    Perform first part of strand-seq simulations and re-align to the original haplotype

    w is the region to simulate; its chrom, start and end attributes are read.
    c is the run configuration; the attributes read here are ffile, REF,
    haplodir, hapnumber, singlecellnum, r_number, regioncoverage, length,
    error, mutation, indels, extindels, distance, stdev, threads,
    sce_bedregion, cellid, hapid and noise.

    Steps: extract the region from c.ffile, simulate paired reads with
    wgsim.core, map them back to c.ffile, split the alignments into Watson and
    Crick pairs with WR/CR, optionally mix the two sets according to c.noise,
    write each set as FASTQ and map it to c.REF. The outputs left in
    c.haplodir are <r_number>.W.srt.bam and <r_number>.C.srt.bam; every
    intermediate file is removed. Returns None. When w.chrom is missing from
    c.ffile a warning is printed and nothing else is done.
    '''
    hfa = pyfaidx.Fasta(c.ffile)
    if w.chrom not in hfa.keys():
        _strand_log('Warning', 'Chromosome ' + w.chrom + ' not found in ' + c.ffile + '. Skipped simulation')
        return
    _strand_log('Message', 'Preparing simulation from ' + c.ffile + '. Haplotype ' + str(c.hapnumber))
    seq_ = hfa[w.chrom][w.start - 1:w.end].seq
    tmpfa = os.path.abspath(c.haplodir + '/' + 'htmp.fa')
    region = w.chrom + '_' + str(w.start) + '_' + str(w.end)
    with open(tmpfa, 'w') as tmpfout:  #write temporary fa for sampling reads
        tmpfout.write('>' + region + '\n' + '\n'.join(re.findall('.{1,60}', seq_)) + '\n')
    Ns = seq_.count('N')  #normalize coverage on Ns
    Nreads = round(((c.regioncoverage * (len(seq_) - Ns)) / c.length) / 2)  #for paired-end sequencing
    mate1h = os.path.abspath(c.haplodir + '/hr1.tmp.fq')
    mate2h = os.path.abspath(c.haplodir + '/hr2.tmp.fq')
    hapcov = Nreads * c.length * 2 / ((w.end - w.start) - Ns)
    _strand_log('Message', 'Simulated coverage for this region will be ' + str(hapcov))
    _strand_log('Message', 'Simulating')
    wgsim.core(r1=mate1h,
               r2=mate2h,
               ref=tmpfa,
               err_rate=c.error,
               mut_rate=c.mutation,
               indel_frac=c.indels,
               indel_ext=c.extindels,
               N=Nreads,
               dist=c.distance,
               stdev=c.stdev,
               size_l=c.length,
               size_r=c.length,
               max_n=0.05,
               is_hap=0,
               is_fixed=0,
               seed=0)
    os.remove(tmpfa)
    #prefix every read name with the cell and haplotype it was simulated from
    mate1hnew = os.path.abspath(c.haplodir + '/hr1.fq')
    mate2hnew = os.path.abspath(c.haplodir + '/hr2.fq')
    prefix = '@c' + str(c.singlecellnum) + 'h' + str(c.hapnumber) + 'fh_'
    with open(mate1hnew, 'w') as out1, open(mate2hnew, 'w') as out2:
        for (name1, seq1, qual1), (name2, seq2, qual2) in zip(mp.fastx_read(mate1h), mp.fastx_read(mate2h)):
            out1.write('\n'.join([prefix + name1, seq1, '+', qual1]) + '\n')
            out2.write('\n'.join([prefix + name2, seq2, '+', qual2]) + '\n')
    os.remove(mate1h)
    os.remove(mate2h)
    _strand_log('Message', 'Mapping simulated reads to the corresponding haplotype')
    BAM = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.srt.bam')
    sam_cmd = [
        'minimap2', '-ax', 'sr', '--MD', '--cs', '-Y', '--sam-hit-only',
        '-t', str(c.threads), c.ffile, mate1hnew, mate2hnew
    ]
    _strand_align_sorted(sam_cmd, BAM, c.threads)
    os.remove(mate1hnew)
    os.remove(mate2hnew)
    #now re-parse BAM file to keep only Watson/Crick reads
    #Watson reads: read1 forward, read2 reverse
    #Crick reads: read2 forward, read1 reverse
    ivf = None
    if len(c.sce_bedregion) != 0:
        #keep only the SCE records whose 4th/5th fields match this cell and haplotype
        sce_string = ''.join(
            s.chrom + '\t' + str(s.start) + '\t' + str(s.end) + '\n'
            for s in c.sce_bedregion
            if s[3] == c.cellid and s[4] == c.hapid)
        if sce_string != '':
            sce_fromscratch = pybedtools.BedTool(sce_string.rstrip(), from_string=True)
            ivf = sce_fromscratch.as_intervalfile()  #intervals where to perform SCE events
            _strand_log('Message', 'Detected one ore more SCE event for current cell/haplotype')
    _strand_log('Message', 'Extracting Watson (R1F,R2R) and Crick (R1R,R2F) reads')
    #the BAM is opened afresh for each pass because a pass consumes the handle
    Wreads = _strand_collect(BAM, WR, ivf)
    Creads = _strand_collect(BAM, CR, ivf)
    os.remove(BAM)
    if c.noise > 0:
        _strand_log('Message', 'Adding noise to strands')
        # random.sample raises ValueError when asked for more items than the
        # population holds (e.g. one strand list is empty), so each request is
        # capped at the population size. The second draw deliberately still
        # samples from the already augmented Watson list, as before.
        CtoW = random.sample(Creads, min(len(Creads), round(len(Wreads) / 100 * c.noise)))
        Wreads += CtoW
        WtoC = random.sample(Wreads, min(len(Wreads), round(len(Creads) / 100 * c.noise)))
        Creads += WtoC
    _strand_log('Message', 'Writing Watson and Crick FASTQ')
    w1 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.w1.fq')
    w2 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.w2.fq')
    c1 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.c1.fq')
    c2 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.c2.fq')
    _strand_write_fastq(Wreads, w1, w2, 'W', c.length)
    _strand_write_fastq(Creads, c1, c2, 'C', c.length)
    _strand_log('Message', 'Mapping Watson and Crick reads to the original reference')
    for strand, fq1, fq2 in (('W', w1, w2), ('C', c1, c2)):
        BAM = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.' + strand + '.srt.bam')
        sam_cmd = [
            'minimap2', '-ax', 'sr', '--MD', '--cs', '-Y', '--sam-hit-only',
            '-t', str(c.threads), '-R', '@RG\\tID:illumina\\tSM:strand',
            c.REF, fq1, fq2
        ]
        _strand_align_sorted(sam_cmd, BAM, c.threads)
        os.remove(fq1)
        os.remove(fq2)
def _bulk_log(level, text):
    '''
    Print text to stdout prefixed with a [dd/mm/YYYY HH:MM:SS][level] stamp
    '''
    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
    print('[' + now + '][' + level + '] ' + text)


def _bulk_simulate(c, ref, r1, r2, npairs):
    '''
    Call wgsim.core on ref with the settings stored in c, writing mates to r1 and r2
    '''
    wgsim.core(r1=r1,
               r2=r2,
               ref=ref,
               err_rate=c.error,
               mut_rate=c.mutation,
               indel_frac=c.indels,
               indel_ext=c.extindels,
               N=npairs,
               dist=c.distance,
               stdev=c.stdev,
               size_l=c.length,
               size_r=c.length,
               max_n=0.05,
               is_hap=0,
               is_fixed=0,
               seed=0)


def _bulk_relabel(src1, src2, dst1, dst2, mode, prefix):
    '''
    Copy paired FASTQ records from src1/src2 to dst1/dst2 (opened with mode),
    putting prefix in front of every read name, then delete src1 and src2
    '''
    with open(dst1, mode) as out1, open(dst2, mode) as out2:
        for (name1, seq1, qual1), (name2, seq2, qual2) in zip(mp.fastx_read(src1), mp.fastx_read(src2)):
            out1.write('\n'.join([prefix + name1, seq1, '+', qual1]) + '\n')
            out2.write('\n'.join([prefix + name2, seq2, '+', qual2]) + '\n')
    os.remove(src1)
    os.remove(src2)


def _bulk_align_sorted(sam_cmd, bam_path, threads):
    '''
    Run sam_cmd and pipe its standard output into samtools sort, which writes bam_path
    '''
    bam_cmd = ['samtools', 'sort', '-@', str(round(threads / 2)), '-o', bam_path]
    # subprocess.DEVNULL replaces the open(os.devnull, 'wb') handles that were
    # never closed.
    aligner = subprocess.Popen(sam_cmd,
                               stderr=subprocess.DEVNULL,
                               stdout=subprocess.PIPE)
    try:
        # samtools is told to write bam_path itself (-o); the handle passed as
        # stdout is kept from the original code and is closed by the with block.
        with open(bam_path, 'wb') as bout:
            subprocess.run(bam_cmd,
                           stdin=aligner.stdout,
                           stderr=subprocess.DEVNULL,
                           stdout=bout)
    finally:
        # Close our copy of the pipe so the aligner is not left blocked on it,
        # then reap the child so it does not linger as a zombie.
        aligner.stdout.close()
        aligner.wait()


def BulkSim(w, c):
    '''
    Perform bulk simulations and re-align to the un-modified reference

    w is the region to simulate; its chrom, start and end attributes are read,
    and its fifth field w[4] is parsed as a float giving the share (out of
    100) of read pairs drawn from the haplotype. When that share is below 100
    the remaining pairs are simulated from the same region of c.refall.
    c is the run configuration; the attributes read here are ffile, REF,
    refall, haplodir, clonenumber, hapnumber, r_number, regioncoverage,
    length, error, mutation, indels, extindels, distance, stdev and threads.

    The simulated reads are renamed (prefix c<clone>h<hap>fh_ for haplotype
    reads, c<clone>h<hap>fr_ for reference reads), mapped to c.REF and written
    as <r_number>.srt.bam in c.haplodir; every intermediate file is removed.
    Returns None. When w.chrom is missing from c.ffile a warning is printed
    and nothing else is done.
    '''
    hfa = pyfaidx.Fasta(c.ffile)
    if w.chrom not in hfa.keys():
        _bulk_log('Warning', 'Chromosome ' + w.chrom + ' not found in ' + c.ffile + '. Skipped simulation')
        return
    _bulk_log('Message', 'Preparing simulation from ' + c.ffile + '. Clone ' + str(c.clonenumber) + '. Haplotype ' + str(c.hapnumber))
    seq_ = hfa[w.chrom][w.start - 1:w.end].seq
    tmpfa = os.path.abspath(c.haplodir + '/' + 'htmp.fa')
    region = w.chrom + '_' + str(w.start) + '_' + str(w.end)
    with open(tmpfa, 'w') as tmpfout:  #write temporary fa for sampling reads
        tmpfout.write('>' + region + '\n' + '\n'.join(re.findall('.{1,60}', seq_)) + '\n')
    Ns = seq_.count('N')  #normalize coverage on Ns
    Nreads = round(((c.regioncoverage * (len(seq_) - Ns)) / c.length) / 2)  #for paired-end sequencing
    mate1h = os.path.abspath(c.haplodir + '/hr1.tmp.fq')
    mate2h = os.path.abspath(c.haplodir + '/hr2.tmp.fq')
    mate1hnew = os.path.abspath(c.haplodir + '/hr1.fq')
    mate2hnew = os.path.abspath(c.haplodir + '/hr2.fq')
    label = '@c' + str(c.clonenumber) + 'h' + str(c.hapnumber)
    if float(w[4]) < 100.0:
        #simulate part from reference and part from haplotype
        tmpref = os.path.abspath(c.haplodir + '/' + 'rtmp.fa')
        seq__ = c.refall[w.chrom][w.start - 1:w.end].seq
        with open(tmpref, 'w') as tmpfout:  #write temporary fa for sampling reads
            tmpfout.write('>' + region + '\n' + '\n'.join(re.findall('.{1,60}', seq__)) + '\n')
        haploreadsN = round(Nreads / 100 * float(w[4]))
        hapcov = haploreadsN * c.length * 2 / ((w.end - w.start) - Ns)
        _bulk_log('Message', 'Simulated coverage for this region will be ' + str(hapcov))
        refreadsN = Nreads - haploreadsN
        refcov = refreadsN * c.length * 2 / ((w.end - w.start) - Ns)
        _bulk_log('Message', 'Simulated coverage for the corresponding reference region will be ' + str(refcov))
        mate1r = os.path.abspath(c.haplodir + '/rr1.tmp.fq')
        mate2r = os.path.abspath(c.haplodir + '/rr2.tmp.fq')
        _bulk_log('Message', 'Simulating')
        _bulk_simulate(c, tmpfa, mate1h, mate2h, haploreadsN)
        _bulk_simulate(c, tmpref, mate1r, mate2r, refreadsN)
        os.remove(tmpfa)
        os.remove(tmpref)
        #haplotype reads create the merged FASTQ pair, reference reads are appended to it
        _bulk_relabel(mate1h, mate2h, mate1hnew, mate2hnew, 'w', label + 'fh_')
        _bulk_relabel(mate1r, mate2r, mate1hnew, mate2hnew, 'a', label + 'fr_')
    else:
        hapcov = Nreads * c.length * 2 / ((w.end - w.start) - Ns)
        _bulk_log('Message', 'Simulated coverage for this region will be ' + str(hapcov))
        _bulk_log('Message', 'Simulating')
        _bulk_simulate(c, tmpfa, mate1h, mate2h, Nreads)
        os.remove(tmpfa)
        _bulk_relabel(mate1h, mate2h, mate1hnew, mate2hnew, 'w', label + 'fh_')
    _bulk_log('Message', 'Mapping simulated reads to the reference genome')
    BAM = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.srt.bam')
    sam_cmd = [
        'minimap2', '-ax', 'sr', '--MD', '--cs', '-Y', '--sam-hit-only',
        '-t', str(c.threads), '-R', '@RG\\tID:illumina\\tSM:bulk',
        c.REF, mate1hnew, mate2hnew
    ]
    _bulk_align_sorted(sam_cmd, BAM, c.threads)
    os.remove(mate1hnew)
    os.remove(mate2hnew)
def MolSim(processor, molecule, hfa, w, c):
    '''
    Parallelize 10X linked reads simulation

    processor is a string used to build the names of this worker's temporary
    files inside c.OUT.
    molecule is an iterable of records whose seqidx, index_droplet, barcode,
    start, end and length attributes are read.
    hfa is indexed as hfa[chrom][start:end].seq to obtain the molecule sequence.
    w is the region being simulated; its chrom and start attributes are read.
    c is the run configuration; the attributes read here are OUT, hapnumber,
    molcov, length, error, mutation, indels, extindels, distance and stdev.

    For every molecule a temporary FASTA is written, read pairs are simulated
    with wgsim.core, and the reads are appended to
    SIM_S1_L<hapnumber>_R1_001.fastq / ..._R2_001.fastq in c.OUT. Each R1
    sequence is prefixed with the molecule barcode and a random 6-mer.
    Returns None.
    '''
    #the two output files depend only on c, so build their paths once
    lane = str(c.hapnumber).zfill(3)
    R1A = os.path.abspath(c.OUT + '/SIM_S1_L' + lane + '_R1_001.fastq')
    R2A = os.path.abspath(c.OUT + '/SIM_S1_L' + lane + '_R2_001.fastq')
    for mol in molecule:
        moleculenumber = str(mol.seqidx + 1)
        moleculedroplet = str(mol.index_droplet + 1)
        barcodestring = str(mol.barcode)
        chromstart = str(w.start + mol.start)
        chromend = str(w.start + mol.end)
        header = 'MOL:' + moleculenumber + '_GEM:' + moleculedroplet + '_BAR:' + barcodestring + '_CHROM:' + w.chrom + '_START:' + chromstart + '_END:' + chromend
        seq__ = hfa[w.chrom][w.start + mol.start - 1:w.start + mol.end].seq
        truedim = mol.length - seq__.count('N')  #molecule size without Ns
        # Number of read pairs. The original used "/", which is true division
        # in Python 3 and produced a fractional float; floor division keeps N
        # an integer, and a molecule too small for a single pair is now
        # skipped instead of being passed to the simulator.
        N = int(int(truedim * c.molcov) // (c.length * 2))
        if N == 0:
            continue
        molfa = os.path.abspath(c.OUT + '/' + processor + '_' + moleculenumber + '.fa')
        with open(molfa, 'w') as faout:
            faout.write('>' + header + '\n' + '\n'.join(re.findall('.{1,60}', seq__)) + '\n')
        R1tmp = os.path.abspath(c.OUT + '/' + processor + '.R1.tmp.fq')
        R2 = os.path.abspath(c.OUT + '/' + processor + '.R2.fq')
        #R1 is simulated 22 bases shorter: barcode and 6-mer are prepended below
        wgsim.core(r1=R1tmp,
                   r2=R2,
                   ref=molfa,
                   err_rate=c.error,
                   mut_rate=c.mutation,
                   indel_frac=c.indels,
                   indel_ext=c.extindels,
                   N=N,
                   dist=c.distance,
                   stdev=c.stdev,
                   size_l=c.length - 22,
                   size_r=c.length,
                   max_n=0.05,
                   is_hap=0,
                   is_fixed=0,
                   seed=0)
        os.remove(molfa)
        #one random 6-mer per molecule, shared by all of its reads
        RANDOM6MER = ''.join(
            np.random.choice(['A', 'T', 'G', 'C', 'N'], 6, replace=True))
        if os.stat(R1tmp).st_size == 0:
            #nothing was simulated for this molecule: just clean up
            os.remove(R1tmp)
            os.remove(R2)
            continue
        with open(R1tmp, 'r') as infile, open(R1A, 'a') as outfile:
            for name, seq, qual in readfq(infile):
                #pad the quality string with 22 copies of its first symbol to
                #cover the prepended bases
                #NOTE(review): 22 presumably equals len(barcode) + 6 - confirm
                read = [
                    '@' + name, barcodestring + RANDOM6MER + seq, '+',
                    str(qual[0]) * (22) + qual
                ]
                outfile.write('\n'.join(read) + '\n')
        os.remove(R1tmp)
        with open(R2, 'r') as infile, open(R2A, 'a') as outfile:
            for name, seq, qual in readfq(infile):
                read = ['@' + name, seq, '+', qual]
                outfile.write('\n'.join(read) + '\n')
        os.remove(R2)