def match_multi(fq1, fq2, primersets):
    for r1, r2 in zip(mp.fastx_read(fq1, read_comment=True),
                      mp.fastx_read(fq2, read_comment=True)):
        r1 = Read(*r1)
        r2 = Read(*r2)
        matches = {}
        for pset in primersets:
            matches[pset.name] = Matched(r1, pset.match(r1.seq),
                                         r2, pset.match(r2.seq))
        yield r1, r2, matches
def generate_coverage(read1, read2, mapping, ref, pwid=0.95, ncpu=1,
                      chunk_size=500000, quiet=False):
    if not quiet:
        print("Building index and data structures...")
    seq_cov = {}
    for name, seq in pyfastx.Fasta(ref, build_index=False):
        seq_cov[name] = np.zeros(len(seq), dtype=int)

    # estimate the mean read length from the first read file
    nreads = 0
    read_len = 0
    for r in mp.fastx_read(read1):
        nreads += 1
        read_len += len(r[1])
    read_len /= nreads

    min_chain_score = int(0.9 * read_len)
    min_mis_match = int(read_len - pwid * read_len)

    # load or build index
    a = mp.Aligner(ref, preset='sr', n_threads=ncpu, best_n=1000,
                   min_chain_score=min_chain_score)
    if not a:
        raise Exception("ERROR: failed to load/build index")

    def mpile(seqs):
        if seqs is None:
            return []
        thrbuf = mp.ThreadBuffer()
        hits = []
        chrom = None
        for hit in a.map(seqs[1], buf=thrbuf):
            if (hit.NM <= min_mis_match) and ('S' not in hit.cigar_str) and ('H' not in hit.cigar_str):
                if chrom is None:
                    chrom = mapping[hit.ctg]
                    hits.append((hit.ctg, hit.r_st - 1, hit.r_en))
                elif mapping[hit.ctg] == chrom:
                    hits.append((hit.ctg, hit.r_st - 1, hit.r_en))
                else:
                    break
        return hits

    if not quiet:
        print("Aligning reads...")
    pool = ThreadPool(ncpu)
    for reads in tqdm(grouper(chain(mp.fastx_read(read1), mp.fastx_read(read2)),
                              chunk_size),
                      total=int(1 + 2 * nreads / chunk_size),
                      disable=quiet):
        hits = pool.map(mpile, reads)
        for hit in chain.from_iterable(hits):
            if hit is None:
                continue
            seq_cov[hit[0]][hit[1]:hit[2]] += 1

    # close the pool and wait for the work to finish
    pool.close()
    pool.join()
    return seq_cov
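# generate_coverage() above calls a grouper() helper that is not defined in
# this section. A minimal sketch, assuming the classic itertools recipe; the
# None padding of the final, partial chunk is consistent with mpile()
# checking `if seqs is None`:
from itertools import zip_longest

def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks, padding the last one."""
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)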
def main(argv):
    opts, args = getopt.getopt(argv[1:], "x:n:m:k:w:r:c")
    if len(args) < 2:
        print("Usage: minimap2.py [options] <ref.fa>|<ref.mmi> <query.fq>")
        print("Options:")
        print("  -x STR      preset: sr, map-pb, map-ont, asm5, asm10 or splice")
        print("  -n INT      minimum number of minimizers")
        print("  -m INT      minimum chaining score")
        print("  -k INT      k-mer length")
        print("  -w INT      minimizer window length")
        print("  -r INT      band width")
        print("  -c          output the cs tag")
        sys.exit(1)

    preset = min_cnt = min_sc = k = w = bw = None
    out_cs = False
    for opt, arg in opts:
        if opt == '-x':
            preset = arg
        elif opt == '-n':
            min_cnt = int(arg)
        elif opt == '-m':
            min_sc = int(arg)  # was assigned to an unused name, so -m had no effect
        elif opt == '-r':
            bw = int(arg)
        elif opt == '-k':
            k = int(arg)
        elif opt == '-w':
            w = int(arg)
        elif opt == '-c':
            out_cs = True

    a = mp.Aligner(args[0], preset=preset, min_cnt=min_cnt,
                   min_chain_score=min_sc, k=k, w=w, bw=bw)
    if not a:
        raise Exception("ERROR: failed to load/build index file '{}'".format(args[0]))
    for name, seq, qual in mp.fastx_read(args[1]):  # read one sequence at a time
        for h in a.map(seq, cs=out_cs):  # traverse hits
            print('{}\t{}\t{}'.format(name, len(seq), h))
def runMapper(referenceIndex, asm2Filename, minQueryLen):
    print("running minimap2 and finding top hit per query sequence\n")
    scaffoldMapList0 = []
    for name, seq, qual in mp.fastx_read(asm2Filename):
        print("... query: %s" % name)
        if len(seq) < minQueryLen:
            print("...... Skipping, query too short (seq len of %i is less than minimum: %i)\n"
                  % (len(seq), minQueryLen))
            continue
        hits = []
        for hit in referenceIndex.map(seq):
            hits.append(name + "\t" + str(len(seq)) + "\t" + hit.ctg + "\t" + str(hit.mlen))
        topAln = getTopHitByAlignmentLength(hits)
        print("Top hit: %s\n" % topAln['top_aln_id'])
        scaffoldMapList0.append({
            'queryID': name,
            'query_len': len(seq),  # key fixed from the 'qury_len' typo
            'refID': topAln['top_aln_id'],
            'alignLen': topAln['top_aln_blen']
        })
    return scaffoldMapList0
def chunk_process(num_reads, args, blat):
    '''Split the input fasta into chunks and process'''
    # note: in this snippet `mp` is multiprocessing and `mm` is mappy
    if args.blatThreads:
        chunk_size = (num_reads // args.numThreads) + 1
    else:
        chunk_size = args.groupSize
    if chunk_size > num_reads:
        chunk_size = num_reads

    pool = mp.Pool(args.numThreads)
    pbar = tqdm(total=num_reads // chunk_size + 1, desc='Preprocessing')
    iteration, current_num, tmp_reads, target = 1, 0, {}, chunk_size
    for read in mm.fastx_read(args.reads, read_comment=False):
        if len(read[1]) < args.lencutoff:
            continue
        tmp_reads[read[0]] = read[1]
        current_num += 1
        if current_num == target:
            pool.apply_async(process,
                             args=(args, tmp_reads, blat, iteration),
                             callback=lambda _: pbar.update(1))
            iteration += 1
            target = chunk_size * iteration
            if target >= num_reads:
                target = num_reads
            tmp_reads = {}
    pool.close()
    pool.join()
    pbar.close()

    cat_files(args.out_path, 'pre_tmp_*/tmp_splint_aln.psl',
              args.out_path + 'tmp/splint_to_read_alignments.psl')
    remove_files(args.out_path, 'pre_tmp*')
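# chunk_process() relies on cat_files() and remove_files(), which are not
# defined in this section. A plausible sketch under the assumption that both
# take glob-style patterns relative to out_path (names and semantics are
# inferred, not confirmed by the source):
import glob
import os
import shutil

def cat_files(out_path, pattern, destination):
    # Concatenate every file matching out_path/pattern into destination.
    with open(destination, 'w') as dest:
        for path in sorted(glob.glob(os.path.join(out_path, pattern))):
            with open(path) as src:
                shutil.copyfileobj(src, dest)

def remove_files(out_path, pattern):
    # Delete files or directories matching out_path/pattern.
    for path in glob.glob(os.path.join(out_path, pattern)):
        if os.path.isdir(path):
            shutil.rmtree(path)
        else:
            os.remove(path)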
def align_contigs(**kwargs):
    if 'infile_fasta' in kwargs:
        infile = kwargs['infile_fasta']
    if 'out' in kwargs:
        outfile = kwargs['out']
    if 'genome' in kwargs:
        genome = kwargs['genome']
    if 'preset' in kwargs:
        preset = kwargs['preset']
    if 'nthreads' in kwargs:
        nthreads = kwargs['nthreads']

    a = mp.Aligner(str(genome), preset=preset, n_threads=nthreads)
    if not a:
        raise Exception("ERROR: failed to load/build index")
    outfile = open(outfile, 'w')
    outfile.write(
        "read\tchr\tpos\tr_st\tr_en\tq_st\tq_en\tq_len\tprimary\tstrand\tcs\tcigstr\tcigtup\n")
    for name, seq, qual in mp.fastx_read(infile):
        seq_len = len(seq)
        print(name)  # Python 3 print; the original used the Python 2 statement form
        for hit in a.map(seq, cs=True):
            # 13 fields to match the 13-column header (the original format
            # string had only 11 placeholders and silently dropped the CIGAR columns)
            outfile.write(
                "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                    name, hit.ctg, hit.r_st, hit.r_st, hit.r_en, hit.q_st,
                    hit.q_en, seq_len, hit.is_primary, hit.strand, hit.cs,
                    hit.cigar_str, hit.cigar))
    outfile.close()
def read_subreads(seq_file, chrom_reads):
    for read in mm.fastx_read(seq_file, read_comment=False):
        root_name = read[0].split('_')[0]
        if root_name in chrom_reads:
            # root_name : [(header, seq, qual), ...]
            chrom_reads[root_name].append(read)  # read = (header, seq, qual)
    return chrom_reads
def load_reference(fp):
    '''
    Only supports a single-contig reference for now.
    out: ('EU117116.1', 'AAAATATAAAAACT...')
    '''
    rname, rseq, _ = next(mp.fastx_read(fp))
    return rname, rseq
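# Usage sketch for load_reference(); the file name is hypothetical. For FASTA
# input, fastx_read() yields (name, seq, qual) with qual set to None, which is
# why the third field is discarded.
rname, rseq = load_reference('reference.fa')
print(rname, len(rseq))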
def readFastq(seqFile):
    readDict = {}
    for name, seq, qual in mappy.fastx_read(seqFile):
        root_name = name.split('_')[0]
        if root_name not in readDict:
            readDict[root_name] = []
        readDict[root_name].append((name, seq, qual))
    return readDict
def getFlankCrit(flankFa):
    flanks = [seq for _, seq, _ in mp.fastx_read(flankFa)]

    def includesFlanks(rec):
        # index the read itself and require exactly one hit per flank sequence
        a = mp.Aligner(seq=rec.query_sequence, preset='sr')
        return np.all([len(list(a.map(f))) == 1 for f in flanks])

    return includesFlanks
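# getFlankCrit() returns a predicate over any record exposing
# .query_sequence (e.g. a pysam.AlignedSegment). A usage sketch with
# hypothetical file names, assuming pysam is available:
import pysam

isFlanked = getFlankCrit('flanks.fa')
with pysam.AlignmentFile('reads.bam', 'rb') as bam:
    kept = [rec for rec in bam if isFlanked(rec)]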
def match_single(fq, primersets):
    for r in mp.fastx_read(fq, read_comment=True):
        r = Read(*r)
        rc = mp.revcomp(r.seq)
        matches = {}
        for pset in primersets:
            matches[pset.name] = Matched(r, pset.match(r.seq), r, pset.match(rc))
        yield r, matches
def read_fasta(inFile, indexes):
    '''Reads in FASTA files, returns a dict of header:sequence'''
    readDict, index_dict = {}, {}
    for read in mm.fastx_read(inFile, read_comment=False):
        readDict[read[0]] = read[1]
        if indexes:
            index_dict[read[1]] = read[0]
    if indexes:
        return readDict, index_dict
    return readDict
def main(sequence_fasta, output):
    """For each sequence, deconcatenate and write to output."""
    corrected = []
    for name, seq, _ in mp.fastx_read(sequence_fasta):
        corrected.append([name, deconcatenate(seq)])
    handler = get_output_handler(output)
    for n, s in corrected:
        handler.write(f">{n}\n{s}\n")
    handler.close()
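# get_output_handler() is not defined in this section; a minimal sketch of a
# plausible implementation, assuming '-' (or None) selects stdout. The name
# and convention are inferred, not confirmed by the source. Note that main()
# closes the handler, which is harmless for stdout at the end of a run.
import sys

def get_output_handler(output):
    if output in (None, '-'):
        return sys.stdout
    return open(output, 'w')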
def remove_by_alignment(fq, ref, out, mapq, preset, human_out, threads, logger):
    fout = smart_open(filename=out, mode="w")
    if human_out:
        hout = smart_open(filename=human_out, mode="w")
    else:
        hout = None

    logger.info(f"Starting to map reads against: {ref}")
    logger.info(f"Initiating aligner: {ref}")
    aligner = mp.Aligner(str(ref), preset=preset, n_threads=threads)

    logger.info(f"Opening file handle: {fq}")
    if fq:
        reads = mp.fastx_read(str(fq))
    else:
        reads = None  # PE

    ref_maps = 0
    total_reads = 0
    logger.info(f"Filtering mapped reads [Q >= {mapq}]")
    human = []
    not_human = []
    for name, seq, qual in reads:
        mapped = aligner.map(seq)
        for aln in mapped:
            if aln.mapq >= mapq:
                ref_maps += 1
                if name not in human:
                    human.append(name)
                    if hout is not None:
                        hout.write(f"@{name}\n{seq}\n+\n{qual}\n")
                continue
        # reads never added to `human` had no alignment above the threshold
        if name not in human:
            fout.write(f"@{name}\n{seq}\n+\n{qual}\n")
            if name not in not_human:
                not_human.append(name)
        total_reads += 1

    fout.close()
    if hout is not None:
        hout.close()
    logger.info(f"Computed {ref_maps} mappings against reference: {ref}")
    logger.info(f"Recovered {len(not_human)} / {total_reads} reads from {fq}")
def create_index(reference_file):
    aligner = mp.Aligner(reference_file, best_n=1)
    # reference_names and reference_lengths are module-level globals
    for name, seq, qual in mp.fastx_read(reference_file, read_comment=False):
        reference_names.append(name)
        reference_lengths[name] = len(seq)
    if not aligner:
        raise Exception("ERROR: failed to load/build index file '{}'".format(
            reference_file))
    return aligner
def hdf_to_sam(args):
    """Entry point for converting guppy methylcalled fast5s to sam."""
    sys.stdout.write('\t'.join(('@HD', 'VN:1.5', 'SO:unsorted')))
    sys.stdout.write('\n')
    for name, seq, _ in mappy.fastx_read(args.reference, read_comment=False):
        sys.stdout.write('@SQ\tSN:{}\tLN:{}\n'.format(name, len(seq)))
    fast5s = get_fast5_file_list(args.path, recursive=args.recursive)
    worker = functools.partial(hdf_to_sam_worker, args.reference)
    with ProcessPoolExecutor(max_workers=args.workers) as executor:
        for res in executor.map(worker, fast5s):
            for r in res:
                sys.stdout.write('{}\n'.format(r))
def write_multiple_hits_to_logPAF(a, S288c_transcripts, outdir, outname, multiple_hits):
    '''
    Write transcripts that match multiple times on the same chromosome to a
    log file in PAF format. Use `paftools view outdir/outname.paf` to
    visualize those alignments.
    '''
    with open(join(outdir, outname + '_multiple_hits.log.paf'), 'w') as multiple_fh:
        for name, seq, _, comment in mp.fastx_read(S288c_transcripts, read_comment=True):
            if name not in multiple_hits:
                continue
            chromosome, chr_start, chr_end, strand = location_from_comment(comment)
            for h in a.map(seq, cs=True):  # traverse hits
                _, hit_chromosome = chromosome_from_hit(h.ctg)
                if hit_chromosome != chromosome:
                    continue
                outstring = name + '\t' + str(len(seq)) + '\t' + str(h) + '\n'
                multiple_fh.write(outstring)
def run():
    # load or build index
    a = mp.Aligner(
        "/home/sonhoanghguyen/Projects/readuntil/simulation/npgraph/test/assembly_graph.fasta")
    if not a:
        raise Exception("ERROR: failed to load/build index")
    with grpc.insecure_channel('localhost:2105') as channel:
        stub = npgraph_service_pb2_grpc.AssemblyGuideStub(channel)
        print("Connected with server at localhost:2105")
        for name, seq, qual in mp.fastx_read(
                "/home/sonhoanghguyen/Projects/readuntil/simulation/npgraph/test/E_coli_K-12_MG1655_good_long.fastq.gz"):
            # 1. make request
            request = npgraph_service_pb2.RequestAssembly()
            request.read_id = name
            for hit in a.map(seq):  # traverse alignments
                # print("{}\t{}\t{}\t{}".format(hit.ctg, hit.r_st, hit.r_en, hit.cigar_str))
                request.hits_list.append(
                    npgraph_service_pb2.AlignmentMsg(
                        query_name=name,
                        query_length=len(seq),
                        query_start=hit.q_st,
                        query_end=hit.q_en,
                        strand=hit.strand > 0,
                        target_name=hit.ctg,
                        target_length=hit.ctg_len,
                        target_start=hit.r_st,
                        target_end=hit.r_en,
                        quality=hit.mapq,
                        score=hit.mlen))
            # 2. get and print response
            if len(request.hits_list) > 0:
                try:
                    start_time = time.time()
                    response = stub.GetAssemblyContribution(request)
                    print("{}: {} in {:.5f} seconds".format(
                        response.read_id, response.usefulness,
                        time.time() - start_time))
                except grpc.RpcError as e:
                    print("{}: errorcode={}".format(request.read_id, str(e.code())))
                    continue
            else:
                print("{}: unmapped!".format(request.read_id))
                continue
def mapping(self, query_path, ref_path):
    if os.path.isdir(query_path):
        # os.listdir returns bare names; join them back onto the directory,
        # otherwise fastx_read would fail to find the files
        file_list = [os.path.join(query_path, f) for f in os.listdir(query_path)]
    else:
        file_list = [query_path]
    mapper = mp.Aligner(ref_path, preset="map-ont")  # build the index once, not per file
    for file_name in file_list:
        for name, seq, qual in mp.fastx_read(file_name):
            for hit in mapper.map(seq):
                self.names.append(name)
                self.cigar.append(hit.cigar_str)
                self.r_st.append(hit.r_st)
                self.r_end.append(hit.r_en)
                self.q_st.append(hit.q_st)
                self.q_end.append(hit.q_en)
                self.section.append(hit.ctg)
                self.strand.append(hit.strand)
def main(parser):
    args = parser.parse_args()
    if args.motifs:
        print('Over-riding preset motifs with %s' % args.motifs)
        motifs = args.motifs.split(',')
        label = args.label
    else:
        motifs = repeatPatterns[args.preset]
        label = args.label if args.label else args.preset

    aligner = mp.Aligner(args.target)

    # function to generate output names
    s = args.sample + '.' if args.sample else ''
    l = label + '.' if label else ''
    outfileName = lambda name, ext: '{d}/{s}{l}{n}.{e}'.format(
        d=args.outDir, s=s, l=l, n=name, e=ext)
    # function to write summary
    writeSummary = lambda: summary.to_csv(outfileName('summary', 'csv'))

    print('Mapping and extracting repeat regions')
    repeatRegions = pd.DataFrame({
        'read': rec[0],
        'subsequence': extractRepeat(rec[1], aligner)
    } for rec in mp.fastx_read(args.ccsFastx))
    repeatRegions = repeatRegions.assign(size=repeatRegions.subsequence.map(len))\
                                 .sort_values('size', ascending=False)\
                                 .drop(columns='size')\
                                 .reset_index(drop=True)

    # filter and summarize
    summary, filtered = countAlignments(repeatRegions, reference=args.target)

    print('Counting repeats')
    try:
        motifDfs = [pd.concat(filtered.set_index('read', append=True)
                                      .subsequence.map(getPositions(motif)).to_dict())
                      .reset_index(level=2, drop=True)
                      .reset_index()
                      .rename(columns={'level_0': 'idx', 'level_1': 'readName'})
                    for motif in motifs]
    except ValueError:  # Python 3 except syntax; the Python 2 form did not bind anything useful
        writeSummary()
        raise fastRepeatAnalysisReport_Exception('No reads map to target!')
def mappyAlign(infile, outfile):
    import mappy as mp
    a = mp.Aligner("/mnt/ix1/Resources/10X_resources/refdata-b37-2.1.0/fasta/genome.fa")
    if not a:
        raise Exception("ERROR: failed to load/build index")
    outfile = open(outfile, 'w')
    outfile.write("read\tchr\tpos\tr_st\tr_en\tq_st\tq_en\tcigstr\tcigtup\n")
    for name, seq, qual in mp.fastx_read(infile):  # read a fasta/q sequence
        for hit in a.map(seq):  # traverse alignments  ##CORE DUMPED### on aji, but fine on tamago
            if (hit.ctg).isdigit():
                outfile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                    name, hit.ctg, hit.r_st, hit.r_st, hit.r_en,
                    hit.q_st, hit.q_en, hit.cigar_str, hit.cigar))
    outfile.close()
def hdf_to_sam(args):
    """Entry point for converting guppy methylcalled fast5s to sam."""
    logger = medaka.common.get_named_logger('ModExtract')
    logger.info(
        "NOTE: Mod. base scores are output w.r.t the sequencing direction, "
        "not the aligned read orientation.")
    extractor = Extractor(args.path, recursive=args.recursive,
                          workers=args.io_workers)
    sys.stdout.write('\t'.join(('@HD', 'VN:1.5', 'SO:unsorted')))
    sys.stdout.write('\n')
    sys.stdout.write('\t'.join(
        ('@CO', 'Guppy basecaller mod. base tags are stored w.r.t. the '
         'sequencing direction, they should be reversed for reads '
         'aligning to the reverse strand.\n')))
    if args.reference is None:
        # write unaligned sam
        for read, tags in extractor:
            sam = unaligned_read(read, tags)
            sys.stdout.write('{}\n'.format(sam))
    else:
        for name, seq, _ in mappy.fastx_read(args.reference, read_comment=False):
            sys.stdout.write('@SQ\tSN:{}\tLN:{}\n'.format(name, len(seq)))
        aligner = Aligner(args.reference, preset='map-ont',
                          n_threads=args.workers)

        def _write(future):
            try:
                sam = future.result()
                if sam is not None:
                    sys.stdout.write('{}\n'.format(sam))
            except Exception:
                pass
            # https://bugs.python.org/issue27144
            future._result = None

        with ThreadPoolExecutor(max_items=args.workers,
                                max_workers=args.workers) as executor:
            for read, tags in extractor:
                future = executor.submit(aligner.map, read, tags)
                future.add_done_callback(_write)
def read_quality(read, ref_fasta,
                 result_dict={'nb_mappings': [], 'matches': 0, 'mismatches': 0,
                              'deletions': 0, 'insertions': 0,
                              'mapping_quality': []}):
    """
    Accumulates alignment quality statistics for FASTA/Q reads.

    Args:
        read        -- str, path to reads in FASTA or FASTQ format
        ref_fasta   -- str, path to FASTA file containing the reference
        result_dict -- dict, accumulator updated in place (note: a mutable
                       default argument, so repeated calls without an
                       explicit dict share state)

    Returns:
        result_dict
    """
    aligner = mp.Aligner(ref_fasta)  # constructor that indexes the reference
    for name, seq, qual in mp.fastx_read(read):  # yields name, seq, qual
        nb_hits = 0
        for hit in aligner.map(seq):  # Alignment objects describing each hit
            if hit.is_primary:  # usually the best and first
                # CIGAR op 0 spans both matches and mismatches
                matches_mismatches = sum(c[0] for c in hit.cigar if c[1] == 0)
                result_dict['matches'] += hit.mlen
                result_dict['mismatches'] += matches_mismatches - hit.mlen
                result_dict['insertions'] += sum(c[0] for c in hit.cigar if c[1] == 1)
                result_dict['deletions'] += sum(c[0] for c in hit.cigar if c[1] == 2)
                result_dict['mapping_quality'].append(hit.mapq)
            nb_hits += 1
        result_dict['nb_mappings'].append(nb_hits)
    return result_dict
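# A sketch of turning the accumulated counts into summary figures (file names
# hypothetical). Passing a fresh result_dict sidesteps the shared
# mutable-default pitfall noted in the docstring; identity here is matches
# over all aligned columns.
stats = read_quality('reads.fastq', 'reference.fa', result_dict={
    'nb_mappings': [], 'matches': 0, 'mismatches': 0,
    'deletions': 0, 'insertions': 0, 'mapping_quality': []})
aligned = (stats['matches'] + stats['mismatches']
           + stats['insertions'] + stats['deletions'])
if aligned:
    print('identity : {:.4f}'.format(stats['matches'] / aligned))
if stats['mapping_quality']:
    print('mean mapq: {:.1f}'.format(
        sum(stats['mapping_quality']) / len(stats['mapping_quality'])))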
def run_polyte(reffile, r1name, fname, output_type, cut_site, min_len):
    '''Align filtered fastq to genome'''
    reference = mp.Aligner(reffile, preset="sr")  # load or build index
    print("Load in reference...")
    if not reference:
        raise Exception("ERROR: failed to load/build index")
    print("Done")
    output_sam = SAMBAMWriter(fname, reference, output_type)
    print("Running alignment...")
    reads1 = mp.fastx_read(r1name)
    while True:
        try:
            read1 = process_reads2.Read(next(reads1))
            read1.split_read(cut_site, min_len)
            read1.qual_trim(10, 10)
            if read1.seq:
                res = map_te_reads(read1, reference)
                if res:
                    output_sam.process_te_output(res, read1)
        except StopIteration:
            break
def read_fastq_file(seq_file, check):
    '''
    Takes a FASTQ file and returns a list of tuples.
    In each tuple:
        name          : str, read ID
        seq           : str, sequence
        qual          : str, quality line
        average_quals : float, average quality of that line
        seq_length    : int, length of the sequence
    Has a check mode: if it sees one read, it returns True.
    '''
    read_list = []
    for read in mm.fastx_read(seq_file, read_comment=False):
        split_name = read[0].split('_')
        name, seed = split_name[0], 0
        seq, qual = read[1], read[2]
        if check:
            return True
        avg_q = np.average([ord(x) - 33 for x in qual])
        s_len = len(seq)
        read_list.append((read[0], seq, qual, avg_q, s_len))
    return read_list
def main(argv):
    """
    Main PAtChER function
    """
    distance = 10
    nthreads = 1
    cut_site = "GATC"
    min_len = 20
    output_type = "SAM"
    debug = False
    myargs = getopts(argv)
    if '-g' in myargs:
        reffile = myargs["-g"]
    else:
        print_help()
    if '-o' in myargs:
        fname = myargs["-o"]
    else:
        print_help()
    if '-r1' in myargs:
        r1name = myargs["-r1"]
        reads1 = mp.fastx_read(r1name)
    else:
        print_help()
    if '-r2' in myargs:
        r2name = myargs["-r2"]
        reads2 = mp.fastx_read(r2name)
    else:
        print_help()
    if '-d' in myargs:
        distance = int(myargs["-d"])
    if '-D' in myargs:
        debug = True
    if '-t' in myargs:
        nthreads = int(myargs["-t"])
    if '-c' in myargs:
        cut_site = myargs["-c"]
    if '-l' in myargs:
        min_len = int(myargs["-l"])
    if '-b' in myargs:
        output_type = "BAM"
    print(f"Using reference:{reffile}")
    print(f"Using read 1:{r1name}")
    print(f"Using read 2:{r2name}")
    print(f"Using distance +/-:{distance}")
    print(f"Using threads:{nthreads}")
    print(f"Using cutsite:{cut_site}")
    print(f"Writing to:{fname}")
    if nthreads > 3:
        multiproc2.run(reffile, reads1, reads2, fname, distance, nthreads,
                       cut_site, min_len, output_type, debug)
    else:
        if nthreads > 1:
            print("Cannot run multithreading with less than 3 threads; defaulting to single")
        runsingle(reffile, reads1, reads2, fname, distance, cut_site,
                  min_len, output_type)
    reads1.close()
    reads2.close()
    print("Run complete")
def _method_mappy(self, *args, **kwargs):
    # Convert FASTA/Q input to FASTA output using mappy's parser.
    with open(self.outfile, "w") as fasta:
        for name, seq, _ in fastx_read(self.infile):
            fasta.write(">{}\n{}\n".format(name, seq))
writer = csv.writer(outfile, delimiter='\t', lineterminator=os.linesep)
for line in open(sys.argv[2]):  # bed
    line = line.rstrip().split('\t')
    if isbed:
        name = line[3][:line[3].rfind(';')]
    else:
        name = line[9][:line[9].rfind(';')]
    if name not in assigned_names:
        writer.writerow(line)
        headers_keep.add(name)

import mappy as mm

headers_used = set()
for fle in sys.argv[5:]:
    for read in mm.fastx_read(fle):
        header, seq, qual = read
        if header in headers_keep:
            print('>' + header)
            print(seq)
            headers_used.add(header)

diff = len(headers_keep - headers_used)
if diff > 0:
    sys.stderr.write(
        '{} names do not match any names in fastq file(s)\n'.format(diff))
    sys.stderr.write('e.g. {} in bed but not in fastq\n'.format(
        list(headers_keep - headers_used)[0]))
    sys.exit(1)
def StrandSim(w, c):
    '''
    Perform the first part of the strand-seq simulation and re-align to the
    original haplotype.
    '''
    hfa = pyfaidx.Fasta(c.ffile)
    if w.chrom not in hfa.keys():
        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Warning] Chromosome ' + w.chrom + ' not found in ' + c.ffile + '. Skipped simulation')
    else:
        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Preparing simulation from ' + c.ffile + '. Haplotype ' + str(c.hapnumber))
        chr_ = hfa[w.chrom]
        seq_ = chr_[w.start - 1:w.end].seq
        tmpfa = os.path.abspath(c.haplodir + '/' + 'htmp.fa')
        region = w.chrom + '_' + str(w.start) + '_' + str(w.end)
        with open(tmpfa, 'w') as tmpfout:  # write temporary fa for sampling reads
            tmpfout.write('>' + region + '\n' + '\n'.join(re.findall('.{1,60}', seq_)) + '\n')
        Ns = seq_.count('N')  # normalize coverage on Ns
        Nreads = round(((c.regioncoverage * (len(seq_) - Ns)) / c.length) / 2)  # for paired-end sequencing
        mate1h = os.path.abspath(c.haplodir + '/hr1.tmp.fq')
        mate2h = os.path.abspath(c.haplodir + '/hr2.tmp.fq')
        hapcov = Nreads * c.length * 2 / ((w.end - w.start) - Ns)
        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Simulated coverage for this region will be ' + str(hapcov))
        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Simulating')
        wgsim.core(r1=mate1h, r2=mate2h, ref=tmpfa, err_rate=c.error,
                   mut_rate=c.mutation, indel_frac=c.indels,
                   indel_ext=c.extindels, N=Nreads, dist=c.distance,
                   stdev=c.stdev, size_l=c.length, size_r=c.length,
                   max_n=0.05, is_hap=0, is_fixed=0, seed=0)
        os.remove(tmpfa)
        mate1hnew = os.path.abspath(c.haplodir + '/hr1.fq')
        mate2hnew = os.path.abspath(c.haplodir + '/hr2.fq')
        with open(mate1hnew, 'w') as out1, open(mate2hnew, 'w') as out2:
            for (name1, seq1, qual1), (name2, seq2, qual2) in zip(mp.fastx_read(mate1h), mp.fastx_read(mate2h)):
                # change name1/name2
                newname1 = '@c' + str(c.singlecellnum) + 'h' + str(c.hapnumber) + 'fh_' + name1
                newname2 = '@c' + str(c.singlecellnum) + 'h' + str(c.hapnumber) + 'fh_' + name2
                read1 = [newname1, seq1, '+', qual1]
                read2 = [newname2, seq2, '+', qual2]
                out1.write('\n'.join(read1) + '\n')
                out2.write('\n'.join(read2) + '\n')
        os.remove(mate1h)
        os.remove(mate2h)
        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Mapping simulated reads to the corresponding haplotype')
        BAM = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.srt.bam')
        sam_cmd = ['minimap2', '-ax', 'sr', '--MD', '--cs', '-Y', '--sam-hit-only',
                   '-t', str(c.threads), c.ffile, mate1hnew, mate2hnew]
        bam_cmd = ['samtools', 'sort', '-@', str(round(c.threads / 2)), '-o', BAM]
        p1 = subprocess.Popen(sam_cmd, stderr=open(os.devnull, 'wb'), stdout=subprocess.PIPE)
        bout = open(BAM, 'wb')
        p2 = subprocess.run(bam_cmd, stdin=p1.stdout, stderr=open(os.devnull, 'wb'), stdout=bout)
        bout.close()
        os.remove(mate1hnew)
        os.remove(mate2hnew)

        # now re-parse the BAM file to keep only Watson/Crick reads
        # Watson reads: read1 forward, read2 reverse
        # Crick reads: read2 forward, read1 reverse
        ivf = None
        if len(c.sce_bedregion) != 0:
            sce_string = ''
            for s in c.sce_bedregion:
                if s[3] == c.cellid and s[4] == c.hapid:
                    sce_string += s.chrom + '\t' + str(s.start) + '\t' + str(s.end) + '\n'
            if sce_string != '':
                sce_fromscratch = pybedtools.BedTool(sce_string.rstrip(), from_string=True)
                ivf = sce_fromscratch.as_intervalfile()  # intervals where to perform SCE events
                now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                print('[' + now + '][Message] Detected one or more SCE events for current cell/haplotype')
        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Extracting Watson (R1F,R2R) and Crick (R1R,R2F) reads')
        save = pysam.set_verbosity(0)
        bamstrand = pysam.AlignmentFile(BAM, 'rb', require_index=False)  # until-eof consumes the bamfile
        pysam.set_verbosity(save)
        Wreads = list(WR(bamstrand, ivf))
        bamstrand.close()
        save = pysam.set_verbosity(0)
        bamstrand = pysam.AlignmentFile(BAM, 'rb', require_index=False)  # re-open for second round
        pysam.set_verbosity(save)
        Creads = list(CR(bamstrand, ivf))
        bamstrand.close()
        os.remove(BAM)
        if c.noise > 0:
            now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
            print('[' + now + '][Message] Adding noise to strands')
            CtoW = random.sample(Creads, round(len(Wreads) / 100 * c.noise))
            Wreads += CtoW
            WtoC = random.sample(Wreads, round(len(Creads) / 100 * c.noise))
            Creads += WtoC
        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Writing Watson and Crick FASTQ')
        w1 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.w1.fq')
        w2 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.w2.fq')
        c1 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.c1.fq')
        c2 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.c2.fq')
        with open(w1, 'w') as wout1, open(w2, 'w') as wout2:
            for r1, r2 in Wreads:
                if r1.get_tag('OS') == 'W':  # this is a true Watson pair
                    read1 = ['@' + r1.query_name, r1.query_sequence, '+', '2' * c.length]
                    read2 = ['@' + r2.query_name, mp.revcomp(r2.query_sequence), '+', '2' * c.length]
                else:  # write to Watson, but it is Crick
                    read1 = ['@' + r1.query_name, mp.revcomp(r1.query_sequence), '+', '2' * c.length]
                    read2 = ['@' + r2.query_name, r2.query_sequence, '+', '2' * c.length]
                wout1.write('\n'.join(read1) + '\n')
                wout2.write('\n'.join(read2) + '\n')
        with open(c1, 'w') as cout1, open(c2, 'w') as cout2:
            for r1, r2 in Creads:
                if r1.get_tag('OS') == 'C':  # this is a true Crick pair
                    read1 = ['@' + r1.query_name, mp.revcomp(r1.query_sequence), '+', '2' * c.length]
                    read2 = ['@' + r2.query_name, r2.query_sequence, '+', '2' * c.length]
                else:  # write to Crick, but it is Watson
                    read1 = ['@' + r1.query_name, r1.query_sequence, '+', '2' * c.length]
                    read2 = ['@' + r2.query_name, mp.revcomp(r2.query_sequence), '+', '2' * c.length]
                cout1.write('\n'.join(read1) + '\n')
                cout2.write('\n'.join(read2) + '\n')
        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Mapping Watson and Crick reads to the original reference')
        BAM = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.W.srt.bam')
        sam_cmd = ['minimap2', '-ax', 'sr', '--MD', '--cs', '-Y', '--sam-hit-only',
                   '-t', str(c.threads), '-R', '@RG\\tID:illumina\\tSM:strand',
                   c.REF, w1, w2]
        bam_cmd = ['samtools', 'sort', '-@', str(round(c.threads / 2)), '-o', BAM]
        p1 = subprocess.Popen(sam_cmd, stderr=open(os.devnull, 'wb'), stdout=subprocess.PIPE)
        bout = open(BAM, 'wb')
        p2 = subprocess.run(bam_cmd, stdin=p1.stdout, stderr=open(os.devnull, 'wb'), stdout=bout)
        bout.close()
        os.remove(w1)
        os.remove(w2)
        BAM = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.C.srt.bam')
        sam_cmd = ['minimap2', '-ax', 'sr', '--MD', '--cs', '-Y', '--sam-hit-only',
                   '-t', str(c.threads), '-R', '@RG\\tID:illumina\\tSM:strand',
                   c.REF, c1, c2]
        bam_cmd = ['samtools', 'sort', '-@', str(round(c.threads / 2)), '-o', BAM]
        p1 = subprocess.Popen(sam_cmd, stderr=open(os.devnull, 'wb'), stdout=subprocess.PIPE)
        bout = open(BAM, 'wb')
        p2 = subprocess.run(bam_cmd, stdin=p1.stdout, stderr=open(os.devnull, 'wb'), stdout=bout)
        bout.close()
        os.remove(c1)
        os.remove(c2)
def BulkSim(w, c):
    '''
    Perform bulk simulations and re-align to the un-modified reference.
    '''
    hfa = pyfaidx.Fasta(c.ffile)
    if w.chrom not in hfa.keys():
        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Warning] Chromosome ' + w.chrom + ' not found in ' + c.ffile + '. Skipped simulation')
    else:
        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Preparing simulation from ' + c.ffile +
              '. Clone ' + str(c.clonenumber) + '. Haplotype ' + str(c.hapnumber))
        chr_ = hfa[w.chrom]
        seq_ = chr_[w.start - 1:w.end].seq
        tmpfa = os.path.abspath(c.haplodir + '/' + 'htmp.fa')
        region = w.chrom + '_' + str(w.start) + '_' + str(w.end)
        with open(tmpfa, 'w') as tmpfout:  # write temporary fa for sampling reads
            tmpfout.write('>' + region + '\n' + '\n'.join(re.findall('.{1,60}', seq_)) + '\n')
        Ns = seq_.count('N')  # normalize coverage on Ns
        Nreads = round(((c.regioncoverage * (len(seq_) - Ns)) / c.length) / 2)  # for paired-end sequencing
        mate1h = os.path.abspath(c.haplodir + '/hr1.tmp.fq')
        mate2h = os.path.abspath(c.haplodir + '/hr2.tmp.fq')
        if float(w[4]) < 100.0:
            tmpref = os.path.abspath(c.haplodir + '/' + 'rtmp.fa')
            seq__ = c.refall[w.chrom][w.start - 1:w.end].seq
            with open(tmpref, 'w') as tmpfout:  # write temporary fa for sampling reads
                tmpfout.write('>' + region + '\n' + '\n'.join(re.findall('.{1,60}', seq__)) + '\n')
            # simulate part from reference and part from haplotype
            haploreadsN = round(Nreads / 100 * float(w[4]))
            hapcov = haploreadsN * c.length * 2 / ((w.end - w.start) - Ns)
            now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
            print('[' + now + '][Message] Simulated coverage for this region will be ' + str(hapcov))
            refreadsN = Nreads - haploreadsN
            refcov = refreadsN * c.length * 2 / ((w.end - w.start) - Ns)
            print('[' + now + '][Message] Simulated coverage for the corresponding reference region will be ' + str(refcov))
            mate1r = os.path.abspath(c.haplodir + '/rr1.tmp.fq')
            mate2r = os.path.abspath(c.haplodir + '/rr2.tmp.fq')
            now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
            print('[' + now + '][Message] Simulating')
            wgsim.core(r1=mate1h, r2=mate2h, ref=tmpfa, err_rate=c.error,
                       mut_rate=c.mutation, indel_frac=c.indels,
                       indel_ext=c.extindels, N=haploreadsN, dist=c.distance,
                       stdev=c.stdev, size_l=c.length, size_r=c.length,
                       max_n=0.05, is_hap=0, is_fixed=0, seed=0)
            wgsim.core(r1=mate1r, r2=mate2r, ref=tmpref, err_rate=c.error,
                       mut_rate=c.mutation, indel_frac=c.indels,
                       indel_ext=c.extindels, N=refreadsN, dist=c.distance,
                       stdev=c.stdev, size_l=c.length, size_r=c.length,
                       max_n=0.05, is_hap=0, is_fixed=0, seed=0)
            os.remove(tmpfa)
            os.remove(tmpref)
            mate1hnew = os.path.abspath(c.haplodir + '/hr1.fq')
            mate2hnew = os.path.abspath(c.haplodir + '/hr2.fq')
            with open(mate1hnew, 'w') as out1, open(mate2hnew, 'w') as out2:
                for (name1, seq1, qual1), (name2, seq2, qual2) in zip(mp.fastx_read(mate1h), mp.fastx_read(mate2h)):
                    # change name1/name2
                    newname1 = '@c' + str(c.clonenumber) + 'h' + str(c.hapnumber) + 'fh_' + name1
                    newname2 = '@c' + str(c.clonenumber) + 'h' + str(c.hapnumber) + 'fh_' + name2
                    read1 = [newname1, seq1, '+', qual1]
                    read2 = [newname2, seq2, '+', qual2]
                    out1.write('\n'.join(read1) + '\n')
                    out2.write('\n'.join(read2) + '\n')
            os.remove(mate1h)
            os.remove(mate2h)
            with open(mate1hnew, 'a') as out1, open(mate2hnew, 'a') as out2:
                for (name1, seq1, qual1), (name2, seq2, qual2) in zip(mp.fastx_read(mate1r), mp.fastx_read(mate2r)):
                    # change name1/name2
                    newname1 = '@c' + str(c.clonenumber) + 'h' + str(c.hapnumber) + 'fr_' + name1
                    newname2 = '@c' + str(c.clonenumber) + 'h' + str(c.hapnumber) + 'fr_' + name2
                    read1 = [newname1, seq1, '+', qual1]
                    read2 = [newname2, seq2, '+', qual2]
                    out1.write('\n'.join(read1) + '\n')
                    out2.write('\n'.join(read2) + '\n')
            os.remove(mate1r)
            os.remove(mate2r)
        # split in chunks for multiprocessing
        else:
            hapcov = Nreads * c.length * 2 / ((w.end - w.start) - Ns)
            now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
            print('[' + now + '][Message] Simulated coverage for this region will be ' + str(hapcov))
            now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
            print('[' + now + '][Message] Simulating')
            wgsim.core(r1=mate1h, r2=mate2h, ref=tmpfa, err_rate=c.error,
                       mut_rate=c.mutation, indel_frac=c.indels,
                       indel_ext=c.extindels, N=Nreads, dist=c.distance,
                       stdev=c.stdev, size_l=c.length, size_r=c.length,
                       max_n=0.05, is_hap=0, is_fixed=0, seed=0)
            os.remove(tmpfa)
            mate1hnew = os.path.abspath(c.haplodir + '/hr1.fq')
            mate2hnew = os.path.abspath(c.haplodir + '/hr2.fq')
            with open(mate1hnew, 'w') as out1, open(mate2hnew, 'w') as out2:
                for (name1, seq1, qual1), (name2, seq2, qual2) in zip(mp.fastx_read(mate1h), mp.fastx_read(mate2h)):
                    # change name1/name2
                    newname1 = '@c' + str(c.clonenumber) + 'h' + str(c.hapnumber) + 'fh_' + name1
                    newname2 = '@c' + str(c.clonenumber) + 'h' + str(c.hapnumber) + 'fh_' + name2
                    read1 = [newname1, seq1, '+', qual1]
                    read2 = [newname2, seq2, '+', qual2]
                    out1.write('\n'.join(read1) + '\n')
                    out2.write('\n'.join(read2) + '\n')
            os.remove(mate1h)
            os.remove(mate2h)
        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Mapping simulated reads to the reference genome')
        BAM = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.srt.bam')
        sam_cmd = ['minimap2', '-ax', 'sr', '--MD', '--cs', '-Y', '--sam-hit-only',
                   '-t', str(c.threads), '-R', '@RG\\tID:illumina\\tSM:bulk',
                   c.REF, mate1hnew, mate2hnew]
        bam_cmd = ['samtools', 'sort', '-@', str(round(c.threads / 2)), '-o', BAM]
        p1 = subprocess.Popen(sam_cmd, stderr=open(os.devnull, 'wb'), stdout=subprocess.PIPE)
        bout = open(BAM, 'wb')
        p2 = subprocess.run(bam_cmd, stdin=p1.stdout, stderr=open(os.devnull, 'wb'), stdout=bout)
        bout.close()
        os.remove(mate1hnew)
        os.remove(mate2hnew)