def main(fasta_filename, csv_filename):
    d = LazyFastaReader(fasta_filename)
    pCS, orphans = read_seq_csv(csv_filename)

    # detect PCR chimeras from the orphans
    chimeras = detect_PCR_chimeras(orphans, d)
    orphans = orphans.difference(chimeras)

    FileIO.write_seqids_to_fasta(orphans, "preCluster_out.orphans.fasta", d)
    FileIO.write_seqids_to_fasta(chimeras, "preCluster_out.chimeras.fasta", d)

    infof = open('preCluster.cluster_info.csv', 'w')
    infof.write("cluster,size\n")
    # write out a directory per preCluster cid in preCluster_out/<cid>
    # Liz note: right now, write out even directories with just 1 sequence
    # (we know they have "tucked" support, so can run Partial/Arrow on it)
    for cid in pCS.S:
        dirname = os.path.join("preCluster_out", str(cid))
        os.makedirs(dirname)
        fasta_file = os.path.join(dirname, 'isoseq_flnc.fasta')
        FileIO.write_seqids_to_fasta(pCS.S[cid].members, fasta_file, d)
        infof.write("{0},{1}\n".format(cid, len(pCS.S[cid].members)))
    infof.close()
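# For reference, a hypothetical example of the CSV consumed by read_seq_csv() above,
# mirroring the preCluster.output.csv format written by the full pipeline below
# (header "seqid,stat"; an 'M' record's stat column holds pCS.seq_map[seqid],
# other sequences are flagged tucked/orphan/chimera):
#
#   seqid,stat
#   m54006/100/ccs,0
#   m54006/101/ccs,tucked
#   m54006/102/ccs,orphan
#   m54006/103/ccs,chimera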
def chunk_collected_fasta_pickle(combined_fasta, combined_uc, combined_refs, num_chunks, chunk_prefix):
    # the seqids in combined_fasta are like b0_c0/2/1776; make "b0_c0" a key as well
    d = LazyFastaReader(combined_fasta, seqid_extraction=lambda x: x.split('/')[0])
    uc = combined_uc
    refs = combined_refs

    keys = list(uc.keys())
    keys.sort()

    n = len(keys) // num_chunks + 1  # chunk size; integer division for Python 3
    for i in range(num_chunks):
        _from = i * n
        _to = min(len(keys), (i + 1) * n)
        with open("{0}.chunk{1}.consensus.fasta".format(chunk_prefix, i), 'w') as f:
            for seqid in keys[_from:_to]:
                r = d[seqid]
                f.write(">{0}\n{1}\n".format(r.id, r.seq))
        # pickles must be written in binary mode
        with open("{0}.chunk{1}.pickle".format(chunk_prefix, i), 'wb') as f:
            dump({'uc': dict((k, uc[k]) for k in keys[_from:_to]),
                  'refs': dict((k, refs[k]) for k in keys[_from:_to])}, f)
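# A minimal sketch of reading one chunk back downstream. The chunk_prefix "combined"
# is an assumption for illustration; the pickle layout matches the dump above.
from pickle import load

with open("combined.chunk0.pickle", 'rb') as f:   # binary mode, matching the 'wb' dump
    chunk = load(f)
uc, refs = chunk['uc'], chunk['refs']
for cluster_id in uc:
    assert cluster_id in refs   # each chunk carries matching 'uc' and 'refs' entries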
def create_seed_n_batch_files(input='isoseq_flnc.fasta', fasta_d=None,
                              seed_filename='seed0.fasta', batch_pre='batch',
                              num_seqs_per_batch=NUM_SEQS_PER_BATCH):
    if fasta_d is None:
        fasta_d = LazyFastaReader(input)
    batch_files = []
    lens = [(r.id, len(r.seq)) for r in SeqIO.parse(open(input), 'fasta')]
    lens.sort(key=lambda x: x[1], reverse=True)  # longest first
    n = len(lens)

    # the seed starts at 1% of the data (skipping the longest 1% of reads)
    starting_seed_index = n // 100
    good = [x[0] for x in lens[starting_seed_index:starting_seed_index + num_seqs_per_batch]]
    write_seqids_to_fasta(good, seed_filename, fasta_d)

    batch_index = 1
    starting_index = starting_seed_index + num_seqs_per_batch
    while starting_index < n:
        batch_file = "{0}{1}.fasta".format(batch_pre, batch_index)
        write_seqids_to_fasta([x[0] for x in lens[starting_index:starting_index + num_seqs_per_batch]],
                              batch_file, fasta_d)
        batch_files.append(batch_file)
        starting_index += num_seqs_per_batch
        batch_index += 1
    # the longest 1% of reads (skipped above) become the final batch
    write_seqids_to_fasta([x[0] for x in lens[:starting_seed_index]],
                          "{0}{1}.fasta".format(batch_pre, batch_index), fasta_d)
    return batch_index + 1
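# A rough usage sketch of create_seed_n_batch_files(); filenames are the defaults above.
# It writes seed0.fasta plus batch1.fasta, batch2.fasta, ... into the current directory,
# with the longest 1% of reads ending up in the final batch.
d = LazyFastaReader('isoseq_flnc.fasta')
num_batchs = create_seed_n_batch_files(input='isoseq_flnc.fasta', fasta_d=d,
                                       seed_filename='seed0.fasta', batch_pre='batch')
for i in range(1, num_batchs):   # batches are numbered 1 .. num_batchs-1
    print("batch file:", "batch{0}.fasta".format(i))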
def write_select_seqs_to_fasta(fasta_filename, seqids, output_filename, mode='w'):
    d = LazyFastaReader(fasta_filename)
    with open(output_filename, mode) as f:
        for x in seqids:
            r = d[x]
            f.write(">{0}\n{1}\n".format(r.id, r.seq))
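# Example: pull two reads into their own FASTA (the read IDs are hypothetical).
write_select_seqs_to_fasta('isoseq_flnc.fasta',
                           ['m54006_170206_215027/35062814/30_1123_CCS',
                            'm54006_170206_215027/35062815/29_1054_CCS'],
                           'selected.fasta')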
def main(cpus, dun_make_bins=False, dun_use_partial=False, num_seqs_per_batch=100000, dun_cleanup_files=False):
    print("Indexing isoseq_flnc.fasta using LazyFastaReader...")
    d = LazyFastaReader('isoseq_flnc.fasta')

    print("Splitting input isoseq_flnc.fasta into seed/batches...")
    num_batchs = create_seed_n_batch_files(input='isoseq_flnc.fasta', fasta_d=d,
                                           seed_filename='seed0.fasta', batch_pre='batch',
                                           num_seqs_per_batch=num_seqs_per_batch)

    # step 1. run minimap of seed0 against itself and process
    o = ar.run_minimap('seed0.fasta', 'seed0.fasta', cpus=cpus)
    seqids = set(r.id for r in SeqIO.parse(open('seed0.fasta'), 'fasta'))
    pCS, orphans = sp.process_self_align_into_seed(o, seqids, MiniReader, dun_use_partial=dun_use_partial)
    # keep stats
    size_S = len(pCS.S)
    size_tucked = sum(v == 'T' for v in pCS.seq_stat.values())
    size_orphans = len(orphans)
    print("seed 0 initial: S {0}, tucked {1}, orphans {2}".format(size_S, size_tucked, size_orphans))

    # write out seed1.S.fasta and seed1.orphans.fasta
    FileIO.write_preClusterSet_to_fasta(pCS, 'seed1.S.fasta', d)
    FileIO.write_seqids_to_fasta(orphans, 'seed1.orphans.fasta', d)

    # step 2. minimap each batch against the growing seed set and process
    for i in range(1, num_batchs):
        pCS, orphans = add_batch(i, pCS, orphans, d, cpus=cpus, dun_use_partial=dun_use_partial)
        cleanup_precluster_intermediate_files(i)

    # detect PCR chimeras from the orphans
    chimeras = detect_PCR_chimeras(orphans, d)
    orphans = orphans.difference(chimeras)

    FileIO.write_seqids_to_fasta(orphans, "preCluster_out.orphans.fasta", d)
    FileIO.write_seqids_to_fasta(chimeras, "preCluster_out.chimeras.fasta", d)

    tucked_seqids = []

    # can't dump pCS, orphans, chimeras to a pickle yet --- pCS is an object
    #with open('preCluster.output.pickle', 'w') as f:
    #    dump({'pCS': pCS, 'chimeras': chimeras, 'orphans': orphans}, f)

    # write CSV file
    with open('preCluster.output.csv', 'w') as f:
        f.write("seqid,stat\n")
        for x, stat in pCS.seq_stat.items():
            if stat == 'T':
                f.write("{0},tucked\n".format(x))
                tucked_seqids.append(x)
            elif stat == 'M':
                f.write("{0},{1}\n".format(x, pCS.seq_map[x]))
        for x in orphans:
            f.write("{0},orphan\n".format(x))
        for x in chimeras:
            f.write("{0},chimera\n".format(x))

    # Liz: currently not using tucked...
    #FileIO.write_seqids_to_fasta(tucked_seqids, "preCluster_out.tucked.fasta", d)

    infof = open('preCluster.cluster_info.csv', 'w')
    infof.write("cluster,size\n")
    # write out a directory per preCluster cid in preCluster_out/<cid>
    # Liz note: right now, write out even directories with just 1 sequence
    # (we know they have "tucked" support, so can run Partial/Arrow on it)
    for cid in pCS.S:
        if not dun_make_bins:
            dirname = os.path.join("preCluster_out", str(cid))
            os.makedirs(dirname)
            fasta_file = os.path.join(dirname, 'isoseq_flnc.fasta')
            FileIO.write_seqids_to_fasta(pCS.S[cid].members, fasta_file, d)
        infof.write("{0},{1}\n".format(cid, len(pCS.S[cid].members)))
    infof.close()

    if not dun_cleanup_files:
        # clean up all seed* and batch* files
        for fn in glob.glob('batch*fasta*'):
            os.remove(fn)
        for fn in glob.glob('seed*fasta*'):
            os.remove(fn)
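# A hypothetical command-line wrapper for main() above (not necessarily the project's
# real CLI; the flag names simply mirror main()'s parameters).
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Pre-cluster isoseq_flnc.fasta before Iso-Seq clustering")
    parser.add_argument("--cpus", type=int, default=8, help="number of CPUs for minimap")
    parser.add_argument("--dun_make_bins", action="store_true", help="skip writing per-cluster bins")
    parser.add_argument("--dun_use_partial", action="store_true")
    parser.add_argument("--num_seqs_per_batch", type=int, default=100000)
    parser.add_argument("--dun_cleanup_files", action="store_true")
    args = parser.parse_args()
    main(args.cpus, dun_make_bins=args.dun_make_bins, dun_use_partial=args.dun_use_partial,
         num_seqs_per_batch=args.num_seqs_per_batch, dun_cleanup_files=args.dun_cleanup_files)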
# sanity checking of input files
for line in open(snps_filename):
    filename = line.strip()
    if not filename.endswith('.snps'):
        print("Input files listed in {0} must end with .snps!".format(snps_filename), file=sys.stderr)
        sys.exit(-1)
    if not os.path.exists(filename):
        print("{0} does not exist! Abort.".format(filename), file=sys.stderr)
        sys.exit(-1)
    snps_files.append(filename)

if not os.path.exists(genome_filename):
    print("Genome file {0} does not exist!".format(genome_filename), file=sys.stderr)
    sys.exit(-1)

print("Reading genome file {0}....".format(genome_filename), file=sys.stderr)
genome_d = LazyFastaReader(genome_filename)

# quick check: if the genome chromosome IDs carry a |arrow|arrow style suffix,
# also key the genome dict by the stripped chromosome name
keys = list(genome_d.keys())
for k in keys:
    k2 = k.split('|')[0]
    if k2 != k and k2 not in keys:
        genome_d.d[k2] = genome_d.d[k]
        print("Detected | string in chromosome ID, stripping {0} to {1}....".format(k, k2), file=sys.stderr)
print("Finished reading genome.", file=sys.stderr)

for snp_file in snps_files:
    assert snp_file.endswith('.snps')
    vcf_file = snp_file[:-5] + '.vcf'
    print("Processing {0} --> {1}".format(snp_file, vcf_file), file=sys.stderr)
    write_snp_to_vcf(snp_file, vcf_file, genome_filename, genome_d)
def pick_rep(fa_fq_filename, sam_filename, gff_filename, group_filename,
             output_filename, fusion_candidate_ranges, is_fq=False):
    """
    For each group, select the representative record.
    Always pick the longest one!
    """
    if is_fq:
        fd = LazyFastqReader(fa_fq_filename)
    else:
        fd = LazyFastaReader(fa_fq_filename)
    fout = open(output_filename, 'w')

    rep_info = {}
    id_to_rep = {}
    for line in open(group_filename):
        pb_id, members = line.strip().split('\t')
        print("Picking representative sequence for", pb_id, file=sys.stdout)
        best_id = None
        best_seq = None
        best_qual = None
        max_len = 0
        for x in members.split(','):
            if len(fd[x].seq) >= max_len:
                best_id = x
                best_seq = fd[x].seq
                best_qual = fd[x].letter_annotations['phred_quality'] if is_fq else None
                max_len = len(fd[x].seq)
        rep_info[pb_id] = (best_id, best_seq, best_qual)
        id_to_rep[best_id] = pb_id

    f_gff = open(gff_filename, 'w')
    coords = {}
    record_storage = {}  # temporary storage for the .1 record to write in conjunction with the later records
    for r in BioReaders.GMAPSAMReader(sam_filename, True):
        if r.qID in id_to_rep:
            pb_id = id_to_rep[r.qID]
            # make coordinates & store the SAM record under its isoform index
            isoform_index = get_isoform_index(fusion_candidate_ranges[r.qID], r.sID, r.sStart, r.sEnd)
            if r.qID not in coords:
                coords[r.qID] = [None] * len(fusion_candidate_ranges[r.qID])
                record_storage[pb_id] = [None] * len(fusion_candidate_ranges[r.qID])
            coords[r.qID][isoform_index] = "{0}:{1}-{2}({3})".format(r.sID, r.sStart, r.sEnd, r.flag.strand)
            record_storage[pb_id][isoform_index] = r

    for pb_id, records in record_storage.items():
        for i, r in enumerate(records):
            isoform_index = i + 1
            f_gff.write("{chr}\tPacBio\ttranscript\t{s}\t{e}\t.\t{strand}\t.\tgene_id \"{pi}\"; transcript_id \"{pi}.{j}\";\n".format(
                chr=r.sID, s=r.segments[0].start + 1, e=r.segments[-1].end,
                pi=pb_id, j=isoform_index, strand=r.flag.strand))
            for s in r.segments:
                f_gff.write("{chr}\tPacBio\texon\t{s}\t{e}\t.\t{strand}\t.\tgene_id \"{pi}\"; transcript_id \"{pi}.{j}\";\n".format(
                    chr=r.sID, s=s.start + 1, e=s.end,
                    pi=pb_id, j=isoform_index, strand=r.flag.strand))
    f_gff.close()

    for pb_id in rep_info:
        best_id, best_seq, best_qual = rep_info[pb_id]
        _id_ = "{0}|{1}|{2}".format(pb_id, "+".join(coords[best_id]), best_id)
        _seq_ = best_seq
        if is_fq:
            SeqIO.write(SeqRecord(_seq_, id=_id_,
                                  letter_annotations={'phred_quality': best_qual}),
                        fout, 'fastq')
        else:
            SeqIO.write(SeqRecord(_seq_, id=_id_), fout, 'fasta')
    fout.close()
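# A hedged usage sketch of the fusion pick_rep() above; every filename is hypothetical,
# and fusion_candidate_ranges is assumed to map each query ID to its list of candidate
# loci (the function indexes it per qID to size the coords/record_storage lists).
pick_rep('hq_isoforms.fastq',        # FASTQ of isoform sequences
         'hq_isoforms.sam',          # GMAP alignments of those isoforms
         'fusion.gff',               # output: GFF of representative alignments
         'fusion.group.txt',         # input: pb_id <tab> comma-separated member IDs
         'fusion.rep.fastq',         # output: one representative per group
         fusion_candidate_ranges,
         is_fq=True)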
def pick_rep(fa_fq_filename, sam_filename, gff_filename, group_filename,
             output_filename, is_fq=False, pick_least_err_instead=False):
    """
    For each group, select the representative record.

    If is a FASTA file (is_fq False) -- then always pick the longest one.
    If is a FASTQ file (is_fq True) -- then
          if pick_least_err_instead is True, pick the one w/ least number of expected base errors
          else, pick the longest one
    """
    if is_fq:
        fd = LazyFastqReader(fa_fq_filename)
    else:
        fd = LazyFastaReader(fa_fq_filename)
    fout = open(output_filename, 'w')

    rep_info = {}
    id_to_rep = {}
    for line in open(group_filename):
        pb_id, members = line.strip().split('\t')
        print("Picking representative sequence for", pb_id, file=sys.stderr)
        best_id = None
        best_seq = None
        best_qual = None
        best_err = 9999999
        err = 9999999
        max_len = 0
        for x in members.split(','):
            if is_fq and pick_least_err_instead:
                # expected number of base errors: sum of per-base error probabilities 10^(-Q/10)
                err = sum(10 ** -(i / 10.) for i in fd[x].letter_annotations['phred_quality'])
            if (is_fq and pick_least_err_instead and err < best_err) or \
               ((not is_fq or not pick_least_err_instead) and len(fd[x].seq) >= max_len):
                best_id = x
                best_seq = fd[x].seq
                if is_fq:
                    best_qual = fd[x].letter_annotations['phred_quality']
                    best_err = err
                max_len = len(fd[x].seq)
        rep_info[pb_id] = (best_id, best_seq, best_qual)
        id_to_rep[best_id] = pb_id

    f_gff = open(gff_filename, 'w')
    coords = {}
    record_storage = {}  # temporary storage for the .1 record to write in conjunction with the later records
    for r in BioReaders.GMAPSAMReader(sam_filename, True):
        if r.qID in id_to_rep:
            pb_id = id_to_rep[r.qID]
            best_id, best_seq, best_qual = rep_info[pb_id]
            # make coordinates & record the SAM alignment
            if r.qID not in coords:
                # this is the .1 portion
                coords[r.qID] = "{0}:{1}-{2}({3})".format(r.sID, r.sStart, r.sEnd, r.flag.strand)
                record_storage[pb_id] = [r]
            else:
                # this is the .2 portion, or even .3, .4...! handle fusions with > 2 loci correctly
                coords[r.qID] += "+{0}:{1}-{2}({3})".format(r.sID, r.sStart, r.sEnd, r.flag.strand)
                record_storage[pb_id].append(r)

    for pb_id, records in record_storage.items():
        for i, r in enumerate(records):
            isoform_index = i + 1
            f_gff.write("{chr}\tPacBio\ttranscript\t{s}\t{e}\t.\t{strand}\t.\tgene_id \"{pi}\"; transcript_id \"{pi}.{j}\";\n".format(
                chr=r.sID, s=r.segments[0].start + 1, e=r.segments[-1].end,
                pi=pb_id, j=isoform_index, strand=r.flag.strand))
            for s in r.segments:
                f_gff.write("{chr}\tPacBio\texon\t{s}\t{e}\t.\t{strand}\t.\tgene_id \"{pi}\"; transcript_id \"{pi}.{j}\";\n".format(
                    chr=r.sID, s=s.start + 1, e=s.end,
                    pi=pb_id, j=isoform_index, strand=r.flag.strand))
    f_gff.close()

    for pb_id in rep_info:
        best_id, best_seq, best_qual = rep_info[pb_id]
        _id_ = "{0}|{1}|{2}".format(pb_id, coords[best_id], best_id)
        _seq_ = best_seq
        if is_fq:
            SeqIO.write(SeqRecord(_seq_, id=_id_,
                                  letter_annotations={'phred_quality': best_qual}),
                        fout, 'fastq')
        else:
            SeqIO.write(SeqRecord(_seq_, id=_id_), fout, 'fasta')
    fout.close()
def write_snp_to_vcf(snp_filename, vcf_filename, genome_filename, genome_d=None):
    # read the genome if genome_d is not given
    if genome_d is None:
        genome_d = LazyFastaReader(genome_filename)

    # read the first SNP record so we know the query name
    snp_reader = SNPReader(snp_filename)
    snp_rec = next(snp_reader)
    sample_name = snp_rec.query_name
    cur_recs = [snp_rec]
    genome_rec = genome_d[snp_rec.ref_name]

    with open('template.vcf', 'w') as f:
        f.write(__VCF_EXAMPLE__ + '\n')
    reader = vcf.VCFReader(open('template.vcf'))
    reader.samples = [sample_name]
    f_vcf = vcf.Writer(open(vcf_filename, 'w'), reader)

    def flush_cur_recs(cur_recs, genome_rec):
        """Convert the accumulated .snps records into one VCF record and write it.

        Multiple records mean it could be:
        1. a multi-nucleotide insertion
        2. a multi-nucleotide deletion
        """
        if len(cur_recs) == 1 and cur_recs[0].ref_base != '.' and cur_recs[0].query_base != '.':
            # just a SNP record
            pos = cur_recs[0].ref_pos
            ref_base = cur_recs[0].ref_base
            alt_base = cur_recs[0].query_base
        elif cur_recs[0].ref_base == '.':
            # a single- or multi-nt insertion; must retrieve the ref base from the genome
            # ex: in .snps it is . --> ATG
            #     in VCF it should be T --> TATG (meaning insertion of ATG)
            pos = cur_recs[0].ref_pos
            ref_base = genome_rec[cur_recs[0].ref_pos]
            alt_base = ref_base + "".join(r.query_base for r in cur_recs)
        else:
            # a single- or multi-nt deletion; need one more ref base before the first deleted base
            # ex: in .snps it is GGG --> deletion
            #     in VCF it should be TGGG --> T (meaning deletion of GGG)
            pos = cur_recs[0].ref_pos - 1
            ref_base_prev = genome_rec[pos]
            ref_base = ref_base_prev + "".join(r.ref_base for r in cur_recs)
            alt_base = ref_base_prev
        rec = vcf.model._Record(CHROM=cur_recs[0].ref_name, POS=pos + 1, ID='.',
                                REF=ref_base, ALT=[vcf.model._Substitution(alt_base)],
                                QUAL='.', FILTER='PASS', INFO={'AF': 0.5},
                                FORMAT="GT", sample_indexes=None)
        samp_ft = vcf.model.make_calldata_tuple(['GT'])
        rec.samples.append(vcf.model._Call(rec, sample_name, samp_ft(*["0|1"])))
        f_vcf.write_record(rec)

    for r1 in snp_reader:
        if r1.ref_pos == cur_recs[-1].ref_pos:
            # multi-nt insertion, keep recording
            cur_recs.append(r1)
        elif r1.query_base == '.' and cur_recs[-1].query_base == '.':
            # multi-nt deletion, keep recording
            cur_recs.append(r1)
        else:
            # time to write out the current set of records
            flush_cur_recs(cur_recs, genome_rec)
            if r1.ref_name != cur_recs[0].ref_name:
                genome_rec = genome_d[r1.ref_name]
            cur_recs = [r1]
    # flush the final set of records, otherwise the last variant would be silently dropped
    if cur_recs:
        flush_cur_recs(cur_recs, genome_rec)
    f_vcf.close()
__author__ = 'lachesis'

import os, sys
from Bio import SeqIO
from cupcake.io.SeqReaders import LazyFastaReader
from cupcake2.io.FileIO import write_seqids_to_fasta

input = 'isoseq_flnc.fasta'
NUM_SEQS_PER_BATCH = 200000

d = LazyFastaReader(input)
lens = [(r.id, len(r.seq)) for r in SeqIO.parse(open(input), 'fasta')]
lens.sort(key=lambda x: x[1], reverse=True)  # longest first
n = len(lens)

# the seed starts at 1% of the data (skipping the longest 1% of reads)
starting_seed_index = n // 100
good = [x[0] for x in lens[starting_seed_index:starting_seed_index + NUM_SEQS_PER_BATCH]]
write_seqids_to_fasta(good, 'seed0.fasta', d)

batch_index = 1
starting_index = starting_seed_index + NUM_SEQS_PER_BATCH
while starting_index < n:
    write_seqids_to_fasta([x[0] for x in lens[starting_index:starting_index + NUM_SEQS_PER_BATCH]],
                          "batch{0}.fasta".format(batch_index), d)
    starting_index += NUM_SEQS_PER_BATCH
    batch_index += 1
# sanity checking of input files
for line in open(snps_filename):
    filename = line.strip()
    if not filename.endswith('.snps'):
        print("Input files listed in {0} must end with .snps!".format(snps_filename), file=sys.stderr)
        sys.exit(-1)
    if not os.path.exists(filename):
        print("{0} does not exist! Abort.".format(filename), file=sys.stderr)
        sys.exit(-1)
    snps_files.append(filename)

if not os.path.exists(genome_filename):
    print("Genome file {0} does not exist!".format(genome_filename), file=sys.stderr)
    sys.exit(-1)

print("Reading genome file {0}....".format(genome_filename), file=sys.stderr)
genome_d = LazyFastaReader(genome_filename)

# quick check: if the genome chromosome IDs carry a |arrow|arrow style suffix,
# also key the genome dict by the stripped chromosome name
keys = list(genome_d.keys())
for k in keys:
    k2 = k.split('|')[0]
    if k2 != k and k2 not in keys:
        genome_d.d[k2] = genome_d.d[k]
        print("Detected | string in chromosome ID, stripping {0} to {1}....".format(k, k2), file=sys.stderr)
print("Finished reading genome.", file=sys.stderr)

for snp_file in snps_files:
    assert snp_file.endswith('.snps')
    vcf_file = snp_file[:-5] + '.vcf'
    print("Processing {0} --> {1}".format(snp_file, vcf_file), file=sys.stderr)
    write_snp_to_vcf(snp_file, vcf_file, genome_filename, genome_d)