def fas2table(args):
    """Write the table built from an alignment and a reference to a file.

    Loads ``args.infile`` and ``args.reffile``, hands both to
    ``generate_table`` and writes one line per ``(label, muts)`` pair to
    ``args.outfile``: the label followed by ``/``, a tab, and the
    space-joined items of ``muts``.
    """
    alignment = load(args.infile)
    reference = load(args.reffile)
    rows = generate_table(alignment, reference)

    with open(args.outfile, 'w') as fout:
        for label, muts in rows:
            joined = ' '.join(muts)
            fout.write('%s/\t%s\n' % (label, joined))

    cerr('[Writing table to %s]' % args.outfile)
def map_sequences():
    """Map every contig onto the reference, retrying on the opposite strand.

    Takes no parameters: ``contigsfile``, ``args``, ``ref`` and
    ``max_mismatch`` are all read from the enclosing scope.  Each contig is
    passed to ``map_sequence``; when the reported start is negative the
    contig is replaced by ``funcs.reverse_complemented(contig)`` and mapped
    once more.  A contig whose start is still negative is skipped.
    """
    contigs = bioio.load(contigsfile)
    # NOTE(review): the reference is loaded into ``rseq`` but the calls below
    # pass ``ref`` -- presumably these should be the same object; confirm.
    rseq = bioio.load(args.reffile)

    for contig in contigs:
        # map contig to ref sequence
        start, end, mismatch, _, _ = map_sequence(contig, ref, max_mismatch)

        if start < 0:
            # no hit as given: try again with the reverse complement
            contig = funcs.reverse_complemented(contig)
            start, end, mismatch, _, _ = map_sequence(
                contig, ref, max_mismatch)

        if start < 0:
            continue
def main(args):
    """Merge sequence files, relabel sequences numerically and write a table.

    Every file in ``args.files`` is loaded and sorted by label.  Each
    sequence gets a running number ``n`` (starting at 1); an upper-cased copy
    labelled ``'%04d' % n`` is collected and saved to ``args.outfile``.  A
    tab-separated table mapping each new label to the original label and to
    the ``collection_date``, ``country`` and ``isolate`` attributes plus the
    definition is written to ``args.tabfile``.
    """
    rows = []
    container = bioio.multisequence()
    n = 1

    for infile in args.files:
        mseqs = bioio.load(infile)
        mseqs.sort(lambda x: x.label)
        for s in mseqs:
            rows.append((
                n,
                s.label,
                s.attr.get('collection_date', ''),
                s.attr.get('country', ''),
                s.attr.get('isolate', ''),
                s.definition,
            ))
            container.append(bioio.biosequence('%04d' % n, s.seq.upper()))
            n += 1

    # write to output file; the context manager guarantees the handle is
    # closed even if formatting or writing a row fails
    rows.sort()
    with open(args.tabfile, 'w') as tabfile:
        tabfile.write('LABEL\tACCNO\tDATE\tCOUNTRY\tISOLATE\tDEFINITION\n')
        for r in rows:
            tabfile.write('%04d\t%s\t%s\t%s\t%s\t%s\n' % r)

    bioio.save(container, args.outfile)
def main(args):
    """Concatenate sequence files and replace labels with running numbers.

    All files in ``args.files`` are loaded and accumulated into one
    container.  Each sequence is then relabelled ``0001``, ``0002``, ... in
    container order.  The container is saved to ``args.outfile`` when given,
    and the ``new label<TAB>old label`` pairs are written to ``args.tabfile``
    when given.
    """
    merged = None
    for infile in args.files:
        loaded = bioio.load(infile, options=args.io_opts or [])
        cout('reading %d sequences from %s' % (len(loaded), infile))
        if merged is None:
            merged = loaded
        else:
            merged += loaded

    indexes = []
    for counter, s in enumerate(merged, 1):
        new_label = '%04d' % counter
        indexes.append((new_label, s.label))
        s.label = new_label

    if args.outfile:
        bioio.save(merged, args.outfile, options=args.io_opts or [])

    if args.tabfile:
        with open(args.tabfile, 'w') as f:
            f.writelines('%s\t%s\n' % pair for pair in indexes)
def main(args):
    """Translate the sequences of all input files and save the result.

    When ``args.start_sequence`` is set it is upper-cased and encoded to
    ASCII bytes (``args`` is updated in place), and each degapped,
    upper-cased sequence is searched for it with
    ``funcs.search_restriction_site``.  Sequences without exactly one hit are
    dropped; the others are translated starting one past the reported hit
    position.  Without a start sequence, translation starts at
    ``args.start_codon``.  All results are saved to ``args.outfile``.
    """
    aaseqs = bioio.multisequence()

    if args.start_sequence:
        args.start_sequence = args.start_sequence.upper().encode('ASCII')

    for infile in args.files:
        mseq = bioio.load(infile, options=args.io_opts)
        cout('reading %d sequences from %s' % (len(mseq), infile))

        for seq in mseq:
            aaseq = seq.clone()

            if not args.start_sequence:
                aaseq.set_sequence(
                    funcs.translated(seq, start_pos=args.start_codon))
                aaseqs.append(aaseq)
                continue

            # the restriction-site search is reused here to locate the
            # position of the start sequence
            target_seq = funcs.uppercased(funcs.degapped(seq))
            res = funcs.search_restriction_site(
                target_seq, args.start_sequence)
            if len(res) != 1:
                continue

            offset = res[0][0]
            print(target_seq[offset:offset + 30])
            aaseq.set_sequence(
                funcs.translated(target_seq, start_pos=offset + 1))
            aaseqs.append(aaseq)

    bioio.save(aaseqs, args.outfile)
def seq2fst(args):
    """Compute FST between sequence groups and write it to a file.

    Sequences from ``args.infile`` are assigned to groups using the group
    information parsed by ``grpparser.GroupParser(args)``; sequences without
    a group are reported and skipped.  With ``args.sitefile`` set, the
    per-site result of ``calc_site_fst`` is written there and the function
    returns.  Otherwise the matrix from ``calc_fst`` is written to
    ``args.outfile`` below a tab-joined header of group names.
    """
    # open and read sequence file
    cerr('[I - reading sequence file %s]' % args.infile)
    seqs = load(args.infile)

    # group/meta information is mandatory for this command
    if args.groupfile or args.metafile:
        cerr('[I - reading group information file]')
        group_parser = grpparser.GroupParser(args)
        group_parser.parse()

        group_seqs = {}
        for seq in seqs:
            label = seq.label.decode('ASCII')
            try:
                grp = group_parser.group_info[label]
            except KeyError:
                cerr('[W - sample %s is not assign to any group]' % label)
                continue

            members = group_seqs.get(grp)
            if members is None:
                members = multisequence()
                group_seqs[grp] = members
            members.append(seq)
    else:
        cexit('[ERR - seq2fst.py requires group information!]')

    for grp_name, members in group_seqs.items():
        cerr('[I - group %s has %d sample(s)]' % (grp_name, len(members)))

    if args.sitefile:
        # perform FST site-wise
        FST_sites = calc_site_fst(group_seqs, args.nantozero)
        with open(args.sitefile, 'w') as fout:
            for label, mat in FST_sites:
                fout.write(label)
                fout.write('\t')
                np.savetxt(fout, mat, fmt='%5.4f', delimiter='\t',
                           newline='\t')
                fout.write('\n')
        cerr('[I - site FST written to %s]' % (args.sitefile))
        return

    FST_mat, groups = calc_fst(group_seqs)
    with open(args.outfile, 'w') as fout:
        fout.write('\t'.join(groups))
        fout.write('\n')
        np.savetxt(fout, FST_mat, fmt='%5.4f', delimiter='\t')
def gather_consensus( args ):
    """Collect per-sample consensus sequences and statistics into one place.

    For every entry of ``args.indir`` (in sorted order) the file
    ``<indir>/<entry>/<args.consfile>`` is loaded and its first sequence is
    collected; the second line of ``<indir>/<entry>/<args.statfile>`` is
    collected as that sample's statistics, and the first line of the first
    statistics file seen is used as header.  Sequences from ``args.add`` (if
    given) are put in front.  Results go to ``consensus.fas`` and
    ``stats.tsv`` inside ``args.outdir``, which defaults to
    ``args.indir + '-results'`` and is stored back on ``args``.
    """
    # set output directory
    if not args.outdir:
        args.outdir = args.indir + '-results'

    cons = multisequence()
    header = None
    stat_lines = []

    if args.add:
        cons.extend(load(args.add))

    for indir in sorted(os.listdir(args.indir)):
        seqpath = os.path.join(args.indir, indir, args.consfile)
        print(args.indir, indir, args.consfile, seqpath)
        try:
            seqs = load(seqpath)
        except (FileNotFoundError, NotADirectoryError):
            # NotADirectoryError: the listing entry is a plain file, so it
            # cannot contain a consensus file either
            cerr('[WARN: no such file: %s]' % (seqpath))
            continue
        cons.append(seqs[0])

        statpath = os.path.join(args.indir, indir, args.statfile)
        with open(statpath) as fin:
            lines = fin.read().split('\n')
        if not header:
            header = lines[0].strip()
        stat_lines.append(lines[1].strip())

    # an already existing directory is fine; any other failure (e.g.
    # permissions) is reported here instead of being silently swallowed
    os.makedirs(args.outdir, exist_ok=True)

    save(cons, os.path.join(args.outdir, 'consensus.fas'))
    with open(os.path.join(args.outdir, 'stats.tsv'), 'w') as fout:
        # header stays None when no input directory produced statistics
        if header is not None:
            fout.write(header)
            fout.write('\n')
        fout.write('\n'.join(stat_lines))

    cerr(f'[Writing results to directory {args.outdir}]')
def main( args ):
    """Condense the sequences of ``args.infile`` and save them.

    The loaded sequences are passed through ``funcs.condensed`` and written
    to ``args.outfile``.  When ``args.report`` is set, ``write_report`` is
    called with the condensed sequences and that value.
    """
    mseq = bioio.load(args.infile, options=args.io_opts)
    cout('reading %d sequences from %s' % (len(mseq), args.infile))

    condensed = funcs.condensed(mseq)
    bioio.save(condensed, args.outfile)

    if not args.report:
        return
    write_report(condensed, args.report)
def main(args):
    """Recircularize long sequences against a reference and save all of them.

    Each sequence of ``args.infile`` is cloned.  When ``args.minlen`` is
    positive and the sequence is longer than it, the clone receives the
    result of ``recircularize_sequence`` against the first sequence of
    ``args.reffile``; otherwise it keeps the original sequence data.  All
    clones are saved to ``args.outfile``.
    """
    circseqs = bioio.multisequence()
    mseq = bioio.load(args.infile, options=args.io_opts)
    rseq = bioio.load(args.reffile)

    for seq in mseq:
        circseq = seq.clone()
        long_enough = args.minlen > 0 and len(seq) > args.minlen

        if long_enough:
            print('seq:', circseq.label)
            new_sequence = recircularize_sequence(
                seq.seq, rseq[0].seq, max_mismatch=args.max_mismatch)
        else:
            new_sequence = seq.seq

        circseq.set_sequence(new_sequence)
        circseqs.append(circseq)

    bioio.save(circseqs, args.outfile)
def main(args):
    """Collect the tree neighbours of reference taxa and save their sequences.

    The Newick tree in ``args.treefile`` is read with DendroPy.  Taxa whose
    label is found in ``args.reffile`` are the reference taxa.  For each of
    them the ``args.collect`` taxa with the smallest value of the tree's
    phylogenetic distance matrix are collected, together with the reference
    taxon itself.  The sequences of all collected taxa are looked up by label
    in ``args.dbfile`` and saved to ``args.outfile``.

    Nothing is collected or written unless ``args.collect`` is positive.
    """
    import dendropy

    tree = dendropy.Tree.get(path=args.treefile, schema="newick")
    pdc = tree.phylogenetic_distance_matrix()
    cerr('Reading: %d taxa' % len(tree.taxon_namespace))

    if args.collect > 0:
        ref_seqs = bioio.load(args.reffile)
        ref_taxa = []
        for taxon in tree.taxon_namespace:
            if ref_seqs.get_by_label(taxon.label) is not None:
                print('appended')
                ref_taxa.append(taxon)
        cerr('Referenced: %d taxa' % len(ref_taxa))

        collected_taxa = set()
        for t1 in ref_taxa:
            # NOTE(review): the last taxon of the namespace is left out of
            # the candidates -- presumably deliberate (outgroup?); confirm.
            d = [(pdc(t1, t2), t2) for t2 in tree.taxon_namespace[:-1]]
            d.sort()
            # slicing takes at most the requested number of neighbours and
            # cannot raise IndexError when fewer candidates are available
            for _, neighbour in d[:args.collect]:
                collected_taxa.add(neighbour)
            collected_taxa.add(t1)
        cerr('Collected: %d taxa' % len(collected_taxa))

        db_seqs = bioio.load(args.dbfile)
        mseq = bioio.multisequence()
        for taxon in collected_taxa:
            seq = db_seqs.get_by_label(taxon.label)
            if seq is None:
                # do not put a None placeholder into the output container
                cerr('WARNING: %s not found in %s' % (taxon.label, args.dbfile))
                continue
            mseq.append(seq)
        bioio.save(mseq, args.outfile)
def main(args):
    """Recircularize one specific sequence against a reference and save it.

    Only sequences whose label passes the hard-coded comparison below are
    processed.  A warning is emitted when such a sequence is shorter than the
    first reference sequence.  When ``args.minlen`` is positive and the
    sequence is longer than it, the clone receives the result of
    ``recircularize_sequence``; otherwise it keeps the original sequence
    data.  The clones are saved to ``args.outfile``.
    """
    circseqs = bioio.multisequence()
    mseq = bioio.load(args.infile, options=args.io_opts)
    rseq = bioio.load(args.reffile)

    for seq in mseq:
        # NOTE(review): hard-coded label filter, looks like a debugging
        # leftover; it also compares against a str literal -- confirm the
        # type of seq.label and whether the filter should stay.
        if seq.label != 'NODE_2_length_4501_cov_41.785':
            continue

        if len(seq) < len(rseq[0]):
            cerr('WARNING: %s is shorter than reference' % seq.label)

        circseq = seq.clone()
        long_enough = args.minlen > 0 and len(seq) > args.minlen

        if long_enough:
            print('seq:', circseq.label)
            new_sequence = recircularize_sequence(
                seq.seq, rseq[0].seq, max_mismatch=args.max_mismatch)
        else:
            new_sequence = seq.seq

        circseq.set_sequence(new_sequence)
        circseqs.append(circseq)

    bioio.save(circseqs, args.outfile)
def main(args):
    """Concatenate sequence files, then filter, sort, summarise and save.

    Steps, in order: load and accumulate all ``args.files``; call
    ``append_attributes``; optionally ``set_label_to_accno`` (``args.accno``)
    and ``degap`` (``args.degap``); drop sequences shorter than
    ``args.minlen``, longer than ``args.maxlen`` or whose fraction of ``N``
    bytes exceeds ``args.maxN`` (each filter is active only when its value is
    positive); sort by descending length or by label depending on the prefix
    of ``args.sort``; print per-sequence base counts (``args.summary``); save
    to ``args.outfile`` when given.
    """
    container = None
    for infile in args.files:
        obj = bioio.load(infile, options=args.io_opts or [])
        cout('reading %d sequences from %s' % (len(obj), infile))
        if container is None:
            container = obj
        else:
            container += obj

    append_attributes(container, args.src, args.src_isolate, args.definition)

    if args.accno:
        set_label_to_accno(container)

    if args.degap:
        container.degap()

    if args.minlen > 0 or args.maxlen > 0 or args.maxN > 0:
        new_container = bioio.multisequence()
        for s in container:
            length = len(s)
            if args.minlen > 0 and length < args.minlen:
                continue
            if args.maxlen > 0 and length > args.maxlen:
                continue
            # a zero-length sequence contains no N; checking the length first
            # avoids a ZeroDivisionError when no minimum length is enforced
            if (args.maxN > 0 and length > 0
                    and s.seq.count(b'N') / length > args.maxN):
                continue
            new_container.append(s)
        container = new_container

    if args.sort:
        if args.sort.startswith('len'):
            container.sort(lambda x: len(x), reverse=True)
        elif args.sort.startswith('lab'):
            container.sort(lambda x: x.label)

    if args.summary:
        for s in container:
            seq = s.seq.upper()
            print(">%s\nA:%d\tC:%d\tG:%d\tT:%d\t-:%d" % (
                s.label.decode('ASCII'),
                seq.count(b'A'), seq.count(b'C'), seq.count(b'G'),
                seq.count(b'T'), seq.count(b'-')))

    if args.outfile:
        bioio.save(container, args.outfile, options=args.io_opts or [])
def statseq( args ):
    """Report per-sequence counts of A, C, G, T, N and gap characters.

    For each sequence of ``args.infile`` the upper-cased data is scanned for
    the six symbols; the counts, their sum ``L`` and the sequence label are
    sent to ``cout`` on one line.
    """
    symbols = (b'A', b'C', b'G', b'T', b'N', b'-')
    mseq = bioio.load(args.infile, options=args.io_opts or [])

    for s in mseq:
        seq = s.seq.upper()
        counts = tuple(seq.count(symbol) for symbol in symbols)
        total = sum(counts)
        cout('A: %3d C: %3d G: %3d T: %3d N: %3d -: %3d L: %3d | \t%s'
             % (counts + (total, s.label)))
def main( args ):
    """Translate every sequence of the input files and save the result.

    Each sequence is cloned and the clone's data is replaced by
    ``funcs.translated(seq, start_pos=args.start_codon)``.  The clones of all
    files are saved together to ``args.outfile``.
    """
    aaseqs = bioio.multisequence()

    for infile in args.files:
        mseq = bioio.load(infile, options=args.io_opts)
        cout('reading %d sequences from %s' % (len(mseq), infile))

        for seq in mseq:
            aaseq = seq.clone()
            protein = funcs.translated(seq, start_pos=args.start_codon)
            aaseq.set_sequence(protein)
            aaseqs.append(aaseq)

    bioio.save(aaseqs, args.outfile)
def main(args):
    """Trim trace files and save the trimmed base calls as sequences.

    Every file in ``args.files`` is loaded and passed to ``traceutils.trim``
    with ``args.winsize`` and ``args.qual_threshold``.  Files for which the
    trim result is falsy are skipped.  Otherwise a sequence labelled with the
    file name is created from the returned bases, annotated with the
    ``upstream_trim`` and ``downstream_trim`` values (as strings) and
    collected.  The collection is saved to ``args.outfile``.
    """
    trimmed = bioio.multisequence()

    for infile in args.files:
        trace = bioio.load(infile)
        result = traceutils.trim(trace, args.winsize, args.qual_threshold)
        if result:
            bases, quals, upstream_trim, downstream_trim = result
            seq = bioio.biosequence(infile, bases)
            for attr_name, attr_value in (
                    ('upstream_trim', upstream_trim),
                    ('downstream_trim', downstream_trim)):
                seq.add_attr(attr_name, str(attr_value))
            trimmed.append(seq)

    bioio.save(trimmed, args.outfile)
def seq2pi(args):
    """Report the ``calc_pi`` result (average, std. dev.) for each group.

    Sequences from ``args.infile`` are grouped with the information parsed by
    ``grpparser.GroupParser(args)`` when ``args.groupfile`` or
    ``args.metafile`` is set (sequences without a group are reported and
    skipped); otherwise all sequences form the single group ``ALL``.  For
    each group the two values returned by ``calc_pi`` are printed, and also
    written as a tab-separated row to ``args.outfile`` when it is given.
    """
    # open and read sequence file
    cerr('[I - reading sequence file %s]' % args.infile)
    seqs = load(args.infile)

    # open and read group/meta file using groupfile/metafile if available
    if args.groupfile or args.metafile:
        cerr('[I - reading group information file]')
        group_parser = grpparser.GroupParser(args)
        group_parser.parse()

        group_seqs = {}
        for seq in seqs:
            label = seq.label.decode('ASCII')
            try:
                grp = group_parser.group_info[label]
            except KeyError:
                cerr('[W - sample %s is not assign to any group]' % label)
                continue
            if grp in group_seqs:
                group_seqs[grp].append(seq)
            else:
                ms = multisequence()
                ms.append(seq)
                group_seqs[grp] = ms
    else:
        group_seqs = {'ALL': seqs}

    print('Groups:')
    outf = open(args.outfile, 'w') if args.outfile else None
    try:
        if outf is not None:
            outf.write('GROUP\tN\tPI\tSTDDEV\n')

        for g in group_seqs:
            avg, stddev = calc_pi(group_seqs[g])
            cout(' %20s [%3d]: %f +- %f'
                 % (g, len(group_seqs[g]), avg, stddev))
            if outf is not None:
                outf.write('%s\t%d\t%5.4f\t%5.4f\n'
                           % (g, len(group_seqs[g]), avg, stddev))
    finally:
        # close the result file even when calc_pi or a write fails, so the
        # rows written so far are flushed and the handle is not leaked
        if outf is not None:
            outf.close()

    if outf is not None:
        cerr('[I - result written to %s]' % args.outfile)
def main(args):
    """Group sequences into unique haplotypes and print each group's labels.

    Sequences of ``args.infile`` are keyed by ``sha256`` of their sequence
    data; labels sharing a key are listed together under a running
    ``Haplo <n>`` heading.
    """
    mseq = bioio.load(args.infile, options=args.io_opts or [])
    print('Number of seqs: %d' % len(mseq))

    # get unique haplotype and sample cluster
    haplotypes = {}
    for seq in mseq:
        seq_hash = sha256(seq.seq)
        # hashlib hash objects hash and compare by identity, so two identical
        # sequences would never share a dictionary key; use the digest bytes
        # instead.  A helper that already returns a plain value is kept as is.
        if hasattr(seq_hash, 'digest'):
            seq_hash = seq_hash.digest()
        haplotypes.setdefault(seq_hash, []).append(seq.label)

    print('Number of unique haplotypes: %d' % len(haplotypes))
    for idx, labels in enumerate(haplotypes.values()):
        print('Haplo %d =>' % idx)
        for label in labels:
            print(' %s' % label)
def main(args):
    """Append a date-derived suffix to every sequence label.

    ``args.tabfile`` is a tab-separated table whose first line is skipped;
    rows are indexed by their first column.  For each sequence of
    ``args.infile`` the row with the sequence's label is looked up, its third
    column is searched with ``re_date`` and the label becomes
    ``<label>/<match>`` (``<label>/-`` without a match).  The sequences are
    saved to ``args.outfile``.

    Raises:
        KeyError: a sequence label has no row in the table.
    """
    # read tables
    tables = {}
    with open(args.tabfile) as tabfile:
        # skip the header line; tolerate a completely empty file
        next(tabfile, None)
        for line in tabfile:
            items = line.strip().split('\t')
            tables[items[0]] = items

    mseq = bioio.load(args.infile)
    for s in mseq:
        rec = tables[s.label]
        # strip() above drops trailing empty columns, so a row with an empty
        # date (and nothing after it) may have fewer than three fields
        date_field = rec[2] if len(rec) > 2 else ''
        mo = re_date.search(date_field)
        year = mo.group() if mo else '-'
        s.label = '%s/%s' % (s.label, year)

    bioio.save(mseq, args.outfile)
def main(args):
    """Concatenate sequence files, optionally summarise, and save them.

    All ``args.files`` are loaded and accumulated into one container, which
    is passed to ``append_attributes``.  With ``args.summary`` the counts of
    A, C, G, T and gap bytes of each upper-cased sequence are printed.  The
    container is saved to ``args.outfile`` when given.
    """
    io_options = args.io_opts or []

    merged = None
    for infile in args.files:
        loaded = bioio.load(infile, options=args.io_opts or [])
        cout('reading %d sequences from %s' % (len(loaded), infile))
        if merged is None:
            merged = loaded
        else:
            merged += loaded

    append_attributes(merged, args.src, args.src_isolate, args.definition)

    if args.summary:
        for s in merged:
            seq = s.seq.upper()
            name = s.label.decode('ASCII')
            counts = tuple(
                seq.count(symbol) for symbol in (b'A', b'C', b'G', b'T', b'-'))
            print(">%s\nA:%d\tC:%d\tG:%d\tT:%d\t-:%d" % ((name,) + counts))

    if args.outfile:
        bioio.save(merged, args.outfile, options=io_options)
def file_open(self, filename=None):
    """Load a file and show it, asking for a file name when none is given.

    Without ``filename`` a Qt open-file dialog is shown; the method returns
    silently when no file is chosen.  A missing file or a falsy load result
    is reported through ``alert``.  A progress widget is shown while
    ``bioio.load`` runs, and a loaded object is passed to ``self.view``.
    """
    if not filename:
        chosen = QtWidgets.QFileDialog.getOpenFileName(
            self.pane(), "Open project, alignment or trace file")
        filename, file_filter = chosen
    if not filename:
        # dialog dismissed without a selection
        return

    cout("Loading file %s" % filename)
    if not os.path.exists(filename):
        alert('File %s does not exists. Please check your filename!' % filename)
        return

    b = progress('Opening ' + filename)
    b.repaint()
    obj = bioio.load(filename)
    b.hide()
    del b

    if not obj:
        alert('Error reading file ' + filename + '\nUnknown file format!')
        return
    self.view(obj)
def dropEvent(self, ev):
    """Handle a drop of a file URL or of a row dragged from a view.

    Three kinds of drops are recognised:

    * MIME data with URLs: the path of the first URL is loaded with
      ``bioio.load``; an object with ``get_sequence`` contributes that
      sequence via ``append``, anything else is passed to ``add``.
    * a drag that started in this object's own view: the dragged row is moved
      to the row under the drop position.
    * a drag from another view of the same type: the row is moved inside the
      shared model, or popped from the source model and inserted here.

    Any other drop is only logged through ``D``.
    """
    if ev.mimeData().hasUrls():
        url = str(ev.mimeData().urls()[0].path())
        filename = url
        # The former extension-based trace/sequence branch was unreachable
        # (it followed an unconditional return) and has been removed;
        # bioio.load handles every dropped file.
        obj = bioio.load( filename )
        if hasattr(obj, 'get_sequence'):
            self.model().append( obj.get_sequence() )
        else:
            self.model().add( obj )
        self.model().signals().ContentUpdated.emit()
        return

    elif ev.source() == self._view:
        idx, _ = self._view.xy2coord(0, ev.pos().y())
        seq = self.model()[self._dragidx]
        if idx < self._dragidx:
            # moving up: remove first so the target index stays valid
            self.model().delete(self._dragidx)
            self.model().insert(idx, seq)
        elif idx > self._dragidx:
            # moving down: insert first, the old position is unaffected
            self.model().insert(idx, seq)
            self.model().delete(self._dragidx)

    elif isinstance(ev.source(), type(self._view)):
        src = ev.source()
        idx, _ = self._view.xy2coord(0, ev.pos().y())
        if src.model() == self.model():
            # the same model, then just use model's move method
            if src.dnd()._dragidx != idx:
                self.model().move( src.dnd()._dragidx, idx)
                self.model().signals().ContentUpdated.emit()
        else:
            # different models: take the row out of the source and notify
            # both models
            seq = src.model().pop(src.dnd()._dragidx)
            self.model().insert(idx, seq)
            src.model().signals().ContentUpdated.emit()
            self.model().signals().ContentUpdated.emit()

    else:
        D( ALL, "drop event with unknown type" )
def prepare_submission(args):
    """Build the metadata CSV and FASTA file for a sequence submission.

    Reads the metadata table ``args.metafile`` (``.csv`` or ``.tsv``), the
    tab-separated sample table ``args.infile`` and the sequences in
    ``args.seqfile``.  For every sample of the sample table that has a
    sequence, the metadata row is updated (coverage from ``AVGDEPTH``, ``fn``
    set to the output FASTA name, sequencing technology and assembly method
    from ``args``), the sequence is relabelled with the row's
    ``covv_virus_name`` and leading/trailing ``-`` are stripped.  Metadata of
    the prepared samples is written to ``<args.outprefix>.csv`` and the
    sequences to ``<args.outprefix>.fas``.

    Raises:
        ValueError: ``args.metafile`` ends in neither ``.csv`` nor ``.tsv``.
        KeyError: a sample with a sequence has no metadata row.
    """
    out_metadata = args.outprefix + '.csv'
    out_fasta = args.outprefix + '.fas'

    # open metadata file
    metafile_lower = args.metafile.lower()
    if metafile_lower.endswith('.csv'):
        separator = ','
    elif metafile_lower.endswith('.tsv'):
        separator = '\t'
    else:
        raise ValueError(
            'metadata file must end with .csv or .tsv: %s' % args.metafile)

    cerr(f'[Reading metadata file {args.metafile}]')
    metadata_df = pd.read_table(args.metafile, sep=separator)

    # make sure sequence name is a string (in case the column is
    # automatically converted to number)
    metadata_df['fn'] = metadata_df['fn'].astype('str')
    metadata_df['covv_assembly_method'] = (
        metadata_df['covv_assembly_method'].astype('str'))
    metadata_df.set_index('fn', drop=False, inplace=True)

    # open infile tsv
    cerr(f'[Reading infile {args.infile}]')
    submission_df = pd.read_table(args.infile, sep='\t')

    # check for available field in submission_df
    code_field = 'SAMPLE' if 'SAMPLE' in submission_df.columns else 'fn'
    submission_df[code_field] = submission_df[code_field].astype('str')

    # open sequence file
    cerr(f'[Reading sequence file {args.seqfile}]')
    mseq = bioio.load(args.seqfile)
    mseq_keys = {seq.label: i for i, seq in enumerate(mseq)}

    # iterate over submission_df
    used = []
    for _, s in submission_df.iterrows():
        sample_id = s[code_field]
        if sample_id not in mseq_keys:
            continue

        # look the metadata up only for samples that are actually prepared,
        # and before 'fn' is overwritten below
        r = metadata_df.loc[sample_id]

        cerr(f'[Preparing sample {sample_id}]')

        # set coverage
        metadata_df.at[sample_id, 'covv_coverage'] = s['AVGDEPTH']
        metadata_df.at[sample_id, 'fn'] = out_fasta
        metadata_df.at[sample_id, 'covv_seq_technology'] = args.covv_seq_technology
        metadata_df.at[sample_id, 'covv_assembly_method'] = args.covv_assembly_method

        # set sequence name
        idx = mseq_keys[sample_id]
        mseq[idx].label = r['covv_virus_name']
        mseq[idx].seq = mseq[idx].seq.strip(b'-')
        used.append(sample_id)

        cerr(f'[Finish preparing sample {sample_id}]')

    # remove unused metadata
    metadata_df = metadata_df.loc[used]

    # write to new fasta & metadata file
    metadata_df.to_csv(out_metadata, sep=',', index=False)
    bioio.save(mseq, out_fasta)
def main(args):
    """Condense the sequences of ``args.infile`` into ``args.outfile``.

    The loaded sequences are passed through ``funcs.condensed`` and the
    result is saved.
    """
    mseq = bioio.load(args.infile, options=args.io_opts)
    cout('reading %d sequences from %s' % (len(mseq), args.infile))

    condensed = funcs.condensed(mseq)
    bioio.save(condensed, args.outfile)