def main(args):
    """Merge the input files and relabel every sequence with a running number.

    Every file in ``args.files`` is loaded with ``bioio.load`` (passing
    ``args.io_opts`` or an empty list as options) and the results are
    combined in order with ``+=``.  Each sequence label is then replaced by
    a zero-padded, four-digit counter starting at ``0001``.

    Outputs, each skipped when its argument is falsy:

    * ``args.outfile`` -- the relabelled container, written by ``bioio.save``.
    * ``args.tabfile`` -- one ``new_label<TAB>old_label`` line per sequence.
    """
    merged = None
    for path in args.files:
        loaded = bioio.load(path, options=args.io_opts or [])
        cout('reading %d sequences from %s' % (len(loaded), path))
        if merged is not None:
            merged += loaded
        else:
            merged = loaded

    # Relabel in place, remembering (new label, original label) pairs.
    label_map = []
    for number, sequence in enumerate(merged, 1):
        renamed = '%04d' % number
        label_map.append((renamed, sequence.label))
        sequence.label = renamed

    if args.outfile:
        bioio.save(merged, args.outfile, options=args.io_opts or [])

    if args.tabfile:
        with open(args.tabfile, 'w') as table:
            for pair in label_map:
                table.write('%s\t%s\n' % pair)
def main(args):
    """Renumber sequences from several files and write a metadata table.

    Each file in ``args.files`` is loaded and sorted by label.  Every
    sequence gets a running number (starting at 1, continuing across
    files); a new ``bioio.biosequence`` labelled with the zero-padded
    number and carrying the upper-cased sequence is added to the output
    container.

    Outputs:

    * ``args.tabfile`` -- tab-separated table with a header row and one row
      per sequence: number, original label, and the ``collection_date``,
      ``country`` and ``isolate`` attributes (empty string when absent),
      followed by the sequence definition.
    * ``args.outfile`` -- the renumbered sequences, written by ``bioio.save``.
    """
    rows = []
    container = bioio.multisequence()
    n = 1
    for infile in args.files:
        mseqs = bioio.load(infile)
        mseqs.sort(lambda x: x.label)
        for s in mseqs:
            rows.append((
                n,
                s.label,
                s.attr.get('collection_date', ''),
                s.attr.get('country', ''),
                s.attr.get('isolate', ''),
                s.definition,
            ))
            container.append(bioio.biosequence('%04d' % n, s.seq.upper()))
            n += 1

    # write to output file; ``with`` closes the handle even if a write fails.
    # Rows are appended with a strictly increasing ``n`` as first field, so
    # they are already in the order a sort would produce.
    with open(args.tabfile, 'w') as tabfile:
        tabfile.write('LABEL\tACCNO\tDATE\tCOUNTRY\tISOLATE\tDEFINITION\n')
        for r in rows:
            tabfile.write('%04d\t%s\t%s\t%s\t%s\t%s\n' % r)

    bioio.save(container, args.outfile)
def main(args):
    """Translate the sequences of all input files and save the result.

    When ``args.start_sequence`` is set it is upper-cased, encoded to ASCII
    bytes and stored back on ``args``.  Each sequence is then degapped and
    upper-cased, and ``funcs.search_restriction_site`` is used to find the
    motif.  Sequences without exactly one hit are dropped; for the others
    the 30 characters starting at the hit are printed and translation
    starts at the hit offset plus one.

    Without ``args.start_sequence`` every sequence is translated starting
    at ``args.start_codon``.

    The translated clones are written to ``args.outfile``.
    """
    translated_seqs = bioio.multisequence()

    if args.start_sequence:
        args.start_sequence = args.start_sequence.upper().encode('ASCII')

    for path in args.files:
        loaded = bioio.load(path, options=args.io_opts)
        cout('reading %d sequences from %s' % (len(loaded), path))

        for record in loaded:
            protein = record.clone()

            if not args.start_sequence:
                protein.set_sequence(
                    funcs.translated(record, start_pos=args.start_codon))
                translated_seqs.append(protein)
                continue

            # the restriction-site search doubles as a motif locator
            cleaned = funcs.uppercased(funcs.degapped(record))
            hits = funcs.search_restriction_site(cleaned, args.start_sequence)
            if len(hits) != 1:
                # motif missing or ambiguous: skip this sequence
                continue

            offset = hits[0][0]
            print(cleaned[offset:offset + 30])
            # presumably start_pos is 1-based, hence the + 1 -- TODO confirm
            protein.set_sequence(
                funcs.translated(cleaned, start_pos=offset + 1))
            translated_seqs.append(protein)

    bioio.save(translated_seqs, args.outfile)
def main(args):
    """Condense the sequences of ``args.infile`` and save them.

    The input is loaded with ``args.io_opts``, passed through
    ``funcs.condensed`` and written to ``args.outfile``.  When
    ``args.report`` is truthy the condensed set is also handed to
    ``write_report`` together with that value.
    """
    loaded = bioio.load(args.infile, options=args.io_opts)
    cout('reading %d sequences from %s' % (len(loaded), args.infile))

    condensed = funcs.condensed(loaded)
    bioio.save(condensed, args.outfile)

    if not args.report:
        return
    write_report(condensed, args.report)
def vcf2seq(args):
    """Convert a VCF file to a multisequence and save it.

    A ``VCF2SeqHelper`` is built from ``args.vcffile``, ``args.chr`` and a
    fixed option prefix with ``args.opts`` appended.  After parsing, the
    per-key counts in the helper's ``chr_used`` mapping are reported and the
    resulting multisequence is written to ``args.outfile``.
    """
    vcf_path = args.vcffile
    chromosome = args.chr
    option_spec = (
        'NoIndel,LowQual,MissingThreshold=0.05,HetThreshold=0.25,' + args.opts
    )

    helper = VCF2SeqHelper(vcf_path, chromosome, option_spec)
    helper.parse()
    sequences = helper.get_multisequence()

    cout('Report:')
    for name, count in helper.chr_used.items():
        cout(' %s\t%d' % (name, count))

    cout('Writing to %s' % args.outfile)
    bioio.save(sequences, args.outfile)
def main(args):
    """Merge, annotate, filter, sort, summarise and save sequences.

    Steps, in order:

    1. Load every file in ``args.files`` and merge them with ``+=``.
    2. Call ``append_attributes`` with ``args.src``, ``args.src_isolate``
       and ``args.definition``.
    3. If ``args.accno`` is set, call ``set_label_to_accno``.
    4. If ``args.degap`` is set, degap the container.
    5. If any of ``args.minlen``, ``args.maxlen`` or ``args.maxN`` is
       positive, keep only sequences that are not shorter than ``minlen``,
       not longer than ``maxlen`` and whose fraction of ``N`` bytes does not
       exceed ``maxN`` (each check applies only when its limit is positive).
    6. If ``args.sort`` starts with ``len`` sort by length, longest first;
       if it starts with ``lab`` sort by label.
    7. If ``args.summary`` is set, print A/C/G/T/gap counts per sequence.
    8. If ``args.outfile`` is set, save the container there.
    """
    container = None
    for infile in args.files:
        obj = bioio.load(infile, options=args.io_opts or [])
        cout('reading %d sequences from %s' % (len(obj), infile))
        if container is None:
            container = obj
        else:
            container += obj

    append_attributes(container, args.src, args.src_isolate, args.definition)

    if args.accno:
        set_label_to_accno(container)

    if args.degap:
        container.degap()

    if args.minlen > 0 or args.maxlen > 0 or args.maxN > 0:
        new_container = bioio.multisequence()
        for s in container:
            seqlen = len(s)
            if args.minlen > 0 and seqlen < args.minlen:
                continue
            if args.maxlen > 0 and seqlen > args.maxlen:
                continue
            # An empty sequence has no N fraction; skip the ratio instead of
            # dividing by zero (reachable when no minimum length is set).
            if (args.maxN > 0 and seqlen > 0
                    and s.seq.count(b'N') / seqlen > args.maxN):
                continue
            new_container.append(s)
        container = new_container

    if args.sort:
        if args.sort.startswith('len'):
            container.sort(len, reverse=True)
        elif args.sort.startswith('lab'):
            container.sort(lambda x: x.label)

    if args.summary:
        for s in container:
            seq = s.seq.upper()
            print(">%s\nA:%d\tC:%d\tG:%d\tT:%d\t-:%d" % (
                s.label.decode('ASCII'),
                seq.count(b'A'),
                seq.count(b'C'),
                seq.count(b'G'),
                seq.count(b'T'),
                seq.count(b'-'),
            ))

    if args.outfile:
        bioio.save(container, args.outfile, options=args.io_opts or [])
def main(args):
    """Translate every sequence of the input files and save the result.

    Each file in ``args.files`` is loaded with ``args.io_opts``.  Every
    sequence is cloned, and the clone's sequence is replaced by the output
    of ``funcs.translated`` called with ``start_pos=args.start_codon``.
    All clones are written to ``args.outfile``.
    """
    translated_seqs = bioio.multisequence()

    for path in args.files:
        loaded = bioio.load(path, options=args.io_opts)
        cout('reading %d sequences from %s' % (len(loaded), path))

        for record in loaded:
            protein = record.clone()
            residues = funcs.translated(record, start_pos=args.start_codon)
            protein.set_sequence(residues)
            translated_seqs.append(protein)

    bioio.save(translated_seqs, args.outfile)
def gather_consensus(args):
    """Collect per-sample consensus sequences and statistics into one directory.

    Every entry of ``args.indir`` (visited in sorted order) is expected to
    contain ``args.consfile`` and ``args.statfile``.  The first sequence of
    each consensus file is appended to the output set, and the second line
    of each stats file is collected; the first line of the first stats file
    read is used as the shared header.  Entries whose consensus file is
    missing are reported with a warning and skipped.

    If ``args.add`` is set, the sequences of that file are placed in the
    output set before the per-sample ones.

    Results are written to ``args.outdir`` (defaulting to
    ``args.indir + '-results'``, which is also stored back on ``args``) as
    ``consensus.fas`` and ``stats.tsv``.
    """
    # set output directory
    if not args.outdir:
        args.outdir = args.indir + '-results'

    cons = multisequence()
    header = None
    stat_lines = []

    if args.add:
        cons.extend(load(args.add))

    for indir in sorted(os.listdir(args.indir)):
        seqpath = os.path.join(args.indir, indir, args.consfile)
        print(args.indir, indir, args.consfile, seqpath)
        try:
            seqs = load(seqpath)
        except FileNotFoundError:
            cerr('[WARN: no such file: %s]' % (seqpath))
            continue
        cons.append(seqs[0])

        statpath = os.path.join(args.indir, indir, args.statfile)
        with open(statpath) as fin:
            lines = fin.read().split('\n')
        if not header:
            header = lines[0].strip()
        stat_lines.append(lines[1].strip())

    # An already existing directory is fine; any other failure (permissions,
    # a regular file in the way) is raised here instead of being swallowed
    # and resurfacing later as a confusing write error.
    os.makedirs(args.outdir, exist_ok=True)

    save(cons, os.path.join(args.outdir, 'consensus.fas'))
    with open(os.path.join(args.outdir, 'stats.tsv'), 'w') as fout:
        # header stays None when no stats file was read at all
        if header is not None:
            fout.write(header)
            fout.write('\n')
        fout.write('\n'.join(stat_lines))
    cerr(f'[Writing results to directory {args.outdir}]')
def main(args):
    """Trim every trace file and save the trimmed reads as sequences.

    Each file in ``args.files`` is loaded and passed to ``traceutils.trim``
    with ``args.winsize`` and ``args.qual_threshold``.  Files for which the
    trim call returns a falsy result are skipped.  Otherwise a sequence
    labelled with the file name is created from the returned bases, and the
    returned upstream/downstream trim values are stored on it as string
    attributes.  All sequences are written to ``args.outfile``.
    """
    trimmed_reads = bioio.multisequence()

    for path in args.files:
        trace = bioio.load(path)
        outcome = traceutils.trim(trace, args.winsize, args.qual_threshold)
        if outcome:
            # the returned quality values are not used here
            bases, _quals, trimmed_up, trimmed_down = outcome
            read = bioio.biosequence(path, bases)
            read.add_attr('upstream_trim', str(trimmed_up))
            read.add_attr('downstream_trim', str(trimmed_down))
            trimmed_reads.append(read)

    bioio.save(trimmed_reads, args.outfile)
def main(args):
    """Recircularize sufficiently long sequences against a reference.

    Sequences from ``args.infile`` are cloned one by one.  When
    ``args.minlen`` is positive and the sequence is longer than it, the
    clone's sequence is replaced by the output of
    ``recircularize_sequence`` called with the sequence, the first sequence
    of ``args.reffile`` and ``args.max_mismatch``; the label is printed
    first.  All other clones keep the original sequence.  The clones are
    written to ``args.outfile``.
    """
    output_seqs = bioio.multisequence()
    candidates = bioio.load(args.infile, options=args.io_opts)
    reference = bioio.load(args.reffile)

    for record in candidates:
        rotated = record.clone()
        qualifies = args.minlen > 0 and len(record) > args.minlen

        if qualifies:
            print('seq:', rotated.label)
            new_sequence = recircularize_sequence(
                record.seq, reference[0].seq,
                max_mismatch=args.max_mismatch)
        else:
            new_sequence = record.seq

        rotated.set_sequence(new_sequence)
        output_seqs.append(rotated)

    bioio.save(output_seqs, args.outfile)
def main(args):
    """Collect the tree neighbours of reference taxa and save their sequences.

    The Newick tree in ``args.treefile`` is read with dendropy and its
    phylogenetic distance matrix is computed.  When ``args.collect`` is
    positive:

    * taxa whose label is found in ``args.reffile`` become reference taxa;
    * for each reference taxon, the ``args.collect`` entries with the
      smallest distance to it are collected, plus the reference taxon
      itself;
    * the sequences of all collected taxa are looked up by label in
      ``args.dbfile`` and written to ``args.outfile``.

    When ``args.collect`` is not positive nothing is collected or written.
    """
    import dendropy

    tree = dendropy.Tree.get(path=args.treefile, schema="newick")
    pdc = tree.phylogenetic_distance_matrix()
    cerr('Reading: %d taxa' % len(tree.taxon_namespace))

    if args.collect > 0:
        ref_seqs = bioio.load(args.reffile)
        ref_taxa = []
        for taxon in tree.taxon_namespace:
            if ref_seqs.get_by_label(taxon.label) is not None:
                print('appended')
                ref_taxa.append(taxon)
        cerr('Referenced: %d taxa' % len(ref_taxa))

        collected_taxa = set()
        for t1 in ref_taxa:
            # NOTE(review): the last taxon of the namespace is excluded from
            # the candidates -- confirm this is intentional.
            d = [(pdc(t1, t2), t2) for t2 in tree.taxon_namespace[:-1]]
            d.sort()
            # Slice instead of indexing so that asking for more neighbours
            # than there are candidates collects all of them rather than
            # raising IndexError.
            for _distance, neighbour in d[:args.collect]:
                collected_taxa.add(neighbour)
            collected_taxa.add(t1)
        cerr('Collected: %d taxa' % len(collected_taxa))

        db_seqs = bioio.load(args.dbfile)
        mseq = bioio.multisequence()
        for taxon in collected_taxa:
            mseq.append(db_seqs.get_by_label(taxon.label))
        bioio.save(mseq, args.outfile)
def main(args):
    """Recircularize one specific contig against a reference.

    Only the sequence of ``args.infile`` whose label equals the hard-coded
    value below is processed; every other sequence is skipped.  A warning
    is emitted when the sequence is shorter than the first sequence of
    ``args.reffile``.  When ``args.minlen`` is positive and the sequence is
    longer than it, the clone's sequence is replaced by the output of
    ``recircularize_sequence`` (called with the sequence, the first
    reference sequence and ``args.max_mismatch``); otherwise the clone
    keeps the original sequence.  The clones are written to
    ``args.outfile``.
    """
    # NOTE(review): this label filter looks like a debugging leftover --
    # confirm whether it should become an option or be removed.
    wanted_label = 'NODE_2_length_4501_cov_41.785'

    output_seqs = bioio.multisequence()
    candidates = bioio.load(args.infile, options=args.io_opts)
    reference = bioio.load(args.reffile)

    for record in candidates:
        if record.label == wanted_label:
            if len(record) < len(reference[0]):
                cerr('WARNING: %s is shorter than reference' % record.label)

            rotated = record.clone()
            qualifies = args.minlen > 0 and len(record) > args.minlen
            if qualifies:
                print('seq:', rotated.label)
                new_sequence = recircularize_sequence(
                    record.seq, reference[0].seq,
                    max_mismatch=args.max_mismatch)
            else:
                new_sequence = record.seq

            rotated.set_sequence(new_sequence)
            output_seqs.append(rotated)

    bioio.save(output_seqs, args.outfile)
def main(args):
    """Append a date fragment from a lookup table to every sequence label.

    ``args.tabfile`` is a tab-separated table whose first row is a header.
    Rows are indexed by their first column.  For each sequence in
    ``args.infile`` the row matching its label is looked up and the
    module-level ``re_date`` pattern is searched in the row's third column;
    the matched text (or ``-`` when there is no match) is appended to the
    label as ``label/match``.  The relabelled sequences are written to
    ``args.outfile``.

    Raises:
        KeyError: if a sequence label has no row in the table.
    """
    # read tables; ``with`` makes sure the handle is closed
    tables = {}
    with open(args.tabfile) as tabfile:
        # skip the header row; the default avoids StopIteration on an
        # empty file
        next(tabfile, None)
        for line in tabfile:
            items = line.strip().split('\t')
            tables[items[0]] = items

    mseq = bioio.load(args.infile)
    for s in mseq:
        rec = tables[s.label]
        # strip() above drops empty trailing columns, so the third column
        # may be missing entirely; treat that as "no date".
        date_field = rec[2] if len(rec) > 2 else ''
        mo = re_date.search(date_field)
        year = mo.group() if mo else '-'
        s.label = '%s/%s' % (s.label, year)

    bioio.save(mseq, args.outfile)
def main(args):
    """Merge input files, annotate the sequences and optionally summarise them.

    Every file in ``args.files`` is loaded (options: ``args.io_opts`` or an
    empty list) and merged with ``+=``.  ``append_attributes`` is then
    called with ``args.src``, ``args.src_isolate`` and ``args.definition``.
    With ``args.summary`` set, the A/C/G/T/gap counts of each upper-cased
    sequence are printed.  With ``args.outfile`` set, the merged container
    is saved there.
    """
    merged = None
    for path in args.files:
        loaded = bioio.load(path, options=args.io_opts or [])
        cout('reading %d sequences from %s' % (len(loaded), path))
        if merged is not None:
            merged += loaded
        else:
            merged = loaded

    append_attributes(merged, args.src, args.src_isolate, args.definition)

    if args.summary:
        summary_format = ">%s\nA:%d\tC:%d\tG:%d\tT:%d\t-:%d"
        for record in merged:
            upper = record.seq.upper()
            name = record.label.decode('ASCII')
            counts = tuple(
                upper.count(symbol)
                for symbol in (b'A', b'C', b'G', b'T', b'-')
            )
            print(summary_format % ((name,) + counts))

    if args.outfile:
        bioio.save(merged, args.outfile, options=args.io_opts or [])
def save(self, filename):
    """Write the object held in ``self._msa`` to *filename* via ``bioio.save``."""
    payload = self._msa
    bioio.save(payload, filename)
def main(args):
    """Condense the sequences of ``args.infile`` and write them to ``args.outfile``.

    The input is loaded with ``args.io_opts`` as options, the number of
    sequences read is reported, and the output of ``funcs.condensed`` is
    saved.
    """
    loaded = bioio.load(args.infile, options=args.io_opts)
    cout('reading %d sequences from %s' % (len(loaded), args.infile))

    condensed = funcs.condensed(loaded)
    bioio.save(condensed, args.outfile)
def prepare_submission(args):
    """Prepare a metadata CSV and a FASTA file for submission.

    Inputs:

    * ``args.metafile`` -- metadata table (``.csv`` or ``.tsv``, matched
      case-insensitively) with at least the columns ``fn``,
      ``covv_assembly_method`` and ``covv_virus_name``; ``fn`` is used as
      the sample key.
    * ``args.infile`` -- tab-separated table with an ``AVGDEPTH`` column and
      a sample column named ``SAMPLE`` (preferred) or ``fn``.
    * ``args.seqfile`` -- sequences whose labels are the sample keys.

    For every sample of ``args.infile`` that has a sequence, the metadata
    row gets its ``covv_coverage`` (from ``AVGDEPTH``), ``fn`` (the output
    FASTA name), ``covv_seq_technology`` and ``covv_assembly_method`` (both
    from ``args``) filled in; the sequence is relabelled with the row's
    ``covv_virus_name`` and has leading/trailing ``-`` bytes stripped.

    Outputs: ``<args.outprefix>.csv`` containing only the metadata rows of
    the prepared samples, and ``<args.outprefix>.fas`` written by
    ``bioio.save`` from the loaded sequence set.

    Raises:
        ValueError: if ``args.metafile`` is neither ``.csv`` nor ``.tsv``.
        KeyError: if a sample that has a sequence has no metadata row.
    """
    out_metadata = args.outprefix + '.csv'
    out_fasta = args.outprefix + '.fas'

    # pick the metadata separator from the file extension
    metafile_lower = args.metafile.lower()
    if metafile_lower.endswith('.csv'):
        separator = ','
    elif metafile_lower.endswith('.tsv'):
        separator = '\t'
    else:
        raise ValueError(
            'unsupported metadata file extension (expected .csv or .tsv): %s'
            % args.metafile)

    cerr(f'[Reading metadata file {args.metafile}]')
    metadata_df = pd.read_table(args.metafile, sep=separator)

    # make sure the sequence name is a string (in case the column was
    # automatically converted to a number)
    metadata_df['fn'] = metadata_df['fn'].astype('str')
    metadata_df['covv_assembly_method'] = (
        metadata_df['covv_assembly_method'].astype('str'))
    metadata_df.set_index('fn', drop=False, inplace=True)

    # open infile tsv
    cerr(f'[Reading infile {args.infile}]')
    submission_df = pd.read_table(args.infile, sep='\t')

    # check for available field in submission_df
    code_field = 'SAMPLE' if 'SAMPLE' in submission_df.columns else 'fn'
    submission_df[code_field] = submission_df[code_field].astype('str')

    # open sequence file and index the sequences by label
    cerr(f'[Reading sequence file {args.seqfile}]')
    mseq = bioio.load(args.seqfile)
    mseq_keys = {seq.label: idx for idx, seq in enumerate(mseq)}

    # iterate over submission_df
    used = []
    for _, s in submission_df.iterrows():
        sample_id = s[code_field]
        if sample_id not in mseq_keys:
            continue
        # Looked up only after the guard above so that samples without a
        # sequence are skipped even when they have no metadata row.
        r = metadata_df.loc[sample_id]

        cerr(f'[Preparing sample {sample_id}]')

        # set coverage and submission-wide fields
        metadata_df.at[sample_id, 'covv_coverage'] = s['AVGDEPTH']
        metadata_df.at[sample_id, 'fn'] = out_fasta
        metadata_df.at[sample_id, 'covv_seq_technology'] = args.covv_seq_technology
        metadata_df.at[sample_id, 'covv_assembly_method'] = args.covv_assembly_method

        # set sequence name
        idx = mseq_keys[sample_id]
        mseq[idx].label = r['covv_virus_name']
        mseq[idx].seq = mseq[idx].seq.strip(b'-')

        used.append(sample_id)
        cerr(f'[Finish preparing sample {sample_id}]')

    # remove unused metadata
    metadata_df = metadata_df.loc[used]

    # write to new fasta & metadata file
    metadata_df.to_csv(out_metadata, sep=',', index=False)
    bioio.save(mseq, out_fasta)