def main(args):
    """Translate every sequence of the input files and save the results.

    When ``args.start_sequence`` is set it is first replaced, in place, by
    its upper-cased ASCII-encoded form.  Each input sequence is then
    degapped and upper-cased, the motif is searched in it, and the sequence
    is translated starting at the motif position; sequences where the motif
    is not found exactly once are skipped.  Without a start sequence the
    translation starts at ``args.start_codon``.

    All translated clones are written to ``args.outfile``.
    """
    translated_seqs = bioio.multisequence()

    if args.start_sequence:
        args.start_sequence = args.start_sequence.upper().encode('ASCII')

    for path in args.files:
        loaded = bioio.load(path, options=args.io_opts)
        cout('reading %d sequences from %s' % (len(loaded), path))

        for source in loaded:
            protein = source.clone()

            if not args.start_sequence:
                # fixed reading frame given on the command line
                protein.set_sequence(
                    funcs.translated(source, start_pos=args.start_codon))
                translated_seqs.append(protein)
                continue

            # the restriction-site search is reused to locate the motif
            target = funcs.uppercased(funcs.degapped(source))
            hits = funcs.search_restriction_site(target, args.start_sequence)
            if len(hits) != 1:
                # motif missing or ambiguous: leave this sequence out
                continue

            offset = hits[0][0]
            print(target[offset:offset + 30])
            protein.set_sequence(
                funcs.translated(target, start_pos=offset + 1))
            translated_seqs.append(protein)

    bioio.save(translated_seqs, args.outfile)
def count_allele(mseqs):
    """Count consensus / non-consensus alleles per site for each group.

    Args:
        mseqs: mapping of group key -> collection of sequences; every
            sequence is indexed up to the length of the consensus.

    Returns:
        dict mapping each group key to a float array of shape
        ``(len(consensus), 2)``.  Column 0 counts the consensus allele and
        column 1 the alternative allele.  A base equal to the consensus adds
        2 to column 0, any other base adds 2 to column 1, ``N`` adds 1 to
        both columns and ``X`` is ignored.
    """
    full_mseqs = multisequence()
    for grp in mseqs:
        full_mseqs.extend(mseqs[grp])

    na_profiles = profiles.na_profile(full_mseqs, additional_ignore=b'X')
    consensus_seq = na_profiles.consensus(0.1)
    n_sites = len(consensus_seq)

    # Indexing a bytes-like object yields an int on Python 3, so a plain
    # comparison against b'N' / b'X' would never match; accept both forms.
    ambiguous = (b'N', ord('N'))
    ignored = (b'X', ord('X'))

    allele_counts = {}
    for grp in mseqs:
        allele_count = np.zeros((n_sites, 2))
        mseq = mseqs[grp]
        for j in range(len(mseq)):
            seq = mseq[j].seq
            for i in range(n_sites):
                base = seq[i]
                if base in ambiguous:
                    allele_count[i, 0] += 1
                    allele_count[i, 1] += 1
                elif base in ignored:
                    continue
                elif base == consensus_seq[i]:
                    allele_count[i, 0] += 2
                else:
                    allele_count[i, 1] += 2
        allele_counts[grp] = allele_count

    return allele_counts
def copy_to_msa(self):
    """Return a new multisequence with the selected model entries.

    The indices stored in ``self._index_selections`` are visited in
    ascending order and the matching entries of ``self.model()`` are
    appended to the result.
    """
    selected = bioio.multisequence()
    for position in sorted(self._index_selections):
        selected.append(self.model()[position])
    return selected
def to_genotype_array(mseqs):
    """Build a site x sample x 2 genotype array from grouped sequences.

    The genotype of each sample at each site is encoded as a pair:
    ``[0, 0]`` when the base equals the consensus (major allele),
    ``[1, 1]`` for any other base (minor allele), ``[0, 1]`` for ``N`` and
    ``[-1, -1]`` for ``X``.

    Args:
        mseqs: mapping of group key -> collection of sequences.

    Returns:
        tuple ``(genotype_array, indexes)`` where ``genotype_array`` is an
        int array of shape ``(n_sites, n_samples, 2)`` and ``indexes`` maps
        each group key to the list of sample positions (second axis) that
        belong to the group.
    """
    # pool all groups, remembering which sample positions belong to which
    full_mseqs = multisequence()
    indexes = {}
    idx = 0
    for grp in mseqs:
        group = mseqs[grp]
        full_mseqs.extend(group)
        indexes[grp] = list(range(idx, idx + len(group)))
        idx += len(group)

    na_profiles = profiles.na_profile(full_mseqs, additional_ignore=b'X')
    consensus_seq = na_profiles.consensus(0.1)
    n_sites = len(consensus_seq)
    n_samples = len(full_mseqs)

    genotype_array = np.zeros((n_sites, n_samples, 2), dtype=int)

    # Indexing a bytes-like object yields an int on Python 3, so accept
    # both the one-byte and the integer form of the special symbols.
    ambiguous = (b'N', ord('N'))
    ignored = (b'X', ord('X'))

    for i in range(n_samples):
        # read the i-th sample (the previous code always read sample 0)
        seq = full_mseqs[i].seq
        for j in range(n_sites):
            base = seq[j]
            # assign the genotype (the previous code compared with `==`,
            # leaving the array filled with zeros)
            if base in ambiguous:
                genotype_array[j, i] = [0, 1]
            elif base in ignored:
                genotype_array[j, i] = [-1, -1]
            elif base == consensus_seq[j]:
                genotype_array[j, i] = [0, 0]
            else:
                genotype_array[j, i] = [1, 1]

    return genotype_array, indexes
def main(args):
    """Relabel sequences with a running number and write a lookup table.

    Every sequence of every input file (each file sorted by label) gets a
    zero-padded running number as its new label, and its sequence is
    upper-cased.  The original label, the ``collection_date``, ``country``
    and ``isolate`` attributes and the definition are written as a
    tab-separated table to ``args.tabfile``; the relabelled sequences are
    saved to ``args.outfile``.
    """
    tables = []
    container = bioio.multisequence()
    n = 1

    for infile in args.files:
        mseqs = bioio.load(infile)
        mseqs.sort(lambda x: x.label)
        for s in mseqs:
            tables.append((
                n,
                s.label,
                s.attr.get('collection_date', ''),
                s.attr.get('country', ''),
                s.attr.get('isolate', ''),
                s.definition,
            ))
            container.append(bioio.biosequence('%04d' % n, s.seq.upper()))
            n += 1

    tables.sort()

    # the context manager closes the table even when a write fails
    with open(args.tabfile, 'w') as tabfile:
        tabfile.write('LABEL\tACCNO\tDATE\tCOUNTRY\tISOLATE\tDEFINITION\n')
        for r in tables:
            tabfile.write('%04d\t%s\t%s\t%s\t%s\t%s\n' % r)

    bioio.save(container, args.outfile)
def __init__(self, dna_msa, start_atg, msa_signals=None):
    """Wrap an empty multisequence and fill it by translating ``dna_msa``.

    ``dna_msa`` and ``start_atg`` are stored on the instance, ``_na`` is
    set to False, and ``retranslate()`` is called last so it sees the fully
    initialised state.  ``msa_signals`` is accepted but not used here.
    """
    super(TranslatedMSA, self).__init__(bioio.multisequence())

    self._na = False
    self._start_atg = start_atg
    self._src_msa = dna_msa

    self.retranslate()
def seq2fst(args):
    """Compute FST between sample groups and write the result to a file.

    Sequences are read from ``args.infile`` and assigned to groups with the
    group parser (``args.groupfile`` / ``args.metafile`` are required).
    Samples without a group are reported and skipped.  When
    ``args.sitefile`` is given, site-wise FST is written there and the
    function returns; otherwise the group-wise FST matrix is written to
    ``args.outfile``.
    """
    # read the sequence file
    cerr('[I - reading sequence file %s]' % args.infile)
    seqs = load(args.infile)

    # group information is mandatory for FST
    if args.groupfile or args.metafile:
        cerr('[I - reading group information file]')
        group_parser = grpparser.GroupParser(args)
        group_parser.parse()

        group_seqs = {}
        for seq in seqs:
            sample = seq.label.decode('ASCII')
            try:
                grp = group_parser.group_info[sample]
            except KeyError:
                cerr('[W - sample %s is not assign to any group]' % sample)
                continue

            if grp not in group_seqs:
                group_seqs[grp] = multisequence()
            group_seqs[grp].append(seq)
    else:
        cexit('[ERR - seq2fst.py requires group information!]')

    for name, members in group_seqs.items():
        cerr('[I - group %s has %d sample(s)]' % (name, len(members)))

    if args.sitefile:
        # site-wise FST: one row per entry, label first, values tab-separated
        FST_sites = calc_site_fst(group_seqs, args.nantozero)
        with open(args.sitefile, 'w') as fout:
            for label, mat in FST_sites:
                fout.write(label)
                fout.write('\t')
                np.savetxt(fout, mat, fmt='%5.4f', delimiter='\t',
                           newline='\t')
                fout.write('\n')
        cerr('[I - site FST written to %s]' % (args.sitefile))
        return

    # group-wise FST matrix with a header line of group names
    FST_mat, groups = calc_fst(group_seqs)
    with open(args.outfile, 'w') as fout:
        fout.write('\t'.join(groups))
        fout.write('\n')
        np.savetxt(fout, FST_mat, fmt='%5.4f', delimiter='\t')
def copy_to_msa(self):
    """Return a new multisequence holding only the selected segments.

    For every sequence of ``self.model()`` a clone is made whose sequence
    is the concatenation of the slices described by
    ``self.normalize_position()``.  The value returned by
    ``set_sequence()`` is what gets appended to the result.
    """
    segments = self.normalize_position()
    src_mseqs = self.model()
    dest_mseqs = bioio.multisequence()

    for source in src_mseqs:
        duplicate = source.clone()
        # NOTE(review): each segment is unpacked as (upper, lower) and sliced
        # [lower:upper + 1] -- confirm normalize_position() yields the pair
        # in that order.
        pieces = [source[lower:upper + 1] for (upper, lower) in segments]
        dest_mseqs.append(duplicate.set_sequence(b''.join(pieces)))

    return dest_mseqs
def concat_sequencs( multiseqs ): seqnames = {} new_mseq = bioio.multisequence() for seq in multiseqs[0]: s = seq.clone() s.set_sequence( s.get_sequence() ) new_mseq.append( s ) seqnames[s.get_label()] = True for multiseq in multiseqs[1:]: label = s.get_label() if
def main(args):
    """Merge, annotate, filter, sort and optionally summarise sequences.

    All files in ``args.files`` are loaded and merged into one container.
    Attributes are appended, labels are optionally replaced by accession
    numbers (``args.accno``) and gaps optionally removed (``args.degap``).
    Sequences are then filtered by ``args.minlen``, ``args.maxlen`` and the
    maximum fraction of ``N`` (``args.maxN``); a filter is active only when
    its value is greater than zero.  The container can be sorted by length
    (descending) or by label, a per-sequence base count can be printed
    (``args.summary``), and the result is saved when ``args.outfile`` is
    set.
    """
    container = None
    for infile in args.files:
        obj = bioio.load(infile, options=args.io_opts or [])
        cout('reading %d sequences from %s' % (len(obj), infile))
        if container is None:
            container = obj
        else:
            container += obj

    append_attributes(container, args.src, args.src_isolate, args.definition)

    if args.accno:
        set_label_to_accno(container)

    if args.degap:
        container.degap()

    if args.minlen > 0 or args.maxlen > 0 or args.maxN > 0:
        new_container = bioio.multisequence()
        for s in container:
            length = len(s)
            if args.minlen > 0 and length < args.minlen:
                continue
            if args.maxlen > 0 and length > args.maxlen:
                continue
            # an empty sequence (e.g. all gaps before degapping) has no N
            # fraction; skip the ratio instead of dividing by zero
            if (args.maxN > 0 and length > 0
                    and s.seq.count(b'N') / length > args.maxN):
                continue
            new_container.append(s)
        container = new_container

    if args.sort:
        if args.sort.startswith('len'):
            container.sort(lambda x: len(x), reverse=True)
        elif args.sort.startswith('lab'):
            container.sort(lambda x: x.label)

    if args.summary:
        for s in container:
            seq = s.seq.upper()
            print(">%s\nA:%d\tC:%d\tG:%d\tT:%d\t-:%d" %
                  (s.label.decode('ASCII'), seq.count(b'A'), seq.count(b'C'),
                   seq.count(b'G'), seq.count(b'T'), seq.count(b'-')))

    if args.outfile:
        bioio.save(container, args.outfile, options=args.io_opts or [])
def main(args):
    """Translate all sequences of the input files and save them.

    Each sequence is cloned, the clone's sequence is replaced by the
    translation starting at ``args.start_codon``, and all clones are
    written to ``args.outfile``.
    """
    translated_seqs = bioio.multisequence()

    for path in args.files:
        loaded = bioio.load(path, options=args.io_opts)
        cout('reading %d sequences from %s' % (len(loaded), path))

        for source in loaded:
            protein = source.clone()
            translation = funcs.translated(source, start_pos=args.start_codon)
            protein.set_sequence(translation)
            translated_seqs.append(protein)

    bioio.save(translated_seqs, args.outfile)
def dereplicate(mseq):
    """Collapse identical sequences into one entry with a joined label.

    Sequences are keyed by ``str(s.seq)``.  For each distinct key the first
    sequence seen is kept, and the labels of all sequences sharing that key
    are joined with ``'#'`` (in order of appearance) to form the new label.

    Returns a multisequence with one biosequence per distinct sequence.
    """
    from seqpy.core.bioio import biosequence, multisequence

    # key -> (representative sequence, labels of all identical sequences)
    groups = {}
    for s in mseq:
        key = str(s.seq)
        entry = groups.get(key)
        if entry is None:
            groups[key] = (s.seq, [s.label])
        else:
            entry[1].append(s.label)

    dedupseqs = multisequence()
    for sequence, labels in groups.values():
        dedupseqs.append(biosequence('#'.join(labels), sequence))

    return dedupseqs
def gather_consensus(args):
    """Collect per-sample consensus sequences and statistics in one place.

    Every sub-directory of ``args.indir`` (in sorted order) is expected to
    hold a consensus file ``args.consfile`` and a statistics file
    ``args.statfile``.  The first sequence of each consensus file and the
    second line of each statistics file are gathered; sub-directories
    without a consensus file are reported and skipped.  Sequences from
    ``args.add``, when given, are placed first.

    The results are written to ``consensus.fas`` and ``stats.tsv`` inside
    ``args.outdir``, which defaults to ``args.indir + '-results'``.
    """
    # set output directory
    if not args.outdir:
        args.outdir = args.indir + '-results'

    cons = multisequence()
    header = None
    stat_lines = []

    if args.add:
        seqs = load(args.add)
        cons.extend(seqs)

    for indir in sorted(os.listdir(args.indir)):
        seqpath = os.path.join(args.indir, indir, args.consfile)
        print(args.indir, indir, args.consfile, seqpath)
        try:
            seqs = load(seqpath)
        except FileNotFoundError:
            cerr('[WARN: no such file: %s]' % (seqpath))
            continue
        cons.append(seqs[0])

        statpath = os.path.join(args.indir, indir, args.statfile)
        with open(statpath) as fin:
            lines = fin.read().split('\n')
        # the header is taken from the first statistics file only
        if not header:
            header = lines[0].strip()
        stat_lines.append(lines[1].strip())

    # an existing directory is fine; any other failure (e.g. permissions)
    # is raised here instead of being silently swallowed
    os.makedirs(args.outdir, exist_ok=True)

    save(cons, os.path.join(args.outdir, 'consensus.fas'))
    with open(os.path.join(args.outdir, 'stats.tsv'), 'w') as fout:
        # header stays None when no result directory was found
        if header is not None:
            fout.write(header)
            fout.write('\n')
        fout.write('\n'.join(stat_lines))

    cerr(f'[Writing results to directory {args.outdir}]')
def main(args):
    """Quality-trim trace files and save the trimmed bases as sequences.

    Each file in ``args.files`` is loaded as a trace and trimmed with
    ``traceutils.trim`` using ``args.winsize`` and ``args.qual_threshold``.
    Files for which trimming returns a falsy result are skipped.  The
    trimmed bases become a sequence labelled with the file name, annotated
    with the ``upstream_trim`` and ``downstream_trim`` values, and all
    sequences are written to ``args.outfile``.
    """
    trimmed_seqs = bioio.multisequence()

    for path in args.files:
        outcome = traceutils.trim(bioio.load(path), args.winsize,
                                  args.qual_threshold)
        if outcome:
            bases, _quals, upstream, downstream = outcome
            record = bioio.biosequence(path, bases)
            for key, value in (('upstream_trim', upstream),
                               ('downstream_trim', downstream)):
                record.add_attr(key, str(value))
            trimmed_seqs.append(record)

    bioio.save(trimmed_seqs, args.outfile)
def seq2pi(args):
    """Report nucleotide diversity (pi) for each group of samples.

    Sequences are read from ``args.infile``.  When ``args.groupfile`` or
    ``args.metafile`` is given the samples are split into groups with the
    group parser (samples without a group are reported and skipped);
    otherwise all sequences form a single group named ``'ALL'``.  For each
    group the values returned by ``calc_pi`` are printed and, when
    ``args.outfile`` is set, also written as a tab-separated table.
    """
    # read the sequence file
    cerr('[I - reading sequence file %s]' % args.infile)
    seqs = load(args.infile)

    # split into groups when group information is available
    if args.groupfile or args.metafile:
        cerr('[I - reading group information file]')
        group_parser = grpparser.GroupParser(args)
        group_parser.parse()

        group_seqs = {}
        for seq in seqs:
            try:
                grp = group_parser.group_info[seq.label.decode('ASCII')]
            except KeyError:
                cerr('[W - sample %s is not assign to any group]'
                     % seq.label.decode('ASCII'))
                continue
            if grp in group_seqs:
                group_seqs[grp].append(seq)
            else:
                ms = multisequence()
                ms.append(seq)
                group_seqs[grp] = ms
    else:
        group_seqs = {'ALL': seqs}

    print('Groups:')
    outf = open(args.outfile, 'w') if args.outfile else None
    try:
        if outf:
            outf.write('GROUP\tN\tPI\tSTDDEV\n')
        for g in group_seqs:
            avg, stddev = calc_pi(group_seqs[g])
            cout(' %20s [%3d]: %f +- %f'
                 % (g, len(group_seqs[g]), avg, stddev))
            if outf:
                outf.write('%s\t%d\t%5.4f\t%5.4f\n'
                           % (g, len(group_seqs[g]), avg, stddev))
    finally:
        # the file used to be left open; close it even if calc_pi fails
        if outf:
            outf.close()

    if outf:
        cerr('[I - result written to %s]' % args.outfile)
def main(args):
    """Recircularize sequences against a reference and save the result.

    Sequences are read from ``args.infile``.  A sequence longer than
    ``args.minlen`` (when ``args.minlen`` is positive) is passed through
    ``recircularize_sequence`` together with the first sequence of
    ``args.reffile``; every other sequence is copied unchanged.  All
    resulting clones are written to ``args.outfile``.
    """
    circseqs = bioio.multisequence()
    queries = bioio.load(args.infile, options=args.io_opts)
    references = bioio.load(args.reffile)

    for query in queries:
        clone = query.clone()
        if args.minlen > 0 and len(query) > args.minlen:
            print('seq:', clone.label)
            new_sequence = recircularize_sequence(
                query.seq, references[0].seq,
                max_mismatch=args.max_mismatch)
        else:
            new_sequence = query.seq
        clone.set_sequence(new_sequence)
        circseqs.append(clone)

    bioio.save(circseqs, args.outfile)
def main(args):
    """Collect the sequences closest to reference taxa in a tree.

    The Newick tree in ``args.treefile`` is read and its phylogenetic
    distance matrix computed.  When ``args.collect`` is positive, every
    tree taxon whose label is found in ``args.reffile`` is a reference
    taxon; for each of them the ``args.collect`` taxa with the smallest
    distance (and the reference taxon itself) are collected.  The sequences
    of the collected taxa are looked up by label in ``args.dbfile`` and
    saved to ``args.outfile``.
    """
    import dendropy

    tree = dendropy.Tree.get(path=args.treefile, schema="newick")
    pdc = tree.phylogenetic_distance_matrix()
    cerr('Reading: %d taxa' % len(tree.taxon_namespace))

    if args.collect > 0:
        ref_seqs = bioio.load(args.reffile)
        ref_taxa = []
        for taxon in tree.taxon_namespace:
            # identity test: does not depend on how sequences compare
            if ref_seqs.get_by_label(taxon.label) is not None:
                print('appended')
                ref_taxa.append(taxon)
        cerr('Referenced: %d taxa' % len(ref_taxa))

        collected_taxa = set()
        for t1 in ref_taxa:
            d = [(pdc(t1, t2), t2) for t2 in tree.taxon_namespace[:-1]]
            d.sort()
            # slicing tolerates args.collect larger than the number of
            # candidates, where indexing d[i] raised IndexError
            for _distance, neighbour in d[:args.collect]:
                collected_taxa.add(neighbour)
            collected_taxa.add(t1)
        cerr('Collected: %d taxa' % len(collected_taxa))

        db_seqs = bioio.load(args.dbfile)
        mseq = bioio.multisequence()
        for taxon in collected_taxa:
            mseq.append(db_seqs.get_by_label(taxon.label))
        bioio.save(mseq, args.outfile)
def main(args):
    """Recircularize one hard-coded contig against a reference.

    Sequences are read from ``args.infile`` but only the one whose label
    equals the hard-coded name below is processed.  A warning is emitted
    when it is shorter than the first reference sequence.  If it is longer
    than a positive ``args.minlen`` it is passed through
    ``recircularize_sequence``; otherwise it is copied unchanged.  The
    result is written to ``args.outfile``.
    """
    # NOTE(review): hard-coded label filter, looks like a debugging
    # leftover -- confirm whether it should be removed or made an option.
    wanted_label = 'NODE_2_length_4501_cov_41.785'

    circseqs = bioio.multisequence()
    queries = bioio.load(args.infile, options=args.io_opts)
    references = bioio.load(args.reffile)

    for query in queries:
        if query.label != wanted_label:
            continue

        if len(query) < len(references[0]):
            cerr('WARNING: %s is shorter than reference' % query.label)

        clone = query.clone()
        if args.minlen > 0 and len(query) > args.minlen:
            print('seq:', clone.label)
            new_sequence = recircularize_sequence(
                query.seq, references[0].seq,
                max_mismatch=args.max_mismatch)
        else:
            new_sequence = query.seq
        clone.set_sequence(new_sequence)
        circseqs.append(clone)

    bioio.save(circseqs, args.outfile)
def init_params(self):
    """Initialise instance state.

    Sets ``self.chr_used`` to a ``defaultdict(int)`` and ``self.mseq`` to
    an empty multisequence.
    """
    self.chr_used = defaultdict(int)
    self.mseq = bioio.multisequence()
def file_new(self):
    """Create an empty multisequence named 'untitled' and pass it to
    ``self.view()``."""
    blank = bioio.multisequence()
    blank.set_filename('untitled')
    self.view(blank)