コード例 #1
0
def main(args):
    """Translate the sequences of all input files and save the result.

    If ``args.start_sequence`` is set it is upper-cased and ASCII-encoded
    (the encoded value is written back onto ``args``).  Each sequence is
    then degapped and upper-cased, and ``funcs.search_restriction_site`` is
    used to look for the start sequence; sequences without exactly one hit
    are dropped.  For the others, the 30 items following the hit are printed
    and translation is requested with ``start_pos`` set to the hit offset
    plus one.  Without a start sequence, translation uses
    ``args.start_codon``.  Everything collected goes to ``args.outfile``.
    """
    translated_seqs = bioio.multisequence()

    if args.start_sequence:
        args.start_sequence = args.start_sequence.upper().encode('ASCII')

    for path in args.files:
        loaded = bioio.load(path, options=args.io_opts)
        cout('reading %d sequences from %s' % (len(loaded), path))

        for source in loaded:
            product = source.clone()

            if not args.start_sequence:
                # plain mode: translate from the configured start codon
                product.set_sequence(
                    funcs.translated(source, start_pos=args.start_codon))
                translated_seqs.append(product)
                continue

            # the restriction-site search helper is reused here to locate
            # the start sequence inside the cleaned-up template
            template = funcs.uppercased(funcs.degapped(source))
            hits = funcs.search_restriction_site(template,
                                                 args.start_sequence)
            if len(hits) != 1:
                continue
            offset = hits[0][0]
            print(template[offset:offset + 30])
            product.set_sequence(
                funcs.translated(template, start_pos=offset + 1))
            translated_seqs.append(product)

    bioio.save(translated_seqs, args.outfile)
コード例 #2
0
ファイル: seq2fst.py プロジェクト: trmznt/pys
def count_allele(mseqs):
    """Count consensus and non-consensus alleles per site for each group.

    All groups in ``mseqs`` (a mapping of group key -> multisequence) are
    pooled to build a nucleotide profile (ignoring ``X``) from which a
    consensus sequence is derived.  Each sample then contributes two allele
    counts per site:

    * ``N``  -> one count in each column,
    * ``X``  -> nothing,
    * base equal to the consensus -> two counts in column 0,
    * anything else -> two counts in column 1.

    Returns a dict mapping each group key to a float array of shape
    ``(len(consensus), 2)``.
    """
    full_mseqs = multisequence()
    for grp in mseqs:
        full_mseqs.extend(mseqs[grp])

    na_profiles = profiles.na_profile(full_mseqs, additional_ignore=b'X')
    consensus_seq = na_profiles.consensus(0.1)
    n_sites = len(consensus_seq)

    # Indexing a bytes/bytearray yields an int on Python 3, so comparing the
    # indexed item with b'N' alone can never be true for bytes-like
    # sequences.  Accept both the int code and the one-byte form so the
    # check works whichever of the two the sequence container returns.
    het_codes = (ord('N'), b'N')
    missing_codes = (ord('X'), b'X')

    allele_counts = {}
    for grp in mseqs:
        allele_count = np.zeros((n_sites, 2))
        for sample in mseqs[grp]:
            seq = sample.seq
            for i in range(n_sites):
                base = seq[i]
                if base in het_codes:
                    allele_count[i, 0] += 1
                    allele_count[i, 1] += 1
                elif base in missing_codes:
                    continue
                elif base == consensus_seq[i]:
                    allele_count[i, 0] += 2
                else:
                    allele_count[i, 1] += 2

        allele_counts[grp] = allele_count

    return allele_counts
コード例 #3
0
 def copy_to_msa(self):
     """Return a new multisequence holding the selected model entries.

     The entries are taken from ``self.model()`` in ascending order of the
     indices stored in ``self._index_selections``.
     """
     msa = bioio.multisequence()
     for idx in sorted(self._index_selections):
         msa.append(self.model()[idx])
     return msa
コード例 #4
0
ファイル: seq2fst.py プロジェクト: trmznt/pys
def to_genotype_array(mseqs):
    """Build a site x sample x 2 genotype array from grouped sequences.

    ``mseqs`` maps a group key to a multisequence.  All groups are pooled
    (in iteration order) and a consensus is derived from their nucleotide
    profile (ignoring ``X``).  For every site/sample pair the two allele
    slots are set to:

    * ``[0, 1]``   for ``N``,
    * ``[-1, -1]`` for ``X``,
    * ``[0, 0]``   when the base equals the consensus (major allele),
    * ``[1, 1]``   otherwise (minor allele).

    Returns ``(genotype_array, indexes)`` where ``indexes`` maps each group
    key to the list of sample positions (second axis) belonging to it.
    """
    # create matrix profile first
    full_mseqs = multisequence()
    indexes = {}
    idx = 0
    for grp in mseqs:
        group = mseqs[grp]
        full_mseqs.extend(group)
        indexes[grp] = list(range(idx, idx + len(group)))
        idx += len(group)

    na_profiles = profiles.na_profile(full_mseqs, additional_ignore=b'X')
    consensus_seq = na_profiles.consensus(0.1)
    n_sites = len(consensus_seq)

    # Indexing a bytes/bytearray yields an int on Python 3, so accept both
    # the int code and the one-byte form of the special symbols.
    het_codes = (ord('N'), b'N')
    missing_codes = (ord('X'), b'X')

    genotype_array = np.zeros((n_sites, len(full_mseqs), 2), dtype=int)
    for i, sample in enumerate(full_mseqs):
        # each sample must be read from its own sequence (the previous code
        # always looked at the first one)
        seq = sample.seq
        for j in range(n_sites):
            base = seq[j]
            # these are assignments; the previous `==` only compared and
            # left the array filled with zeros
            if base in het_codes:
                genotype_array[j, i] = [0, 1]
            elif base in missing_codes:
                genotype_array[j, i] = [-1, -1]
            elif base == consensus_seq[j]:
                genotype_array[j, i] = [0, 0]
            else:
                genotype_array[j, i] = [1, 1]

    return genotype_array, indexes
コード例 #5
0
ファイル: gb2table.py プロジェクト: trmznt/seqpy
def main(args):
    """Write a tab-separated metadata table and a renumbered sequence file.

    Every input file in ``args.files`` is loaded and sorted by label.  Each
    sequence gets a running number (continuing across files); its label,
    the ``collection_date`` / ``country`` / ``isolate`` attributes and its
    definition are recorded in the table written to ``args.tabfile``, and
    an upper-cased copy labelled with the zero-padded number is saved to
    ``args.outfile``.
    """
    tables = []
    container = bioio.multisequence()

    n = 1
    for infile in args.files:

        mseqs = bioio.load(infile)

        mseqs.sort(lambda x: x.label)

        for s in mseqs:
            tables.append((
                n,
                s.label,
                s.attr.get('collection_date', ''),
                s.attr.get('country', ''),
                s.attr.get('isolate', ''),
                s.definition,
            ))
            container.append(bioio.biosequence('%04d' % n, s.seq.upper()))
            n += 1

    # write to output file; the context manager closes the table file even
    # when formatting or writing a row fails
    with open(args.tabfile, 'w') as tabfile:
        tabfile.write('LABEL\tACCNO\tDATE\tCOUNTRY\tISOLATE\tDEFINITION\n')
        tables.sort()
        for r in tables:
            tabfile.write('%04d\t%s\t%s\t%s\t%s\t%s\n' % r)

    bioio.save(container, args.outfile)
コード例 #6
0
    def __init__(self, dna_msa, start_atg, msa_signals=None):
        """Wrap ``dna_msa`` and fill this object by calling ``retranslate()``.

        The parent initialiser receives a fresh, empty multisequence.
        ``dna_msa`` and ``start_atg`` are stored for later use;
        ``msa_signals`` is accepted but not used in this method.
        """
        super(TranslatedMSA, self).__init__(bioio.multisequence())

        self._src_msa, self._start_atg, self._na = dna_msa, start_atg, False

        self.retranslate()
コード例 #7
0
ファイル: seq2fst.py プロジェクト: trmznt/pys
def seq2fst(args):
    """Load sequences, split them into groups and write FST results.

    Group membership comes from ``grpparser.GroupParser(args)``; sequences
    whose decoded label is not in its ``group_info`` are reported and
    skipped.  Without ``args.groupfile`` / ``args.metafile`` the function
    calls ``cexit``.  With ``args.sitefile`` set, the result of
    ``calc_site_fst`` is written there, one labelled tab-separated row per
    entry; otherwise the matrix from ``calc_fst`` is written to
    ``args.outfile`` below a header row of group names.
    """
    # open and read sequence file
    cerr('[I - reading sequence file %s]' % args.infile)
    seqs = load(args.infile)

    # open and read group/meta file using groupfile/metafile if available
    if args.groupfile or args.metafile:
        cerr('[I - reading group information file]')
        group_parser = grpparser.GroupParser(args)
        group_parser.parse()

        group_seqs = {}

        for seq in seqs:
            sample = seq.label.decode('ASCII')
            try:
                grp = group_parser.group_info[sample]
            except KeyError:
                cerr('[W - sample %s is not assign to any group]' % sample)
                continue
            if grp not in group_seqs:
                group_seqs[grp] = multisequence()
            group_seqs[grp].append(seq)
    else:
        cexit('[ERR - seq2fst.py requires group information!]')

    for name, members in group_seqs.items():
        cerr('[I - group %s has %d sample(s)]' % (name, len(members)))

    if not args.sitefile:
        # pairwise FST matrix between groups
        FST_mat, groups = calc_fst(group_seqs)

        with open(args.outfile, 'w') as fout:
            fout.write('\t'.join(groups))
            fout.write('\n')
            np.savetxt(fout, FST_mat, fmt='%5.4f', delimiter='\t')
        return

    # perform FST site-wise
    FST_sites = calc_site_fst(group_seqs, args.nantozero)

    with open(args.sitefile, 'w') as fout:
        for label, mat in FST_sites:
            fout.write(label)
            fout.write('\t')
            np.savetxt(fout, mat, fmt='%5.4f', delimiter='\t', newline='\t')
            fout.write('\n')

    cerr('[I - site FST written to %s]' % (args.sitefile))
コード例 #8
0
 def copy_to_msa(self):
     """Return a new multisequence restricted to the selected segments.

     For every sequence of ``self.model()`` a clone is made whose sequence
     is the concatenation of the slices described by
     ``self.normalize_position()``.
     """
     segments = self.normalize_position()
     src_mseqs = self.model()
     dest_mseqs = bioio.multisequence()
     for src in src_mseqs:
         piece = src.clone()
         # Each segment unpacks as (inclusive end, start), exactly as before.
         # NOTE(review): confirm normalize_position() really yields that order.
         piece.set_sequence(b''.join(
             [src[start:end + 1] for (end, start) in segments]))
         # Append the clone itself rather than set_sequence()'s return value,
         # so this still works if the setter returns None.
         dest_mseqs.append(piece)
     return dest_mseqs
コード例 #9
0
def concat_sequencs( multiseqs ):
    # NOTE(review): this definition is cut off in the visible source (it ends
    # on an unfinished `if`), so only the visible part is described here.

    seqnames = {}
    new_mseq = bioio.multisequence()
    # seed the result with a clone of every sequence of the first
    # multisequence and remember each label
    for seq in multiseqs[0]:
        s = seq.clone()
        s.set_sequence( s.get_sequence() )
        new_mseq.append( s )
        seqnames[s.get_label()] = True

    for multiseq in multiseqs[1:]:
        # NOTE(review): `s` is still the last clone from the loop above and
        # `multiseq` is not used in the visible lines -- confirm the intent
        label = s.get_label()
        if 
コード例 #10
0
ファイル: readseq.py プロジェクト: trmznt/seqpy
def main(args):
    """Read, filter, sort, summarise and optionally save sequences.

    All files in ``args.files`` are loaded and merged into one container.
    Attributes are appended, labels are optionally replaced by accession
    numbers (``args.accno``) and gaps optionally removed (``args.degap``).
    Sequences can be filtered by minimum/maximum length and by maximum
    fraction of ``N`` (each filter is active only when its value is > 0),
    sorted by length (descending) or label, summarised as per-sequence
    A/C/G/T/- counts on stdout, and saved to ``args.outfile``.
    """
    container = None

    for infile in args.files:

        obj = bioio.load(infile, options=args.io_opts or [])
        cout('reading %d sequences from %s' % (len(obj), infile))
        if container is None:
            container = obj
        else:
            container += obj

    append_attributes(container, args.src, args.src_isolate, args.definition)

    if args.accno:
        set_label_to_accno(container)

    if args.degap:
        container.degap()

    if args.minlen > 0 or args.maxlen > 0 or args.maxN > 0:
        new_container = bioio.multisequence()
        for s in container:
            length = len(s)
            if args.minlen > 0 and length < args.minlen:
                continue
            if args.maxlen > 0 and length > args.maxlen:
                continue
            # An empty sequence (e.g. all gaps before degapping) has no N
            # fraction to compute; skip the ratio instead of dividing by 0.
            if (args.maxN > 0 and length > 0
                    and s.seq.count(b'N') / length > args.maxN):
                continue
            new_container.append(s)

        container = new_container

    if args.sort:
        if args.sort.startswith('len'):
            container.sort(len, reverse=True)
        elif args.sort.startswith('lab'):
            container.sort(lambda x: x.label)

    if args.summary:
        for s in container:
            seq = s.seq.upper()
            print(">%s\nA:%d\tC:%d\tG:%d\tT:%d\t-:%d" %
                  (s.label.decode('ASCII'), seq.count(b'A'), seq.count(b'C'),
                   seq.count(b'G'), seq.count(b'T'), seq.count(b'-')))

    if args.outfile:
        bioio.save(container, args.outfile, options=args.io_opts or [])
コード例 #11
0
ファイル: translate.py プロジェクト: afifai/seqpy
def main(args):
    """Translate every sequence of the input files and save the result.

    Each file in ``args.files`` is loaded with ``args.io_opts``; every
    sequence is cloned and the clone's sequence is replaced by
    ``funcs.translated(seq, start_pos=args.start_codon)``.  All clones are
    written to ``args.outfile``.
    """
    translated_seqs = bioio.multisequence()

    for path in args.files:
        loaded = bioio.load(path, options=args.io_opts)
        cout('reading %d sequences from %s' % (len(loaded), path))

        for source in loaded:
            product = source.clone()
            protein = funcs.translated(source, start_pos=args.start_codon)
            product.set_sequence(protein)
            translated_seqs.append(product)

    bioio.save(translated_seqs, args.outfile)
コード例 #12
0
ファイル: funcs.py プロジェクト: trmznt/seqpy
def dereplicate(mseq):
    """Collapse entries of ``mseq`` that share the same sequence.

    Entries are grouped by ``str(entry.seq)``.  The result is a new
    multisequence with one biosequence per distinct sequence, labelled with
    the labels of all its members joined by ``#``.
    """
    from seqpy.core.bioio import biosequence, multisequence

    by_sequence = {}
    for record in mseq:
        # the first record with a given key supplies the stored sequence
        _, labels = by_sequence.setdefault(str(record.seq), (record.seq, []))
        labels.append(record.label)

    unique = multisequence()
    for sequence, labels in by_sequence.values():
        unique.append(biosequence('#'.join(labels), sequence))

    return unique
コード例 #13
0
def gather_consensus( args ):
    """Collect per-sample consensus sequences and statistics into one place.

    For every entry of ``args.indir`` (sorted), the first sequence of
    ``<indir>/<entry>/<args.consfile>`` is collected and the second line of
    ``<indir>/<entry>/<args.statfile>`` is recorded; the first line of the
    first statistics file read becomes the header.  Entries without a
    consensus file are reported and skipped.  Sequences from ``args.add``,
    if given, are placed first.  Results are written to ``consensus.fas``
    and ``stats.tsv`` inside ``args.outdir`` (default: ``args.indir`` +
    ``'-results'``; the value is written back onto ``args``).
    """
    # set output directory
    args.outdir = args.indir + '-results' if not args.outdir else args.outdir

    # open input file
    cons = multisequence()
    header = None
    stat_lines = []

    if args.add:
        seqs = load(args.add)
        cons.extend( seqs )

    for indir in sorted(os.listdir(args.indir)):

        seqpath = os.path.join(args.indir, indir, args.consfile)
        print(args.indir, indir, args.consfile, seqpath)
        try:
            seqs = load(seqpath)
        except FileNotFoundError:
            cerr('[WARN: no such file: %s]' % (seqpath) )
            continue

        cons.append( seqs[0] )

        statpath = os.path.join(args.indir, indir, args.statfile)
        with open(statpath) as fin:
            lines = fin.read().split('\n')
            if not header:
                header = lines[0].strip()
            stat_lines.append( lines[1].strip() )

    # an already existing output directory is fine; any other failure
    # (permissions, missing parent, ...) should surface here instead of
    # being silently swallowed
    try:
        os.mkdir(args.outdir)
    except FileExistsError:
        pass

    save( cons, os.path.join(args.outdir, 'consensus.fas' ) )
    with open( os.path.join(args.outdir, 'stats.tsv'), 'w') as fout:
        # header stays None when no statistics file was read at all
        if header is not None:
            fout.write(header)
            fout.write('\n')
        fout.write('\n'.join(stat_lines))

    cerr(f'[Writing results to directory {args.outdir}]')
コード例 #14
0
def main(args):
    """Trim every trace file and save the trimmed sequences.

    Each file in ``args.files`` is loaded and passed to
    ``traceutils.trim`` with ``args.winsize`` and ``args.qual_threshold``.
    Files for which trimming yields a falsy result are skipped.  For the
    others a biosequence labelled with the file name is created from the
    returned bases, annotated with the upstream and downstream trim values,
    and collected.  The collection is written to ``args.outfile``.
    """
    trimmed = bioio.multisequence()

    for path in args.files:
        outcome = traceutils.trim(bioio.load(path), args.winsize,
                                  args.qual_threshold)
        if outcome:
            # the quality values are returned but not needed here
            bases, _quals, upstream_trim, downstream_trim = outcome
            record = bioio.biosequence(path, bases)
            record.add_attr('upstream_trim', str(upstream_trim))
            record.add_attr('downstream_trim', str(downstream_trim))
            trimmed.append(record)

    bioio.save(trimmed, args.outfile)
コード例 #15
0
ファイル: seq2pi.py プロジェクト: trmznt/pys
def seq2pi(args):
    """Report the ``calc_pi`` result for every group of sequences.

    Sequences from ``args.infile`` are grouped with
    ``grpparser.GroupParser(args)`` when ``args.groupfile`` or
    ``args.metafile`` is given (sequences without a group are reported and
    skipped); otherwise all sequences form the single group ``'ALL'``.
    For each group the average and standard deviation returned by
    ``calc_pi`` are printed and, when ``args.outfile`` is set, also written
    there as a tab-separated table.
    """
    # open and read sequence file
    cerr('[I - reading sequence file %s]' % args.infile)
    seqs = load(args.infile)

    # open and read group/meta file using groupfile/metafile if available
    if args.groupfile or args.metafile:
        cerr('[I - reading group information file]')
        group_parser = grpparser.GroupParser(args)
        group_parser.parse()

        group_seqs = {}

        for seq in seqs:
            try:
                grp = group_parser.group_info[seq.label.decode('ASCII')]
            except KeyError:
                cerr('[W - sample %s is not assign to any group]' %
                     seq.label.decode('ASCII'))
                continue
            if grp in group_seqs:
                group_seqs[grp].append(seq)
            else:
                ms = multisequence()
                ms.append(seq)
                group_seqs[grp] = ms
    else:
        group_seqs = {'ALL': seqs}

    print('Groups:')
    outf = open(args.outfile, 'w') if args.outfile else None
    try:
        if outf:
            outf.write('GROUP\tN\tPI\tSTDDEV\n')
        for g in group_seqs:
            avg, stddev = calc_pi(group_seqs[g])
            cout('  %20s [%3d]: %f +- %f' %
                 (g, len(group_seqs[g]), avg, stddev))
            if outf:
                outf.write('%s\t%d\t%5.4f\t%5.4f\n' %
                           (g, len(group_seqs[g]), avg, stddev))
    finally:
        # make sure the table is flushed and the handle released, also when
        # calc_pi fails half-way through
        if outf:
            outf.close()

    if args.outfile:
        # closing bracket added to match the other log messages
        cerr('[I - result written to %s]' % args.outfile)
コード例 #16
0
ファイル: recircularize.py プロジェクト: trmznt/seqpy
def main(args):
    """Recircularize qualifying sequences against a reference and save them.

    Sequences are loaded from ``args.infile`` and the reference from
    ``args.reffile``.  A sequence is passed through
    ``recircularize_sequence`` (against the first reference sequence, with
    ``args.max_mismatch``) only when ``args.minlen`` is positive and the
    sequence is longer than it; otherwise its sequence is copied unchanged.
    All clones are written to ``args.outfile``.
    """
    circseqs = bioio.multisequence()

    mseq = bioio.load(args.infile, options=args.io_opts)
    rseq = bioio.load(args.reffile)

    for seq in mseq:
        circseq = seq.clone()
        eligible = args.minlen > 0 and len(seq) > args.minlen
        if eligible:
            print('seq:', circseq.label)
            new_sequence = recircularize_sequence(
                seq.seq, rseq[0].seq, max_mismatch=args.max_mismatch)
        else:
            new_sequence = seq.seq
        circseq.set_sequence(new_sequence)
        circseqs.append(circseq)

    bioio.save(circseqs, args.outfile)
コード例 #17
0
ファイル: patdist.py プロジェクト: trmznt/seqpy
def main(args):
    """Collect the sequences of taxa closest to the reference taxa in a tree.

    The Newick tree in ``args.treefile`` is read and its phylogenetic
    distance matrix computed.  Nothing else happens unless
    ``args.collect`` is positive.  In that case every taxon whose label is
    found in ``args.reffile`` is a reference taxon; for each of them the
    ``args.collect`` candidates with the smallest distance are collected,
    plus the reference taxon itself.  The sequences of all collected taxa
    are looked up by label in ``args.dbfile`` and written to
    ``args.outfile``; taxa missing from the database are reported and
    skipped.
    """
    import dendropy

    tree = dendropy.Tree.get(path=args.treefile, schema="newick")

    pdc = tree.phylogenetic_distance_matrix()

    cerr('Reading: %d taxa' % len(tree.taxon_namespace))

    if args.collect <= 0:
        return

    ref_seqs = bioio.load(args.reffile)

    ref_taxa = []
    for taxon in tree.taxon_namespace:
        if ref_seqs.get_by_label(taxon.label) is not None:
            print('appended')
            ref_taxa.append(taxon)

    cerr('Referenced: %d taxa' % len(ref_taxa))

    collected_taxa = set()
    for t1 in ref_taxa:
        # NOTE(review): the last taxon of the namespace is never a
        # candidate because of the [:-1] slice -- confirm this is intended
        d = [(pdc(t1, t2), t2) for t2 in tree.taxon_namespace[:-1]]
        d.sort()

        # slicing (instead of indexing 0..collect-1) copes with trees that
        # have fewer candidates than args.collect
        for _, neighbour in d[:args.collect]:
            collected_taxa.add(neighbour)
        collected_taxa.add(t1)

    cerr('Collected: %d taxa' % len(collected_taxa))

    db_seqs = bioio.load(args.dbfile)
    mseq = bioio.multisequence()
    for taxon in collected_taxa:
        seq = db_seqs.get_by_label(taxon.label)
        if seq is None:
            # get_by_label signals a missing label with None; do not put
            # that into the output container
            cerr('WARNING: %s not found in %s' % (taxon.label, args.dbfile))
            continue
        mseq.append(seq)

    bioio.save(mseq, args.outfile)
コード例 #18
0
def main(args):
    """Recircularize one specific sequence against a reference and save it.

    Sequences are loaded from ``args.infile`` and the reference from
    ``args.reffile``.  Only sequences whose label equals the hard-coded
    value below are processed.  A warning is emitted when such a sequence
    is shorter than the first reference sequence.  It is passed through
    ``recircularize_sequence`` only when ``args.minlen`` is positive and
    the sequence is longer than it; otherwise it is copied unchanged.  The
    result is written to ``args.outfile``.
    """
    circseqs = bioio.multisequence()

    mseq = bioio.load(args.infile, options=args.io_opts)
    rseq = bioio.load(args.reffile)

    for seq in mseq:
        # NOTE(review): hard-coded label filter, looks like a debugging
        # leftover -- confirm before relying on this entry point
        if seq.label != 'NODE_2_length_4501_cov_41.785':
            continue

        if len(seq) < len(rseq[0]):
            cerr('WARNING: %s is shorter than reference' % seq.label)

        circseq = seq.clone()
        eligible = args.minlen > 0 and len(seq) > args.minlen
        if eligible:
            print('seq:', circseq.label)
            new_sequence = recircularize_sequence(
                seq.seq, rseq[0].seq, max_mismatch=args.max_mismatch)
        else:
            new_sequence = seq.seq
        circseq.set_sequence(new_sequence)
        circseqs.append(circseq)

    bioio.save(circseqs, args.outfile)
コード例 #19
0
 def init_params(self):
     """Set ``mseq`` to an empty multisequence and ``chr_used`` to an int counter."""
     self.mseq, self.chr_used = bioio.multisequence(), defaultdict(int)
コード例 #20
0
ファイル: actions.py プロジェクト: trmznt/insane
 def file_new(self):
     """Create an empty multisequence named 'untitled' and pass it to ``self.view``."""
     document = bioio.multisequence()
     document.set_filename('untitled')
     self.view(document)