Beispiel #1
0
def fas2table(args):
    """Write the table produced by ``generate_table`` to ``args.outfile``.

    Loads ``args.infile`` and ``args.reffile`` with ``load``, hands both to
    ``generate_table`` and writes one line per ``(label, muts)`` pair.
    """
    alignment = load(args.infile)
    reference = load(args.reffile)

    rows = generate_table(alignment, reference)

    with open(args.outfile, 'w') as handle:
        # one line per row: label, "/", a tab, then the space-joined items
        handle.writelines(
            '%s/\t%s\n' % (name, ' '.join(mutations))
            for (name, mutations) in rows)

    cerr('[Writing table to %s]' % args.outfile)
Beispiel #2
0
def map_sequences():
    """Try to map every loaded contig with ``map_sequence``.

    NOTE(review): ``contigsfile``, ``args``, ``ref`` and ``max_mismatch`` are
    not defined inside this function; presumably they come from an enclosing
    or module scope -- confirm against the full file.
    """

    contigs = bioio.load(contigsfile)
    # NOTE(review): ``rseq`` is assigned here but the calls below pass ``ref``;
    # verify which reference object is actually intended.
    rseq = bioio.load(args.reffile)

    for contig in contigs:

        # map contig to ref sequence
        start, end, mismatch, _, _ = map_sequence(contig, ref, max_mismatch)
        if start < 0:
            # a negative start is treated as "not mapped": retry with the
            # result of funcs.reverse_complemented()
            contig = funcs.reverse_complemented(contig)
            start, end, mismatch, _, _ = map_sequence(contig, ref,
                                                      max_mismatch)
            if start < 0:
                # still not mapped after the retry -- skip this contig
                continue
Beispiel #3
0
def main(args):
    """Relabel sequences from several files with a running 4-digit number.

    Every file in ``args.files`` is loaded with ``bioio.load`` and sorted by
    label.  Each sequence gets a new label ``'%04d' % n`` (``n`` counts across
    all files, starting at 1) and its upper-cased sequence is appended to a
    new container, which is saved to ``args.outfile``.  A tab-separated table
    mapping the new labels to the original label and selected attributes is
    written to ``args.tabfile``.
    """
    tables = []
    container = bioio.multisequence()

    n = 1
    for infile in args.files:

        mseqs = bioio.load(infile)
        mseqs.sort(lambda x: x.label)

        for s in mseqs:
            tables.append((
                n,
                s.label,
                s.attr.get('collection_date', ''),
                s.attr.get('country', ''),
                s.attr.get('isolate', ''),
                s.definition,
            ))
            container.append(bioio.biosequence('%04d' % n, s.seq.upper()))
            n += 1

    # Rows are appended in strictly increasing order of ``n`` (the first
    # tuple element), so they are already sorted and need no extra sort.
    # The context manager guarantees the file is closed even when a write
    # fails part-way through.
    with open(args.tabfile, 'w') as tabfile:
        tabfile.write('LABEL\tACCNO\tDATE\tCOUNTRY\tISOLATE\tDEFINITION\n')
        for row in tables:
            tabfile.write('%04d\t%s\t%s\t%s\t%s\t%s\n' % row)

    bioio.save(container, args.outfile)
Beispiel #4
0
def main(args):
    """Merge the input files and replace labels with running numbers.

    All files in ``args.files`` are loaded and concatenated with ``+=``.
    Each sequence label is replaced by ``'%04d' % position`` (1-based).  The
    merged container is saved to ``args.outfile`` when given, and the
    new-label / old-label pairs are written to ``args.tabfile`` when given.
    """
    merged = None

    for path in args.files:
        loaded = bioio.load(path, options=args.io_opts or [])
        cout('reading %d sequences from %s' % (len(loaded), path))
        if merged is not None:
            merged += loaded
        else:
            merged = loaded

    # remember (new label, old label) before overwriting each label
    label_pairs = []
    for position, record in enumerate(merged, start=1):
        numbered = '%04d' % position
        label_pairs.append((numbered, record.label))
        record.label = numbered

    if args.outfile:
        bioio.save(merged, args.outfile, options=args.io_opts or [])

    if args.tabfile:
        with open(args.tabfile, 'w') as handle:
            for pair in label_pairs:
                handle.write('%s\t%s\n' % pair)
Beispiel #5
0
def main(args):
    """Translate every sequence of the input files and save the results.

    Without ``args.start_sequence`` each sequence is translated from
    ``args.start_codon``.  With it, the sequence is degapped and upper-cased,
    the start pattern is located with ``funcs.search_restriction_site`` and
    translation starts at the hit; sequences without exactly one hit are
    skipped.  ``args.start_sequence`` is replaced in place by its upper-cased
    ASCII-encoded form.
    """
    translations = bioio.multisequence()

    if args.start_sequence:
        args.start_sequence = args.start_sequence.upper().encode('ASCII')

    for path in args.files:
        records = bioio.load(path, options=args.io_opts)
        cout('reading %d sequences from %s' % (len(records), path))

        for record in records:
            product = record.clone()

            if not args.start_sequence:
                product.set_sequence(
                    funcs.translated(record, start_pos=args.start_codon))
            else:
                # the restriction-site search is reused to find the position
                # of the start pattern
                template = funcs.uppercased(funcs.degapped(record))
                hits = funcs.search_restriction_site(template,
                                                     args.start_sequence)
                if len(hits) != 1:
                    continue
                offset = hits[0][0]
                print(template[offset:offset + 30])
                product.set_sequence(
                    funcs.translated(template, start_pos=offset + 1))

            translations.append(product)

    bioio.save(translations, args.outfile)
Beispiel #6
0
def seq2fst(args):
    """Group the input sequences and write FST results.

    Sequences from ``args.infile`` are grouped using the group information
    parsed by ``grpparser.GroupParser``; sequences without a group are
    reported and skipped.  When ``args.sitefile`` is set the result of
    ``calc_site_fst`` is written there, otherwise the matrix from
    ``calc_fst`` is written to ``args.outfile``.
    """
    # open and read sequence file
    cerr('[I - reading sequence file %s]' % args.infile)
    seqs = load(args.infile)

    # group information is mandatory for this command
    if args.groupfile or args.metafile:
        cerr('[I - reading group information file]')
        group_parser = grpparser.GroupParser(args)
        group_parser.parse()

        group_seqs = {}

        for record in seqs:
            try:
                group = group_parser.group_info[record.label.decode('ASCII')]
            except KeyError:
                cerr('[W - sample %s is not assign to any group]' %
                     record.label.decode('ASCII'))
                continue
            if group not in group_seqs:
                group_seqs[group] = multisequence()
            group_seqs[group].append(record)
    else:
        cexit('[ERR - seq2fst.py requires group information!]')

    for name, members in group_seqs.items():
        cerr('[I - group %s has %d sample(s)]' % (name, len(members)))

    if not args.sitefile:
        # FST matrix between groups, with the group names as header line
        FST_mat, groups = calc_fst(group_seqs)

        with open(args.outfile, 'w') as fout:
            fout.write('\t'.join(groups) + '\n')
            np.savetxt(fout, FST_mat, fmt='%5.4f', delimiter='\t')
        return

    # perform FST site-wise: one output line per (label, matrix) pair
    site_results = calc_site_fst(group_seqs, args.nantozero)

    with open(args.sitefile, 'w') as fout:
        for (label, matrix) in site_results:
            fout.write(label)
            fout.write('\t')
            np.savetxt(fout,
                       matrix,
                       fmt='%5.4f',
                       delimiter='\t',
                       newline='\t')
            fout.write('\n')

    cerr('[I - site FST written to %s]' % (args.sitefile))
def gather_consensus( args ):
    """Collect per-sample consensus sequences and statistics into one place.

    For every entry of ``args.indir`` (in sorted order) the file
    ``args.consfile`` inside it is loaded and its first sequence collected;
    entries without that file are reported and skipped.  The second line of
    each entry's ``args.statfile`` is collected as well, and the first line
    of the first statistics file is used as the header.  Results are written
    to ``consensus.fas`` and ``stats.tsv`` in ``args.outdir`` (defaults to
    ``args.indir + '-results'``).  Sequences from ``args.add``, when given,
    are put in front of the collected consensus sequences.
    """
    # set output directory
    if not args.outdir:
        args.outdir = args.indir + '-results'

    cons = multisequence()
    header = None
    stat_lines = []

    if args.add:
        cons.extend( load(args.add) )

    for indir in sorted(os.listdir(args.indir)):

        seqpath = os.path.join(args.indir, indir, args.consfile)
        print(args.indir, indir, args.consfile, seqpath)
        try:
            seqs = load(seqpath)
        except FileNotFoundError:
            cerr('[WARN: no such file: %s]' % (seqpath) )
            continue

        cons.append( seqs[0] )

        statpath = os.path.join(args.indir, indir, args.statfile)
        with open(statpath) as fin:
            lines = fin.read().split('\n')
        if not header:
            header = lines[0].strip()
        stat_lines.append( lines[1].strip() )

    # Only an already-existing directory is acceptable here; any other
    # failure (permissions, missing parent, ...) must surface instead of
    # being silently swallowed.
    try:
        os.mkdir(args.outdir)
    except FileExistsError:
        pass

    save( cons, os.path.join(args.outdir, 'consensus.fas' ) )
    with open( os.path.join(args.outdir, 'stats.tsv'), 'w') as fout:
        # header stays None when no statistics file was read at all; writing
        # None would raise TypeError, so emit an empty file in that case
        if header is not None:
            fout.write(header)
            fout.write('\n')
        fout.write('\n'.join(stat_lines))

    cerr(f'[Writing results to directory {args.outdir}]')
Beispiel #8
0
def main(args):
    """Condense the sequences of ``args.infile`` and save them.

    The result of ``funcs.condensed`` is saved to ``args.outfile``; when
    ``args.report`` is set it is also passed to ``write_report``.
    """
    sequences = bioio.load(args.infile, options=args.io_opts)
    cout('reading %d sequences from %s' % (len(sequences), args.infile))

    condensed = funcs.condensed(sequences)
    bioio.save(condensed, args.outfile)

    if not args.report:
        return
    write_report(condensed, args.report)
Beispiel #9
0
def main(args):
    """Recircularize sequences longer than ``args.minlen`` and save all.

    Each sequence of ``args.infile`` is cloned.  When ``args.minlen`` is
    positive and the sequence is longer than it, the clone receives the
    result of ``recircularize_sequence`` against the first sequence of
    ``args.reffile``; otherwise it keeps the original sequence.
    """
    results = bioio.multisequence()

    records = bioio.load(args.infile, options=args.io_opts)
    reference = bioio.load(args.reffile)

    for record in records:
        duplicate = record.clone()
        long_enough = args.minlen > 0 and len(record) > args.minlen
        if long_enough:
            print('seq:', duplicate.label)
            payload = recircularize_sequence(
                record.seq,
                reference[0].seq,
                max_mismatch=args.max_mismatch)
        else:
            payload = record.seq
        duplicate.set_sequence(payload)
        results.append(duplicate)

    bioio.save(results, args.outfile)
Beispiel #10
0
def main(args):
    """Collect the taxa closest to the referenced taxa of a tree.

    The Newick tree in ``args.treefile`` is read with dendropy.  When
    ``args.collect`` is positive, every taxon whose label is found in
    ``args.reffile`` is treated as a reference; for each reference the
    ``args.collect`` entries with the smallest phylogenetic distance are
    collected together with the reference itself.  The sequences carrying
    the collected labels are taken from ``args.dbfile`` and saved to
    ``args.outfile``.
    """
    import dendropy

    tree = dendropy.Tree.get(path=args.treefile, schema="newick")

    pdc = tree.phylogenetic_distance_matrix()

    cerr('Reading: %d taxa' % len(tree.taxon_namespace))

    if args.collect > 0:

        ref_seqs = bioio.load(args.reffile)

        ref_taxa = []
        for taxon in tree.taxon_namespace:
            # identity test: a custom __eq__/__ne__ on the sequence type
            # must not influence the "was it found" decision
            if ref_seqs.get_by_label(taxon.label) is not None:
                print('appended')
                ref_taxa.append(taxon)

        cerr('Referenced: %d taxa' % len(ref_taxa))

        collected_taxa = set()
        for t1 in ref_taxa:
            # NOTE(review): the last taxon of the namespace is excluded from
            # the candidates, as in the original -- confirm this is intended
            distances = [(pdc(t1, t2), t2)
                         for t2 in tree.taxon_namespace[:-1]]
            distances.sort()

            # slicing (instead of indexing 0..collect-1) avoids an IndexError
            # when fewer than args.collect candidates exist
            collected_taxa.update(
                taxon for (_, taxon) in distances[:args.collect])
            collected_taxa.add(t1)

        cerr('Collected: %d taxa' % len(collected_taxa))

        db_seqs = bioio.load(args.dbfile)
        mseq = bioio.multisequence()
        for taxon in collected_taxa:
            mseq.append(db_seqs.get_by_label(taxon.label))

        bioio.save(mseq, args.outfile)
Beispiel #11
0
def main(args):
    """Recircularize the selected sequence of ``args.infile`` and save it.

    Only the sequence with the hard-coded label below is processed.  A
    warning is emitted when it is shorter than the first sequence of
    ``args.reffile``.  When ``args.minlen`` is positive and the sequence is
    longer than it, the clone receives the ``recircularize_sequence``
    result; otherwise it keeps the original sequence.
    """
    # NOTE(review): this hard-coded label looks like a debugging leftover;
    # every other sequence is skipped -- confirm before relying on it.
    wanted_label = 'NODE_2_length_4501_cov_41.785'

    results = bioio.multisequence()

    records = bioio.load(args.infile, options=args.io_opts)
    reference = bioio.load(args.reffile)

    for record in records:
        if record.label != wanted_label:
            continue

        if len(record) < len(reference[0]):
            cerr('WARNING: %s is shorter than reference' % record.label)

        duplicate = record.clone()
        long_enough = args.minlen > 0 and len(record) > args.minlen
        if long_enough:
            print('seq:', duplicate.label)
            payload = recircularize_sequence(
                record.seq,
                reference[0].seq,
                max_mismatch=args.max_mismatch)
        else:
            payload = record.seq
        duplicate.set_sequence(payload)
        results.append(duplicate)

    bioio.save(results, args.outfile)
Beispiel #12
0
def main(args):
    """Merge, annotate, filter, sort, summarise and save sequences.

    Files in ``args.files`` are loaded and concatenated, then passed to
    ``append_attributes``.  Optional steps, each driven by an ``args`` flag:
    relabel by accession (``accno``), degap (``degap``), filter by
    ``minlen`` / ``maxlen`` / ``maxN`` (fraction of ``N``), sort by length or
    label (``sort``), print base counts (``summary``) and save (``outfile``).
    """
    merged = None

    for path in args.files:
        loaded = bioio.load(path, options=args.io_opts or [])
        cout('reading %d sequences from %s' % (len(loaded), path))
        if merged is not None:
            merged += loaded
        else:
            merged = loaded

    append_attributes(merged, args.src, args.src_isolate, args.definition)

    if args.accno:
        set_label_to_accno(merged)

    if args.degap:
        merged.degap()

    if args.minlen > 0 or args.maxlen > 0 or args.maxN > 0:
        kept = bioio.multisequence()
        for record in merged:
            # short-circuit order matters: the N ratio is only computed for
            # records that passed both length checks
            rejected = (
                (args.minlen > 0 and len(record) < args.minlen)
                or (args.maxlen > 0 and len(record) > args.maxlen)
                or (args.maxN > 0
                    and record.seq.count(b'N') / len(record) > args.maxN)
            )
            if not rejected:
                kept.append(record)

        merged = kept

    if args.sort:
        if args.sort.startswith('len'):
            merged.sort(lambda item: len(item), reverse=True)
        elif args.sort.startswith('lab'):
            merged.sort(lambda item: item.label)

    if args.summary:
        for record in merged:
            upper = record.seq.upper()
            name = record.label.decode('ASCII')
            tallies = tuple(
                upper.count(symbol)
                for symbol in (b'A', b'C', b'G', b'T', b'-'))
            print(">%s\nA:%d\tC:%d\tG:%d\tT:%d\t-:%d" % ((name,) + tallies))

    if args.outfile:
        bioio.save(merged, args.outfile, options=args.io_opts or [])
Beispiel #13
0
def statseq(args):
    """Print per-sequence counts of A, C, G, T, N and '-' plus their sum.

    The sequences are loaded from ``args.infile`` and counted on their
    upper-cased form; one line per sequence is emitted through ``cout``.
    """
    symbols = (b'A', b'C', b'G', b'T', b'N', b'-')

    for record in bioio.load(args.infile, options=args.io_opts or []):
        upper = record.seq.upper()
        tallies = tuple(upper.count(symbol) for symbol in symbols)
        # L is the sum of the counted symbols only, not the raw length
        total = sum(tallies)

        cout('A: %3d  C: %3d  G: %3d  T: %3d  N: %3d  -: %3d  L: %3d  |  \t%s'
             % (tallies + (total, record.label)))
Beispiel #14
0
def main(args):
    """Translate every sequence of the input files from ``args.start_codon``.

    Each sequence is cloned, the clone's sequence is replaced by the result
    of ``funcs.translated`` and all clones are saved to ``args.outfile``.
    """
    translations = bioio.multisequence()

    for path in args.files:
        records = bioio.load(path, options=args.io_opts)
        cout('reading %d sequences from %s' % (len(records), path))

        for record in records:
            product = record.clone()
            converted = funcs.translated(record, start_pos=args.start_codon)
            product.set_sequence(converted)
            translations.append(product)

    bioio.save(translations, args.outfile)
Beispiel #15
0
def main(args):
    """Trim every input trace and save the trimmed bases as sequences.

    Each file of ``args.files`` is loaded and passed to ``traceutils.trim``
    with ``args.winsize`` and ``args.qual_threshold``.  Files for which the
    trim result is falsy are skipped.  The kept sequences are labelled with
    the file name and carry the two trim values as string attributes.
    """
    trimmed_seqs = bioio.multisequence()

    for path in args.files:
        outcome = traceutils.trim(bioio.load(path), args.winsize,
                                  args.qual_threshold)
        if outcome:
            # the second element (qualities) is not used here
            bases, _quals, upstream_trim, downstream_trim = outcome

            record = bioio.biosequence(path, bases)
            record.add_attr('upstream_trim', str(upstream_trim))
            record.add_attr('downstream_trim', str(downstream_trim))
            trimmed_seqs.append(record)

    bioio.save(trimmed_seqs, args.outfile)
Beispiel #16
0
def seq2pi(args):
    """Report the ``calc_pi`` result (average, stddev) for each group.

    Sequences from ``args.infile`` are grouped with the information parsed
    by ``grpparser.GroupParser`` when ``args.groupfile`` or ``args.metafile``
    is given (ungrouped samples are reported and skipped); otherwise all
    sequences form the single group ``'ALL'``.  One line per group is
    printed via ``cout`` and, when ``args.outfile`` is set, also written
    there as a tab-separated table.
    """
    # open and read sequence file
    cerr('[I - reading sequence file %s]' % args.infile)
    seqs = load(args.infile)

    # open and read group/meta file using groupfile/metafile if available
    if args.groupfile or args.metafile:
        cerr('[I - reading group information file]')
        group_parser = grpparser.GroupParser(args)
        group_parser.parse()

        group_seqs = {}

        for seq in seqs:
            try:
                grp = group_parser.group_info[seq.label.decode('ASCII')]
            except KeyError:
                cerr('[W - sample %s is not assign to any group]' %
                     seq.label.decode('ASCII'))
                continue
            if grp not in group_seqs:
                group_seqs[grp] = multisequence()
            group_seqs[grp].append(seq)
    else:
        group_seqs = {'ALL': seqs}

    print('Groups:')
    outf = open(args.outfile, 'w') if args.outfile else None
    try:
        if outf is not None:
            outf.write('GROUP\tN\tPI\tSTDDEV\n')
        for g, members in group_seqs.items():
            avg, stddev = calc_pi(members)
            cout('  %20s [%3d]: %f +- %f' % (g, len(members), avg, stddev))
            if outf is not None:
                outf.write('%s\t%d\t%5.4f\t%5.4f\n' %
                           (g, len(members), avg, stddev))
    finally:
        # the original never closed the file; close it even if calc_pi fails
        # so buffered rows are flushed and the handle is released
        if outf is not None:
            outf.close()

    if outf is not None:
        # closing bracket added: the original message was left unbalanced
        cerr('[I - result written to %s]' % args.outfile)
Beispiel #17
0
def main(args):
    """Print the unique haplotypes of ``args.infile`` and their labels.

    Sequences with identical content are grouped under the SHA-256 digest of
    their sequence data; the number of sequences, the number of distinct
    groups and the labels of each group are printed.
    """
    # Local import keeps this block self-contained.
    import hashlib

    mseq = bioio.load(args.infile, options=args.io_opts or [])

    print('Number of seqs: %d' % len(mseq))

    # get unique haplotype and sample cluster
    #
    # The digest bytes are used as the key.  A hashlib hash *object* hashes
    # and compares by identity, so keying on it would put every sequence in
    # its own group even when the contents are identical.
    haplotypes = {}
    for seq in mseq:
        digest = hashlib.sha256(seq.seq).digest()
        haplotypes.setdefault(digest, []).append(seq.label)

    print('Number of unique haplotypes: %d' % len(haplotypes))

    for idx, labels in enumerate(haplotypes.values()):
        print('Haplo %d =>' % idx)
        for label in labels:
            print('  %s' % label)
Beispiel #18
0
def main(args):
    """Append a year taken from a table to every sequence label.

    ``args.tabfile`` is a tab-separated file whose first line is skipped;
    rows are indexed by their first column.  For each sequence of
    ``args.infile`` the row with the same label is looked up, ``re_date`` is
    searched in the row's third column and the label becomes
    ``'<label>/<match>'`` (``'-'`` when nothing matches).  The result is
    saved to ``args.outfile``.

    Raises:
        KeyError: if a sequence label has no row in the table.
    """
    # read tables
    tables = {}
    # the context manager closes the table file, which the original leaked
    with open(args.tabfile) as tabfile:
        # skip the header line; the default keeps an empty file from raising
        # StopIteration
        next(tabfile, None)
        for line in tabfile:
            items = line.strip().split('\t')
            tables[items[0]] = items

    mseq = bioio.load(args.infile)
    for s in mseq:
        rec = tables[s.label]
        mo = re_date.search(rec[2])
        year = mo.group() if mo else '-'
        s.label = '%s/%s' % (s.label, year)

    bioio.save(mseq, args.outfile)
Beispiel #19
0
def main(args):
    """Merge the input files, add attributes, optionally summarise and save.

    Files in ``args.files`` are loaded and concatenated, then passed to
    ``append_attributes``.  With ``args.summary`` the A/C/G/T/'-' counts of
    each upper-cased sequence are printed; with ``args.outfile`` the merged
    container is saved.
    """
    merged = None

    for path in args.files:
        loaded = bioio.load(path, options=args.io_opts or [])
        cout('reading %d sequences from %s' % (len(loaded), path))
        if merged is not None:
            merged += loaded
        else:
            merged = loaded

    append_attributes(merged, args.src, args.src_isolate, args.definition)

    if args.summary:
        for record in merged:
            upper = record.seq.upper()
            name = record.label.decode('ASCII')
            tallies = tuple(
                upper.count(symbol)
                for symbol in (b'A', b'C', b'G', b'T', b'-'))
            print(">%s\nA:%d\tC:%d\tG:%d\tT:%d\t-:%d" % ((name,) + tallies))

    if args.outfile:
        bioio.save(merged, args.outfile, options=args.io_opts or [])
Beispiel #20
0
    def file_open(self, filename=None):
        """Load ``filename`` with ``bioio.load`` and show it via ``self.view``.

        When no filename is given a file dialog is shown; cancelling the
        dialog returns silently.  A missing file or a falsy load result is
        reported with ``alert``.
        """
        if not filename:
            filename, file_filter = QtWidgets.QFileDialog.getOpenFileName( self.pane(),
                    "Open project, alignment or trace file" )

        if not filename:
            return

        cout("Loading file %s" % filename)
        if not os.path.exists( filename ):
            alert('File %s does not exists. Please check your filename!' % filename)
            return

        b = progress('Opening ' + filename)
        b.repaint()
        try:
            obj = bioio.load( filename )
        finally:
            # hide and release the progress widget even when loading raises,
            # otherwise it would stay on screen
            b.hide()
            del b

        if obj:
            self.view(obj)
        else:
            alert('Error reading file ' + filename +'\nUnknown file format!')
Beispiel #21
0
    def dropEvent(self, ev):
        """Handle a drop onto this view.

        Three kinds of drops are handled:

        * URLs: the path of the first URL is loaded with ``bioio.load``; the
          result is appended (via ``get_sequence()`` when available) or
          added to the model, and ``ContentUpdated`` is emitted.
        * A drag that started in this very view: the dragged row is moved
          to the drop position.
        * A drag from another view of the same type: the row is moved
          within the shared model, or popped from the source model and
          inserted here, emitting ``ContentUpdated`` on the affected models.

        Anything else is logged as an unknown drop.
        """
        mime = ev.mimeData()

        if mime.hasUrls():
            # The original carried an unreachable legacy branch after this
            # path's ``return`` (trace/sequence-specific loading); it has
            # been removed because it could never execute.
            filename = str(mime.urls()[0].path())
            obj = bioio.load( filename )
            if hasattr(obj, 'get_sequence'):
                self.model().append( obj.get_sequence() )
            else:
                self.model().add( obj )

            self.model().signals().ContentUpdated.emit()

        elif ev.source() == self._view:
            idx, _ = self._view.xy2coord(0, ev.pos().y())
            seq = self.model()[self._dragidx]
            if idx < self._dragidx:
                # moving up: remove first so the target index stays valid
                self.model().delete(self._dragidx)
                self.model().insert(idx, seq)
            elif idx > self._dragidx:
                # moving down: insert first, the old index is unaffected
                self.model().insert(idx, seq)
                self.model().delete(self._dragidx)

        elif isinstance(ev.source(), type(self._view)):
            src = ev.source()
            idx, _ = self._view.xy2coord(0, ev.pos().y())

            if src.model() == self.model():
                # the same model, then just use model's move method
                if src.dnd()._dragidx != idx:
                    self.model().move( src.dnd()._dragidx, idx)
                    self.model().signals().ContentUpdated.emit()
            else:
                seq = src.model().pop(src.dnd()._dragidx)
                self.model().insert(idx, seq)
                src.model().signals().ContentUpdated.emit()
                self.model().signals().ContentUpdated.emit()

        else:
            D( ALL, "drop event with unknown type" )
Beispiel #22
0
def prepare_submission(args):
    """Build the metadata CSV and sequence file for a submission.

    Reads the metadata table ``args.metafile`` (``.csv`` or ``.tsv``), the
    tab-separated ``args.infile`` and the sequences in ``args.seqfile``.
    For every row of the infile whose sample code (column ``SAMPLE`` if
    present, else ``fn``) matches a sequence label, the metadata row is
    updated (coverage from ``AVGDEPTH``, output file name, sequencing
    technology and assembly method from ``args``) and the sequence is
    relabelled with the row's ``covv_virus_name`` and stripped of leading
    and trailing ``-``.  Only the metadata rows that were used are written
    to ``<outprefix>.csv``; the sequences are saved to ``<outprefix>.fas``.

    Raises:
        ValueError: if ``args.metafile`` ends with neither ``.csv`` nor
            ``.tsv``.
        KeyError: if a matched sample has no row in the metadata table.
    """
    out_metadata = args.outprefix + '.csv'
    out_fasta = args.outprefix + '.fas'

    # pick the separator from the extension (the original misspelled
    # ``lower`` in the .tsv branch and left ``separator`` unbound otherwise)
    metafile_lower = args.metafile.lower()
    if metafile_lower.endswith('.csv'):
        separator = ','
    elif metafile_lower.endswith('.tsv'):
        separator = '\t'
    else:
        raise ValueError(
            'metadata file must end with .csv or .tsv: %s' % args.metafile)
    cerr(f'[Reading metadata file {args.metafile}]')
    metadata_df = pd.read_table(args.metafile, sep=separator)

    # make sure sequence name is a string (in case the column is
    # automatically converted to number)
    metadata_df['fn'] = metadata_df['fn'].astype('str')
    metadata_df['covv_assembly_method'] = metadata_df['covv_assembly_method'].astype('str')
    metadata_df.set_index('fn', drop=False, inplace=True )

    # open infile tsv
    cerr(f'[Reading infile {args.infile}]')
    submission_df = pd.read_table(args.infile, sep='\t')

    # check for available field in submission_df
    code_field = 'SAMPLE' if 'SAMPLE' in submission_df.columns else 'fn'
    submission_df[code_field] = submission_df[code_field].astype('str')

    # open sequence file and index the sequences by label
    cerr(f'[Reading sequence file {args.seqfile}]')
    mseq = bioio.load( args.seqfile )
    mseq_keys = {seq.label: idx for idx, seq in enumerate(mseq)}

    # iterate over submission_df
    used = []

    for (_, s) in submission_df.iterrows():

        sample_id = s[code_field]

        # skip samples without a sequence before touching the metadata, so a
        # sample missing from both tables does not abort the run
        if sample_id not in mseq_keys:
            continue

        r = metadata_df.loc[sample_id]

        cerr(f'[Preparing sample {sample_id}]')
        # set coverage and submission-wide fields
        metadata_df.at[sample_id, 'covv_coverage'] = s['AVGDEPTH']
        metadata_df.at[sample_id, 'fn'] = out_fasta
        metadata_df.at[sample_id, 'covv_seq_technology'] = args.covv_seq_technology
        metadata_df.at[sample_id, 'covv_assembly_method'] = args.covv_assembly_method

        # set sequence name
        idx = mseq_keys[sample_id]
        mseq[idx].label = r['covv_virus_name']
        mseq[idx].seq = mseq[idx].seq.strip(b'-')
        used.append(sample_id)
        cerr(f'[Finish preparing  sample {sample_id}]')

    # remove unused metadata
    metadata_df = metadata_df.loc[ used ]

    # write to new fasta & metadata file
    metadata_df.to_csv(out_metadata, sep=',', index=False)
    bioio.save(mseq, out_fasta)
Beispiel #23
0
def main(args):
    """Condense the sequences of ``args.infile`` and save to ``args.outfile``.

    The sequences are loaded with ``bioio.load``, passed through
    ``funcs.condensed`` and written with ``bioio.save``.
    """
    sequences = bioio.load(args.infile, options=args.io_opts)
    cout('reading %d sequences from %s' % (len(sequences), args.infile))

    condensed = funcs.condensed(sequences)
    bioio.save(condensed, args.outfile)