Example No. 1
    def test03(self):
        fa = path.join(datadir, 'F1_3xR1_36', 'trimmed.fasta')
        uc = path.join(datadir, 'F1_3xR1_36', 'trimmed.uc')
        groups = path.join(datadir, 'F1_3xR1_36', 'groups.csv.bz2')
        outdir = self.mkoutdir()
        limit = 500
        min_size = 2

        denoised = path.join(outdir, 'denoised.fasta')
        self.main([fa,
                   '--clusters', uc,
                   '--outfile', denoised,
                   '--limit', limit,
                   '--min-clust-size', min_size,
                   '--weights', path.join(outdir, 'weights.csv')])

        denoised_grouped = path.join(outdir, 'denoised.grouped.fasta')
        self.main([fa,
                   '--clusters', uc,
                   '--outfile', denoised_grouped,
                   '--limit', limit,
                   '--min-clust-size', min_size,
                   '--groups', groups,
                   '--weights', path.join(outdir, 'weights.grouped.csv')])

        with open(denoised) as d, open(denoised_grouped) as g:
            ds = list(fastalite(d))
            gs = list(fastalite(g))
            self.assertEqual(set(s.seq for s in ds), set(s.seq for s in gs))
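
A note on the record type: every example on this page walks fastalite records through their id, description, and seq attributes (and Example No. 16 calls _asdict(), which suggests a namedtuple). For orientation, here is a minimal sketch of a fastalite-style parser consistent with that usage; this is an illustration based on how the records are used here, not the package's actual implementation:

from collections import namedtuple

SeqLite = namedtuple('SeqLite', ['id', 'description', 'seq'])

def fastalite_sketch(handle):
    """Yield one SeqLite record per entry in an open FASTA file handle."""
    name, seqs = None, []
    for line in handle:
        line = line.strip()
        if line.startswith('>'):
            if name is not None:
                yield SeqLite(name.split()[0], name, ''.join(seqs))
            name, seqs = line[1:], []
        elif line:
            seqs.append(line)
    if name is not None:
        yield SeqLite(name.split()[0], name, ''.join(seqs))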
Example No. 2
    def test03(self):
        fa = path.join(datadir, 'F1_3xR1_36', 'trimmed.fasta')
        uc = path.join(datadir, 'F1_3xR1_36', 'trimmed.uc')
        groups = path.join(datadir, 'F1_3xR1_36', 'groups.csv.bz2')
        outdir = self.mkoutdir()
        limit = 500
        min_size = 2

        denoised = path.join(outdir, 'denoised.fasta')
        self.main([
            fa, '--clusters', uc, '--outfile', denoised, '--limit', limit,
            '--min-clust-size', min_size, '--weights',
            path.join(outdir, 'weights.csv')
        ])

        denoised_grouped = path.join(outdir, 'denoised.grouped.fasta')
        self.main([
            fa, '--clusters', uc, '--outfile', denoised_grouped, '--limit',
            limit, '--min-clust-size', min_size, '--groups', groups,
            '--weights',
            path.join(outdir, 'weights.grouped.csv')
        ])

        with open(denoised) as d, open(denoised_grouped) as g:
            ds = list(fastalite(d))
            gs = list(fastalite(g))
            self.assertEqual(set(s.seq for s in ds), set(s.seq for s in gs))
Example No. 3
    def test04(self):
        """
        Include an rle file.
        """

        out = self.mkoutdir()
        trimmed = path.join(out, 'trimmed.fasta')
        trimmed_rle = path.join(out, 'trimmed_rle.csv')

        args = [
            self.data('rle_100.fasta'),
            '--left-aligns', self.data('rle_100_left_ssearch.csv.bz2'),
            '--right-zscore', '80',
            '--right-aligns', self.data('rle_100_right_ssearch.csv.bz2'),
            '--fasta-out', trimmed,
            '--rle', self.data('rle_100.csv.bz2'),
            '--rle-out', trimmed_rle
        ]

        self.main(args)

        with open(trimmed) as f:
            self.assertEqual(len(list(fastalite(f))), 66)

        with open(trimmed_rle) as f:
            self.assertEqual(len(f.readlines()), 67)
Example No. 4
def action(args):
    if args.is_file:
        seqs = fastalite(opener(args.seqs))
        for s in seqs:
            seq = reversed(s.seq)
            seq = [rev_comp[se] for se in seq]
            seq = ''.join(seq)
            args.out_fasta.write('>{}\n{}\n'.format(s.description, seq))
    else:
        seq = [rev_comp[s] for s in args.seqs]
        seq = ''.join(seq)
        args.out.write(seq)
        args.out.write('\n')

    if args.rlefile and args.out_rle:
        reader = csv.reader(args.rlefile)
        writer = csv.writer(args.out_rle)

        # try to determine if first row is a header; we'll assume that
        # the first row, second column is a run-length encoding if
        # it's at least half digits.
        name, rle = next(reader)
        if sum(c.isdigit() for c in rle) / float(len(rle)) > 0.5:
            writer.writerow([name, ''.join(reversed(rle))])
        else:
            assert [name, rle] == rle_fieldnames
            writer.writerow([name, rle])

        for name, rle in reader:
            writer.writerow([name, ''.join(reversed(rle))])
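
The rev_comp lookup table used above is defined elsewhere in the module; a plausible minimal version (hypothetical, and likely extended with IUPAC ambiguity codes in the real table) behaves like this:

# hypothetical complement table; the real rev_comp may cover more codes
rev_comp = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}

# reverse-complement exactly as the action above does it
assert ''.join(rev_comp[b] for b in reversed('ACGTN')) == 'NACGT'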
Example No. 5
def action(args):

    seqs = fastalite(args.fasta)
    pairs = list(all_pairwise(seqs))

    if args.distance:
        pairs = [(q, t, 1 - i) for q, t, i in pairs]

    if args.split_info and args.matrix_out:
        primary, secondary = args.primary_group, args.secondary_group
        split_info = list(csv.DictReader(args.split_info))
        info = {r['seqname']: r for r in split_info if r['seqname']}
        tax = {r['tax_id']: r for r in split_info}

        # mirror each pair so both (query, target) and (target, query) exist
        pairs += map(itemgetter(1, 0, 2), pairs)

        def group(seqname):
            i = info[seqname]
            return (i[primary] or i[secondary]) if secondary else i[primary]

        pairs = ((group(left), group(right), score)
                 for left, right, score in pairs)

        # sort and group rows
        pairs = list(groupbyl(pairs, key=itemgetter(0)))

        matrix_out = csv.writer(args.matrix_out)

        # this is the tax_id order we will be using for columns
        tax_ids = map(itemgetter(0), pairs)

        # get the species names to output as first row
        matrix_out.writerow([''] + [tax[t]['tax_name'] for t in tax_ids])

        # iterate through the sorted rows (pairs)
        for row_id, columns in pairs:
            # sort and group columns
            columns = dict(groupbyl(columns, key=itemgetter(1)))

            # get the species name
            row = [tax[row_id]['tax_name']]

            for t in tax_ids:
                # if t is not in columns, only one sequence represents
                # the group, so the median distance is 0
                if t not in columns:
                    med = 0
                else:
                    col = columns[t]
                    med = median(map(itemgetter(2), col))
                    # round up to two decimal places
                    med = math.ceil(med * 100) / 100

                row.append(med)

            matrix_out.writerow(row)
    else:
        writer = csv.writer(args.out)
        writer.writerow(['query', 'target', 'identity'])
        writer.writerows(pairs)
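
This action leans on helpers that are not shown: all_pairwise (which, per the output header above, yields (query, target, identity) triples), groupbyl, and median. Minimal sketches of the latter two, written to match how they are called here (assumptions, not the package source):

from itertools import groupby

def groupbyl(iterable, key=None):
    # sort, then group, materializing each group as a list
    return ((k, list(g)) for k, g in groupby(sorted(iterable, key=key), key=key))

def median(values):
    values = sorted(values)
    n = len(values)
    return values[n // 2] if n % 2 else (values[n // 2 - 1] + values[n // 2]) / 2.0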
Example No. 6
def build_parser(parser):
    parser.add_argument(
        'clusters', type=Opener(),
        help='Clusters file (output of "usearch -uc")')
    parser.add_argument(
        '--fasta-in', type=lambda f: fastalite(Opener()(f)),
        help='input fasta file containing original clustered reads')
    parser.add_argument(
        '--fasta-out', type=Opener('w'),
        help='Output fasta containing centroids')
    parser.add_argument(
        '-g', '--groups', metavar='FILE', type=Opener(),
        help="""An optional file defining groups for partitioning
        input reads. If provided, cluster weights will be normalized
        to proportionally represent each group. File is a headerless
        csv with columns "seqname","group" (.csv.bz2)""")
    parser.add_argument(
        '--min-clust-size', type=int,
        default=1, help='[%(default)s]')
    parser.add_argument(
        '-o', '--out', type=Opener('w'), default=sys.stdout,
        help='Output file with columns (readname,centroidname)')
    parser.add_argument(
        '--specimenmap', type=Opener('w'),
        help='Output file with columns (clustername,samplename)')
    parser.add_argument(
        '--specimen', metavar='SAMPLENAME',
        help='provides samplename for mapfile')
    parser.add_argument(
        '-w', '--weights', type=Opener('w'),
        help='Output file with columns (clustername,weight)')
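
Opener appears throughout these examples as an argparse type factory: Opener() produces a reader and Opener('w') a writer for the given path. A minimal sketch of the idea, assuming it mirrors argparse.FileType while transparently handling compressed suffixes (the real class may differ):

import bz2
import gzip
import sys

class Opener(object):
    """argparse type= factory: Opener('w')('out.csv.bz2') -> file object."""

    def __init__(self, mode='r'):
        self.mode = mode

    def __call__(self, path):
        if path == '-':
            return sys.stdin if 'r' in self.mode else sys.stdout
        if path.endswith('.bz2'):
            return bz2.BZ2File(path, self.mode)
        if path.endswith('.gz'):
            return gzip.open(path, self.mode)
        return open(path, self.mode)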
Example No. 7
def action(args):
    # for debugging:
    # pd.set_option('display.max_columns', None)
    # pd.set_option('display.max_rows', None)

    if args.intype == 'fasta':
        fa = fastalite(args.infile, limit=args.limit)
        df = pd.Series(data={f.id: f.seq for f in fa}, name='seq')
        df = df.reset_index()
        df = df.set_index('index')
        df.index.name = 'id'
        df['length'] = df['seq'].apply(len)
        column = 'length'
    else:  # elif args.intype == 'csv':
        df = pd.read_csv(args.infile)
        column = args.column

    xticks = args.xaxis.split(',') if args.xaxis else None

    # do not display plots
    plt.use('Agg')

    # format blast data and add additional available information
    pl = df[column].plot(kind='kde',
                         title=args.title,
                         xticks=xticks)

    log.info('printing to {}'.format(args.out))

    pl.get_figure().savefig(args.out)
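
One caveat on the plt.use('Agg') call above: matplotlib.pyplot has no use() function, so the call only works if plt is bound to the top-level matplotlib module. With the conventional pyplot import, a non-interactive backend is selected like this:

import matplotlib
matplotlib.use('Agg')  # must run before pyplot is imported
import matplotlib.pyplot as plt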
Example No. 8
def build_parser(parser):
    parser.add_argument('raw_reads',
                        type=lambda f: fastalite(Opener()(f)),
                        help="""input fasta file containing original
                        clustered reads (default stdin).""")
    parser.add_argument('readmap',
                        type=Opener('r'),
                        help="""output of `bioy denoise --readmap`
                        (csv file with columns readname,clustername)""")
    parser.add_argument('-r',
                        '--rlefile',
                        type=Csv2Dict('name',
                                      'rle',
                                      fieldnames=['name', 'rle']),
                        help="""An optional file containing run
                        length encoding for infile (.csv.bz2)""")
    parser.add_argument('-d', '--outdir', help='output directory', default='.')
    parser.add_argument('--pattern',
                        help="""A regular expression matching cluster names""")
    parser.add_argument('-N',
                        '--sample',
                        type=int,
                        default=100,
                        metavar='N',
                        help='include no more than N reads [%(default)s]')
    parser.add_argument('--name-suffix',
                        help='string to insert into name before .fasta',
                        default='aln')
    parser.add_argument('--no-align',
                        action='store_false',
                        dest='align',
                        default=True)
Example No. 9
    def test02(self):
        with open(self.data('two.fasta')) as f:
            seqs = list(sequtils.fastalite(f))
            pairs = list(sequtils.all_pairwise(seqs))
            self.assertEqual(len(pairs), (len(seqs) * (len(seqs) - 1)) / 2)
            self.assertEqual(
                [s.id for s in seqs], list(sequtils.names_from_pairs(pairs)))
Example No. 10
    def test04(self):
        """
        test no clusters passing min_size

        1) test file is actually created
        2) test there are no seqs in file
        """

        fa = path.join(datadir, 'F1_3', 'trimmed.fasta')
        uc = path.join(datadir, 'F1_3', 'trimmed.uc')
        outdir = self.mkoutdir()
        fa_out = path.join(outdir, 'denoised_empty.fasta')
        limit = 100
        min_size = sys.maxint
        self.main([
            fa, '--clusters', uc, '--outfile', fa_out, '--limit', limit,
            '--min-clust-size', min_size
        ])

        # 1)
        self.assertTrue(os.path.isfile(fa_out))

        with open(fa_out) as out:
            outseqs = list(fastalite(out))
            # 2)
            self.assertEqual(len(outseqs), 0)
Example No. 11
    def test05(self):
        """
        test if no cluster file all one cluster
        """

        fa = path.join(datadir, '16S_random.fasta')
        outdir = self.mkoutdir()
        fa_out = path.join(outdir, 'denoised.fasta')
        self.main([fa, '--outfile', fa_out])

        reference = path.join(datadir, '16S_random_cons.fasta')
        with open(fa_out) as out, open(reference) as ref:
            outseqs = list(fastalite(out))
            refseqs = list(fastalite(ref))
            self.assertEqual(len(outseqs), len(refseqs))
            self.assertEqual(set(s.seq for s in outseqs),
                             set(s.seq for s in refseqs))
Example No. 12
    def test02(self):
        fa = path.join(datadir, 'F1_3', 'trimmed.fasta')
        uc = path.join(datadir, 'F1_3', 'trimmed.uc')
        outdir = self.mkoutdir()
        fa_out = path.join(outdir, 'denoised.fasta')
        limit = 100
        min_size = 2
        self.main([fa, '--clusters', uc, '--outfile', fa_out,
                   '--limit', limit, '--min-clust-size', min_size])

        reference = path.join(datadir, 'F1_3', 'test02_denoised.fasta')
        with open(fa_out) as out, open(reference) as ref:
            outseqs = list(fastalite(out))
            refseqs = list(fastalite(ref))
            self.assertEqual(len(outseqs), len(refseqs))
            self.assertEqual(set(s.seq for s in outseqs),
                             set(s.seq for s in refseqs))
Example No. 13
    def test02(self):
        fa = path.join(datadir, 'F1_3', 'trimmed.fasta')
        uc = path.join(datadir, 'F1_3', 'trimmed.uc')
        outdir = self.mkoutdir()
        fa_out = path.join(outdir, 'denoised.fasta')
        limit = 100
        min_size = 2
        self.main([
            fa, '--clusters', uc, '--outfile', fa_out, '--limit', limit,
            '--min-clust-size', min_size
        ])

        reference = path.join(datadir, 'F1_3', 'test02_denoised.fasta')
        with open(fa_out) as out, open(reference) as ref:
            outseqs = list(fastalite(out))
            refseqs = list(fastalite(ref))
            self.assertEqual(len(outseqs), len(refseqs))
            self.assertEqual(set(s.seq for s in outseqs),
                             set(s.seq for s in refseqs))
Example No. 14
def build_parser(parser):
    parser.add_argument('fasta',
                        type=lambda f: fastalite(Opener()(f)),
                        help='input fasta file')
    parser.add_argument('-l',
                        '--left-aligns',
                        type=Opener(),
                        help='left primer ssearch36 alignment results')
    parser.add_argument('-r',
                        '--right-aligns',
                        type=Opener(),
                        help='right primer ssearch36 alignment results')
    parser.add_argument('--left-range',
                        metavar='START,STOP',
                        help='Range of acceptable left primer start positions')
    parser.add_argument('--left-zscore',
                        metavar='VALUE',
                        type=float,
                        help='Min acceptable left primer z-score')
    parser.add_argument('--right-range',
                        metavar='START,STOP',
                        help=('Range of acceptable right '
                              'primer start positions'))
    parser.add_argument('--right-zscore',
                        metavar='VALUE',
                        type=float,
                        help='Min acceptable right primer z-score')
    parser.add_argument('--left-expr',
                        help=('python expression defining '
                              'criteria for keeping left primer'))
    parser.add_argument('--right-expr',
                        help=('python expression defining criteria '
                              'for keeping right primer'))
    parser.add_argument('-o',
                        '--fasta-out',
                        type=Opener('w'),
                        default=sys.stdout,
                        help='trimmed fasta output file')
    parser.add_argument('--rle',
                        type=Csv2Dict('name',
                                      'rle',
                                      fieldnames=['name', 'rle']),
                        help='rle input file (required if --rle-out)')
    parser.add_argument(
        '--rle-out',
        type=lambda f: DictWriter(Opener('w')(f), fieldnames=['name', 'rle']),
        help='trimmed rle output file')
    parser.add_argument('-i',
                        '--include-primer',
                        action='store_true',
                        default=False,
                        help='Include primer in trimmed sequence')
    parser.add_argument('--keep-all-seqs',
                        action='store_true',
                        help='keep seqs that fall outside the trimming thresholds')
Example No. 15
def build_parser(parser):
    parser.add_argument('seqs',
                        type=lambda f: fastalite(Opener()(f), readfile=False),
                        help='Input fasta file')
    parser.add_argument('rle',
                        type=Opener(),
                        help='csv file (may be bzip encoded) containing columns "name","rle"')
    parser.add_argument('-o', '--outfile',
                        type=Opener('w'),
                        default=sys.stdout,
                        help='Name of output file')
Example No. 16
def action(args):
    fieldnames = args.get or ['id', 'description', 'seq']
    # make into [[columnname, newname] ...]; a bare name maps to itself
    fieldnames = [f.split(':') for f in fieldnames]
    fieldnames = [f * (2 if len(f) == 1 else 1) for f in fieldnames]
    out = csv.DictWriter(args.out,
                         fieldnames=map(operator.itemgetter(1), fieldnames),
                         extrasaction='ignore')
    out.writeheader()
    for f in fastalite(args.fasta):
        f = f._asdict()
        out.writerow({v: f[k] for k, v in fieldnames})
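
The fieldname handling above accepts either a bare column name or a columnname:newname pair; a quick worked example of the two list comprehensions:

fieldnames = ['id', 'seq:sequence']
fieldnames = [f.split(':') for f in fieldnames]  # [['id'], ['seq', 'sequence']]
fieldnames = [f * (2 if len(f) == 1 else 1) for f in fieldnames]
assert fieldnames == [['id', 'id'], ['seq', 'sequence']]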
Example No. 17
def build_parser(parser):
    parser.add_argument('fasta',
            type = lambda f: fastalite(opener(f)),
            help = 'input file containing raw reads')
    parser.add_argument('--sample-id',
            help = 'sample id to pull reads for')
    parser.add_argument('--map-file',
            type = Csv2Dict(value = 'sample_id', fieldnames=['sequence_id','sample_id']),
            help = 'csv(.bz2) file containing sequence_id,sample_id in the rows.')
    parser.add_argument('-o', '--out',
            type = Opener('w'),
            default = sys.stdout,
            help = 'fasta output file')
Example No. 18
def build_parser(parser):
    parser.add_argument('seqs',
                        type=lambda f: fastalite(Opener()(f), readfile=False),
                        help='Input fasta file')
    parser.add_argument(
        'rle',
        type=Opener(),
        help='csv file (may be bzip encoded) containing columns "name","rle"')
    parser.add_argument('-o',
                        '--outfile',
                        type=Opener('w'),
                        default=sys.stdout,
                        help='Name of output file')
Example No. 19
    def test01(self):
        with open(self.data('five.fasta')) as f, \
                open(self.data('five.fasta')) as r:
            seqs = sequtils.fastalite(f)
            raw = r.read()
            fasta = ''
            for seq in seqs:
                fasta += '>{}\n{}\n'.format(seq.description, seq.seq)
                log.debug('{}'.format(seq))

        self.assertEqual(
            ''.join(raw).replace('\n', ''), fasta.replace('\n', ''))
Example No. 20
    def test01(self):
        fa = path.join(datadir, 'F1_3', 'trimmed.fasta')
        uc = path.join(datadir, 'F1_3', 'trimmed.uc')
        outdir = self.mkoutdir()
        fa_out = path.join(outdir, 'denoised.fasta')
        limit = 100
        self.main([fa, '--clusters', uc, '--outfile',
                   fa_out, '--limit', limit])

        # cluster mass equals number of input sequences
        with open(fa_out) as f:
            cluster_mass = sum(int(line.split('_')[-1])
                               for line in f if line.startswith('>'))
            self.assertEqual(limit, cluster_mass)

        # regression test
        reference = path.join(datadir, 'F1_3', 'test01_denoised.fasta')
        with open(fa_out) as out, open(reference) as ref:
            outseqs = list(fastalite(out))
            refseqs = list(fastalite(ref))
            self.assertEqual(len(outseqs), len(refseqs))
            self.assertEqual(set(s.seq for s in outseqs),
                             set(s.seq for s in refseqs))
Example No. 21
    def test01(self):
        fa = path.join(datadir, 'F1_3', 'trimmed.fasta')
        uc = path.join(datadir, 'F1_3', 'trimmed.uc')
        outdir = self.mkoutdir()
        fa_out = path.join(outdir, 'denoised.fasta')
        limit = 100
        self.main(
            [fa, '--clusters', uc, '--outfile', fa_out, '--limit', limit])

        # cluster mass equals number of input sequences
        with open(fa_out) as f:
            cluster_mass = sum(
                int(line.split('_')[-1]) for line in f if line.startswith('>'))
            self.assertEqual(limit, cluster_mass)

        # regression test
        reference = path.join(datadir, 'F1_3', 'test01_denoised.fasta')
        with open(fa_out) as out, open(reference) as ref:
            outseqs = list(fastalite(out))
            refseqs = list(fastalite(ref))
            self.assertEqual(len(outseqs), len(refseqs))
            self.assertEqual(set(s.seq for s in outseqs),
                             set(s.seq for s in refseqs))
Example No. 22
def build_parser(parser):
    parser.add_argument('fasta',
                        type=lambda f: fastalite(opener(f)),
                        help='input file containing raw reads')
    parser.add_argument('--sample-id', help='sample id to pull reads for')
    parser.add_argument(
        '--map-file',
        type=Csv2Dict(value='sample_id',
                      fieldnames=['sequence_id', 'sample_id']),
        help='csv(.bz2) file containing sequence_id,sample_id in the rows.')
    parser.add_argument('-o',
                        '--out',
                        type=Opener('w'),
                        default=sys.stdout,
                        help='fasta output file')
Example No. 23
def action(args):
    if args.seqname:
        seqname = args.seqname
    else:
        seqname = 'consensus' if args.infile is sys.stdin \
                  else splitext(basename(args.infile.name))[0]

    seqs = list(fastalite(args.infile, 'fasta'))

    if args.rlefile:
        rledict = json.load(args.rlefile)
        rlelist = [rledict[s.id] for s in seqs]
        cons = consensus(seqs, rlelist, degap=not args.gaps)
    else:
        cons = consensus(seqs, degap=not args.gaps)

    args.outfile.write('>{}\n{}\n'.format(seqname, cons))
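
consensus() is imported from elsewhere in the package. For orientation, a minimal majority-rule sketch consistent with the calls above; note this ignores the optional run-length encodings that the real function accepts:

from collections import Counter

def consensus_sketch(seqs, degap=True):
    # majority vote per column of an equal-length alignment
    cols = zip(*(s.seq for s in seqs))
    cons = ''.join(Counter(col).most_common(1)[0][0] for col in cols)
    return cons.replace('-', '') if degap else cons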
Example No. 24
    def test01(self):
        out = self.mkoutdir()
        trimmed = path.join(out, 'trimmed.fasta')

        args = [
            self.data('rle_100.fasta'),
            '--left-aligns', self.data('rle_100_left_ssearch.csv.bz2'),
            '--right-aligns', self.data('rle_100_right_ssearch.csv.bz2'),
            '--fasta-out', trimmed
        ]

        self.main(args)

        self.assertTrue(path.exists(trimmed))

        with open(trimmed) as f:
            self.assertEqual(len(list(fastalite(f))), 99)
Example No. 25
def action(args):
    fasta = fastalite(args.fasta)

    spec_map = DictReader(args.specimen_map, fieldnames = ['readname', 'specimen'])
    spec_map = {s['readname']:s['specimen'] for s in spec_map}

    def by_specimen(f):
        return spec_map[f.id]

    groups = sorted(fasta, key = by_specimen)
    groups = groupby(groups, key = by_specimen)

    for spec, fasta in groups:
        fasta = ('>{}\n{}'.format(f.description, f.seq) for f in fasta)
        fasta = '\n'.join(fasta)

        filename = path.join(args.outdir, '{}.fasta.bz2'.format(spec))

        with opener(filename, 'w') as out:
            out.write(fasta)
Example No. 26
def action(args):
    fasta = fastalite(args.fasta)

    spec_map = DictReader(args.specimen_map,
                          fieldnames=['readname', 'specimen'])
    spec_map = {s['readname']: s['specimen'] for s in spec_map}

    def by_specimen(f):
        return spec_map[f.id]

    groups = sorted(fasta, key=by_specimen)
    groups = groupby(groups, key=by_specimen)

    for spec, fasta in groups:
        fasta = ('>{}\n{}'.format(f.description, f.seq) for f in fasta)
        fasta = '\n'.join(fasta)

        filename = path.join(args.outdir, '{}.fasta.bz2'.format(spec))

        with opener(filename, 'w') as out:
            out.write(fasta)
Example No. 27
def build_parser(parser):
    parser.add_argument('fasta',
                        type=lambda f: fastalite(Opener()(f)),
                        help='input fasta file')
    parser.add_argument('-l', '--left-aligns', type=Opener(),
                        help='left primer ssearch36 alignment results')
    parser.add_argument('-r', '--right-aligns', type=Opener(),
                        help='right primer ssearch36 alignment results')
    parser.add_argument('--left-range', metavar='START,STOP',
                        help='Range of acceptable left primer start positions')
    parser.add_argument('--left-zscore', metavar='VALUE', type=float,
                        help='Min acceptable left primer z-score')
    parser.add_argument('--right-range', metavar='START,STOP',
                        help=('Range of acceptable right '
                              'primer start positions'))
    parser.add_argument('--right-zscore', metavar='VALUE', type=float,
                        help='Min acceptable right primer z-score')
    parser.add_argument('--left-expr',
                        help=('python expression defining '
                              'criteria for keeping left primer'))
    parser.add_argument('--right-expr',
                        help=('python expression defining criteria '
                              'for keeping right primer'))
    parser.add_argument('-o', '--fasta-out',
                        type=Opener('w'),
                        default=sys.stdout,
                        help='trimmed fasta output file')
    parser.add_argument('--rle',
                        type=Csv2Dict(
                            'name', 'rle', fieldnames=['name', 'rle']),
                        help='rle input file (required if --rle-out)')
    parser.add_argument('--rle-out',
                        type=lambda f: DictWriter(
                            Opener('w')(f), fieldnames=['name', 'rle']),
                        help='trimmed rle output file')
    parser.add_argument('-i', '--include-primer',
                        action='store_true', default=False,
                        help='Include primer in trimmed sequence')
    parser.add_argument('--keep-all-seqs', action='store_true',
                        help='keep seqs that fall outside the trimming thresholds')
Example No. 28
def build_parser(parser):
    parser.add_argument('raw_reads',
                        type = lambda f: fastalite(Opener()(f)),
                        help = """input fasta file containing original
                        clustered reads (default stdin).""")
    parser.add_argument('readmap',
                        type = Opener('r'),
                        help = """output of `bioy denoise --readmap`
                        (csv file with columns readname,clustername)""")
    parser.add_argument('-r', '--rlefile',
                        type = Csv2Dict('name', 'rle', fieldnames=['name', 'rle']),
                        help="""An optional file containing run
                        length encoding for infile (.csv.bz2)""")
    parser.add_argument('-d', '--outdir', help='output directory', default='.')
    parser.add_argument('--pattern',
                        help = """A regular expression matching cluster names""")
    parser.add_argument('-N', '--sample', type=int, default=100,
                        metavar='N',
                        help='include no more than N reads [%(default)s]')
    parser.add_argument('--name-suffix',
                        help='string to insert into name before .fasta',
                        default='aln')
    parser.add_argument('--no-align',
                        action='store_false', dest='align', default=True)
Example No. 29
    def test04(self):
        """
        test no clusters passing min_size

        1) test file is actually created
        2) test there are no seqs in file
        """

        fa = path.join(datadir, 'F1_3', 'trimmed.fasta')
        uc = path.join(datadir, 'F1_3', 'trimmed.uc')
        outdir = self.mkoutdir()
        fa_out = path.join(outdir, 'denoised_empty.fasta')
        limit = 100
        min_size = sys.maxint
        self.main([fa, '--clusters', uc, '--outfile', fa_out, '--limit',
                   limit, '--min-clust-size', min_size])

        # 1)
        self.assertTrue(os.path.isfile(fa_out))

        with open(fa_out) as out:
            outseqs = list(fastalite(out))
            # 2)
            self.assertEqual(len(outseqs), 0)
Example No. 30
def action(args):

    if args.clusters:
        # assumes a single-extension filename like 'clusters.uc'
        _, file_ext = os.path.basename(args.clusters.name).split('.')

        if file_ext == 'uc':
            clusters = parse_uc(args.clusters)[0]
        else:
            clusters = {seq: tag for seq, tag in csv.reader(args.clusters)}

        by_clusters = lambda s: clusters.get(s.id, s.id)
    else:
        by_clusters = lambda _: 'all one cluster'

    seqs = fastalite(args.fastafile)
    seqs = islice(seqs, args.limit)
    seqs = sorted(seqs, key=by_clusters)
    grouped_seqs = groupby(seqs, key=by_clusters)

    chunks = ichunker((group for _, group in grouped_seqs), args.rlefile,
                      args.min_clust_size, args.max_clust_size)

    # calculate consensus for each cluster, then accumulate names of
    # each set of identical consensus sequences in `exemplars` (keys
    # are the consensus sequences themselves).
    exemplars = defaultdict(list)
    # note: the worker pool is created but unused; the builtin map()
    # runs align_and_consensus serially despite args.threads
    pool = Pool(processes=args.threads)
    for cluster, cons in map(align_and_consensus, enumerate(chunks, start=1)):
        exemplars[cons].extend([c.id for c in cluster])

    # calculate ratios of reads for the smallest group to each of the
    # other groups. outseqs is a list of (weight, consensus, list_of_names)
    if args.groups and exemplars:
        groups = dict(csv.reader(args.groups))
        group_counts = Counter(
            groups[name] for name in chain.from_iterable(exemplars.values()))
        most_common = group_counts.most_common()
        _, least_common = most_common[-1]
        weights = {k: float(least_common) / v for k, v in most_common}
        outseqs = [(sum(weights[groups[n]] for n in names), cons, names)
                   for cons, names in exemplars.items()]
    else:
        outseqs = [(len(names), cons, names)
                   for cons, names in exemplars.items()]

    # write each consensus sequence in descending order of weight
    outseqs.sort(reverse=True, key=itemgetter(0))
    for i, (weight, cons, names) in enumerate(outseqs, start=1):

        name_elements = [
            args.name_prefix, 'cons{:04}'.format(i), '{:.0f}'.format(weight),
            args.name_suffix
        ]

        consname = args.name_delimiter.join([e for e in name_elements if e])

        log.debug('writing {}'.format(consname))

        args.outfile.write('>{}\n{}\n'.format(consname, cons))

        if args.readmap:
            args.readmap.writerows((name, consname) for name in names)

        if args.clustermap and args.specimen:
            args.clustermap.writerow((consname, args.specimen))

        if args.weights:
            args.weights.writerow((consname, weight))
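
The group-weighting arithmetic above scales every group down to the size of the smallest one; a worked example with hypothetical counts:

from collections import Counter

group_counts = Counter({'groupA': 300, 'groupB': 100})
most_common = group_counts.most_common()  # [('groupA', 300), ('groupB', 100)]
_, least_common = most_common[-1]         # 100, the smallest group's count
weights = {k: float(least_common) / v for k, v in most_common}
assert weights == {'groupA': 100 / 300.0, 'groupB': 1.0}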
Example No. 31
def action(args):

    if args.remote and not args.remote_database:
        log.error("bioy blast: error: please specify a remote database")
        return
    elif not args.remote and not args.database:
        log.error("bioy blast: error: please specify path to local database")
        return

    command = ['blastn']
    command += ['-query', args.fasta]
    if args.remote:
        command += ['-remote']
        command += ['-db', args.remote_database]
    else:
        command += ['-db', args.database]
        command += ['-num_threads', str(args.threads)]
    command += ['-perc_identity', args.id]
    command += ['-outfmt', '6 ' + args.outfmt.replace(',', ' ')]
    command += ['-strand', args.strand]

    if args.max:
        command += ['-max_target_seqs', args.max]

    log.info(' '.join(command))

    if args.dry_run:
        sys.exit(0)

    pipe = Popen(command, stdout=PIPE, stderr=PIPE)

    results, errors = pipe.communicate()

    if errors:
        log.error(errors)

    # split tab lines
    lines = (r.strip().split('\t') for r in StringIO(results))

    header = args.outfmt.split(',')
    # match with fieldnames
    lines = (zip(header, l) for l in lines)

    # make into dict
    lines = [dict(l) for l in lines]

    # Replace blast's local alignment query coverage with global coverage calculation
    if 'qcovs' in args.outfmt.split(',') or isinstance(args.coverage, float):
        for l in lines:
            l['qcovs'] = (float(l['qend']) - float(l['qstart']) + 1) \
                    / float(l['qlen']) * 100
            l['qcovs'] = '{0:.2f}'.format(l['qcovs'])
    if isinstance(args.coverage, float):
        lines = [l for l in lines if float(l['qcovs']) >= args.coverage]

    if args.nohits:
        # to get nohits first we need to know about the hits
        qids = groupby(lines, key=itemgetter('qseqid'))
        qids = set(q for q, _ in qids)

        # now we can build a list of nohits
        nohits = []
        for q in fastalite(opener(args.fasta)):
            if q.id not in qids:
                nohits.append(q)

        # convert nohits into DictWriter format
        nohits = (dict(qseqid=q.id) for q in nohits)

        # append to lines
        lines = chain(lines, nohits)

    out = DictWriter(args.out, fieldnames=header, extrasaction='ignore')

    if args.header:
        out.writeheader()

    out.writerows(lines)
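
The qcovs recalculation above derives global query coverage from the alignment span rather than relying on blast's local value; a worked example:

# an alignment covering positions 10..99 of a 100 bp query
qstart, qend, qlen = 10.0, 99.0, 100.0
qcovs = (qend - qstart + 1) / qlen * 100
assert '{0:.2f}'.format(qcovs) == '90.00'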
Example No. 32
    def setUp(self):
        self.outdir = self.mkoutdir()
        with open(self.data('two.fasta')) as f:
            self.seqs = list(sequtils.fastalite(f))
Example No. 33
def action(args):

    if args.clusters:
        # assumes a single-extension filename like 'clusters.uc'
        _, file_ext = os.path.basename(args.clusters.name).split('.')

        if file_ext == 'uc':
            clusters = parse_uc(args.clusters)[0]
        else:
            clusters = {seq: tag for seq, tag in csv.reader(args.clusters)}

        by_clusters = lambda s: clusters.get(s.id, s.id)
    else:
        by_clusters = lambda _: 'all one cluster'

    seqs = fastalite(args.fastafile)
    seqs = islice(seqs, args.limit)
    seqs = sorted(seqs, key = by_clusters)
    grouped_seqs = groupby(seqs, key = by_clusters)

    chunks = ichunker((group for _, group in grouped_seqs),
                      args.rlefile, args.min_clust_size, args.max_clust_size)

    # calculate consensus for each cluster, then accumulate names of
    # each set of identical consensus sequences in `exemplars` (keys
    # are the consensus sequences themselves).
    exemplars = defaultdict(list)
    # note: the worker pool is created but unused; the builtin map()
    # runs align_and_consensus serially despite args.threads
    pool = Pool(processes = args.threads)
    for cluster, cons in map(align_and_consensus, enumerate(chunks, start = 1)):
        exemplars[cons].extend([c.id for c in cluster])

    # calculate ratios of reads for the smallest group to each of the
    # other groups. outseqs is a list of (weight, consensus, list_of_names)
    if args.groups and exemplars:
        groups = dict(csv.reader(args.groups))
        group_counts = Counter(groups[name] for name in chain.from_iterable(exemplars.values()))
        most_common = group_counts.most_common()
        _, least_common = most_common[-1]
        weights = {k: float(least_common)/v for k, v in most_common}
        outseqs = [(sum(weights[groups[n]] for n in names), cons, names)
                   for cons, names in exemplars.items()]
    else:
        outseqs = [(len(names), cons, names) for cons, names in exemplars.items()]

    # write each consensus sequence in descending order of weight
    outseqs.sort(reverse=True, key=itemgetter(0))
    for i, (weight, cons, names) in enumerate(outseqs, start=1):

        name_elements = [args.name_prefix,
                         'cons{:04}'.format(i),
                         '{:.0f}'.format(weight),
                         args.name_suffix]

        consname = args.name_delimiter.join([e for e in name_elements if e])

        log.debug('writing {}'.format(consname))

        args.outfile.write('>{}\n{}\n'.format(consname, cons))

        if args.readmap:
            args.readmap.writerows((name, consname) for name in names)

        if args.clustermap and args.specimen:
            args.clustermap.writerow((consname, args.specimen))

        if args.weights:
            args.weights.writerow((consname, weight))
Example No. 34
def action(args):

    if args.remote and not args.remote_database:
        log.error("bioy blast: error: please specify a remote database")
        return
    elif not args.remote and not args.database:
        log.error("bioy blast: error: please specify path to local database")
        return

    command = ['blastn']
    command += ['-query', args.fasta]
    if args.remote:
        command += ['-remote']
        command += ['-db', args.remote_database]
    else:
        command += ['-db', args.database]
        command += ['-num_threads', str(args.threads)]
    command += ['-perc_identity', args.id]
    command += ['-outfmt', '6 ' + args.outfmt.replace(',', ' ')]
    command += ['-strand', args.strand]

    if args.max:
        command += ['-max_target_seqs', args.max]

    log.info(' '.join(command))

    if args.dry_run:
        sys.exit(0)

    pipe = Popen(command, stdout = PIPE, stderr = PIPE)

    results, errors = pipe.communicate()

    if errors:
        log.error(errors)

    # split tab lines
    lines = (r.strip().split('\t') for r in StringIO(results))

    header = args.outfmt.split(',')
    # match with fieldnames
    lines = (zip(header, l) for l in lines)

    # make into dict
    lines = [dict(l) for l in lines]

    # Replace blast's local alignment query coverage with global coverage calculation
    if 'qcovs' in args.outfmt.split(',') or isinstance(args.coverage, float):
        for l in lines:
            l['qcovs'] = (float(l['qend']) - float(l['qstart']) + 1) \
                    / float(l['qlen']) * 100
            l['qcovs'] = '{0:.2f}'.format(l['qcovs'])
    if isinstance(args.coverage, float):
        lines = [l for l in lines if float(l['qcovs']) >= args.coverage]

    if args.nohits:
        # to get nohits first we need to know about the hits
        qids = groupby(lines, key = itemgetter('qseqid'))
        qids = set(q for q,_ in qids)

        # now we can build a list of nohits
        nohits = []
        for q in fastalite(opener(args.fasta)):
            if q.id not in qids:
                nohits.append(q)

        # convert nohits into DictWriter format
        nohits = (dict(qseqid = q.id) for q in nohits)

        # append to lines
        lines = chain(lines, nohits)

    out = DictWriter(args.out,
                     fieldnames = header,
                     extrasaction = 'ignore')

    if args.header:
        out.writeheader()

    out.writerows(lines)
Example No. 35
    def test02(self):
        with open(self.data('five.fasta')) as f:
            seqs = sequtils.fastalite(f)
            for seq in seqs:
                pass
Example No. 36
def action(args):
    seqs = fastalite(args.sequences)

    # sort seqs by group information
    if args.split_info:
        primary, secondary = args.primary_group, args.secondary_group
        info_reader = csv.DictReader(args.split_info)
        info = {r['seqname']: r for r in info_reader}

        # group tag sequences if info_file exists
        def group_tag(seq):
            i = info[seq.id]
            group = (i[primary] or i[secondary]) if secondary else i[primary]
            return dict(group = group, seq = seq)

        seqs = (group_tag(s) for s in seqs)

        # group the sequences by tags
        seqs = sorted(seqs, key = itemgetter('group'))
        seqs = groupby(seqs, key = itemgetter('group'))

        # just need the seqs
        seqs = ((pair['seq'] for pair in group) for _,group in seqs)
    else:
        seqs = (seqs,)

    # set up output files
    if args.out_info and args.split_info:
        info_out = csv.DictWriter(args.out_info, fieldnames=info_reader.fieldnames)
        info_out.writeheader()

    if args.out_map:
        map_out = csv.DictWriter(args.out_map, fieldnames = ['kept', 'orig'])

    if args.out_weights:
        weights_out = csv.DictWriter(args.out_weights, fieldnames = ['kept', 'weight'])

    # dedup seqs by groups
    for group in seqs:
        weights = Counter()
        deduped = {}

        for orig in group:
            # checksums are cheaper to compare than full sequences
            clean = orig.seq.replace('\n', '').upper()
            checksum = hashlib.sha1(clean).hexdigest()

            if checksum in deduped:
                kept = deduped[checksum]
            else:
                kept = deduped[checksum] = orig
                args.out.write('>{}\n{}\n'.format(kept.description, kept.seq))

                if args.out_info and args.split_info:
                    info_out.writerow(info[kept.id])

            if args.out_weights:
                weights[kept.id] += 1

            if args.out_map:
                map_out.writerow(dict(kept=kept.id, orig=orig.id))

        for kept_id,count in weights.items():
            weights_out.writerow(dict(kept=kept_id, weight=count))
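
The checksum step above hashes each normalized sequence. As written it assumes Python 2 (consistent with sys.maxint elsewhere on this page), where hashlib.sha1 accepts a str; under Python 3 the sequence must be encoded to bytes first:

import hashlib

clean = 'acgt\nACGT'.replace('\n', '').upper()  # 'ACGTACGT'
checksum = hashlib.sha1(clean.encode('ascii')).hexdigest()  # bytes for Python 3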
Example No. 37
def action(args):
    seqs = fastalite(args.sequences)

    # sort seqs by group information
    if args.split_info:
        primary, secondary = args.primary_group, args.secondary_group
        info_reader = csv.DictReader(args.split_info)
        info = {r['seqname']: r for r in info_reader}

        # group tag sequences if info_file exists
        def group_tag(seq):
            i = info[seq.id]
            group = (i[primary] or i[secondary]) if secondary else i[primary]
            return dict(group=group, seq=seq)

        seqs = (group_tag(s) for s in seqs)

        # group the sequences by tags
        seqs = sorted(seqs, key=itemgetter('group'))
        seqs = groupby(seqs, key=itemgetter('group'))

        # just need the seqs
        seqs = ((pair['seq'] for pair in group) for _, group in seqs)
    else:
        seqs = (seqs, )

    # set up output files
    if args.out_info and args.split_info:
        info_out = csv.DictWriter(args.out_info,
                                  fieldnames=info_reader.fieldnames)
        info_out.writeheader()

    if args.out_map:
        map_out = csv.DictWriter(args.out_map, fieldnames=['kept', 'orig'])

    if args.out_weights:
        weights_out = csv.DictWriter(args.out_weights,
                                     fieldnames=['kept', 'weight'])

    # dedup seqs by groups
    for group in seqs:
        weights = Counter()
        deduped = {}

        for orig in group:
            # checksums are cheaper to compare than full sequences
            clean = orig.seq.replace('\n', '').upper()
            checksum = hashlib.sha1(clean).hexdigest()

            if checksum in deduped:
                kept = deduped[checksum]
            else:
                kept = deduped[checksum] = orig
                args.out.write('>{}\n{}\n'.format(kept.description, kept.seq))

                if args.out_info and args.split_info:
                    info_out.writerow(info[kept.id])

            if args.out_weights:
                weights[kept.id] += 1

            if args.out_map:
                map_out.writerow(dict(kept=kept.id, orig=orig.id))

        for kept_id, count in weights.items():
            weights_out.writerow(dict(kept=kept_id, weight=count))