Example #1
0
def action(args):
    """Dereplicate sequences by clustering them within each group.

    Reads sequence metadata from args.seq_info, optionally filters and
    joins taxonomy, clusters each group of sequences, then writes the
    cluster map, the filtered seq_info, and the seed sequences.
    """
    # Drop any stale .ssi index so esl_sfetch builds a fresh one.
    try:
        os.remove(args.seqs + '.ssi')
    except OSError:
        pass

    seq_info = pd.read_csv(
        args.seq_info, dtype={'gi': str, 'tax_id': str, 'species': str})
    info_cols = seq_info.columns

    # Optionally restrict to an explicit whitelist of group values.
    if args.include:
        wanted = args.include.read().split()
        seq_info = seq_info.loc[seq_info[args.group_on].isin(wanted)]

    # join with taxonomy if provided
    if args.taxonomy:
        taxonomy = pd.read_csv(args.taxonomy, dtype=str,).set_index('tax_id')
        seq_info = seq_info.join(taxonomy, on='tax_id')

    frames = []
    for key, grp in seq_info.groupby(args.group_on, sort=False):
        if grp.shape[0] == 1:
            # A group with a single sequence needs no clustering run.
            clusters = mocked_cluster_output(grp['seqname'])
        else:
            # TODO: is longer necessarily better?
            grp = grp.sort_values(
                by=['is_type', 'ambig_count', 'length'],
                ascending=[False, True, False])
            clusters = cluster(args.seqs, grp['seqname'], identity=args.id,
                               prefix='{}-'.format(key), threads=args.threads)

        clusters['group'] = key
        frames.append(clusters)

    all_clusters = pd.concat(frames)

    if args.derep_map_out:
        all_clusters.columns = ['type', 'seqname', 'seed', 'group']
        all_clusters.to_csv(args.derep_map_out, header=True, index=False)

    if args.seq_info_out:
        # Keep only rows for the cluster seed sequences.
        seq_info = seq_info[seq_info['seqname'].isin(all_clusters['seed'])]
        seq_info.to_csv(args.seq_info_out, columns=info_cols, index=False)

    wrap.esl_sfetch(
        args.seqs, all_clusters['seed'].unique(), args.seqs_out)

    # finally - clean up .ssi file
    os.remove(args.seqs + '.ssi')
Example #2
0
def cluster(seqfile, seqnames, identity=1.0, prefix='cluster-', threads=None):
    """Cluster the named sequences from seqfile at the given identity.

    Fetches the sequences into a temp fasta file, runs uclust, and
    returns a DataFrame with columns ['type', 'query_label',
    'target_label'] (cluster-summary 'C' records removed).
    """
    # Callers build the prefix from group keys (e.g. taxon names) which
    # may contain '/'; a slash in a tempfile prefix is treated as a
    # directory separator and breaks temp-file creation.
    prefix = prefix.replace('/', '\\')
    with util.ntf(prefix=prefix, suffix='.fasta') as fa, \
            util.ntf(prefix=prefix, suffix='.uc') as uc:
        wrap.esl_sfetch(seqfile, seqnames, fa)
        fa.flush()  # make sure fetched sequences are on disk before uclust reads
        uclust.cluster(fa.name,
                       uc.name,
                       pct_id=identity,
                       pre_sorted=False,
                       quiet=True,
                       threads=threads)
        df = uclust.parse_uclust_as_df(uc)
        # Drop cluster-summary records; keep seed/hit rows only.
        df = df[df.type != 'C']
        df = df[['type', 'query_label', 'target_label']]

        return df
Example #3
0
def cluster(seqfile, seqnames, identity=1.0, prefix='cluster-', threads=None):
    """Cluster the named sequences from seqfile at the given identity.

    Returns a DataFrame with ['type', 'query_label', 'target_label']
    columns, cluster-summary ('C') records excluded.
    """
    prefix = prefix.replace('/', '\\')  # / confuses the filesystem
    with util.ntf(prefix=prefix, suffix='.fasta') as fasta_tmp, \
            util.ntf(prefix=prefix, suffix='.uc') as uc_tmp:
        wrap.esl_sfetch(seqfile, seqnames, fasta_tmp)
        fasta_tmp.flush()
        uclust.cluster(
            fasta_tmp.name,
            uc_tmp.name,
            pct_id=identity,
            pre_sorted=False,
            quiet=True,
            threads=threads)
        records = uclust.parse_uclust_as_df(uc_tmp)
        records = records[records.type != 'C']
        return records[['type', 'query_label', 'target_label']]
Example #4
0
def action(args):
    """Dereplicate sequences by clustering them within each group.

    Reads sequence metadata from args.seq_info, optionally filters and
    joins taxonomy, clusters each group of sequences, then writes the
    cluster map, the filtered seq_info, and the seed sequences.
    """
    # remove .ssi index for sequence file if it exists
    try:
        os.remove(args.seqs + '.ssi')
    except OSError:
        pass

    dtype = {'gi': str, 'tax_id': str, 'species': str}
    seq_info = pd.read_csv(args.seq_info, dtype=dtype)
    info_cols = seq_info.columns

    if args.include:
        include = args.include.read().split()
        seq_info = seq_info.loc[seq_info[args.group_on].isin(include)]

    # join with taxonomy if provided
    if args.taxonomy:
        tax = pd.read_csv(args.taxonomy, dtype=str,).set_index('tax_id')
        seq_info = seq_info.join(tax, on='tax_id')

    grouped = seq_info.groupby(args.group_on, sort=False)

    frames = []
    for key, grp in grouped:
        # don't cluster groups represented by only one seq
        if grp.shape[0] == 1:
            clusters = mocked_cluster_output(grp['seqname'])
        else:
            # TODO: is longer necessarily better?
            # Sort by preferred-seed criteria, skipping columns that are
            # absent from this input.
            by = []
            ascending = []
            for col, order in [('is_type', False),
                               ('ambig_count', True),
                               ('length', False)]:
                if col in grp.columns:
                    by.append(col)
                    ascending.append(order)
            if by:
                grp = grp.sort_values(by=by, ascending=ascending)
            # BUG FIX: clustering must run regardless of whether any sort
            # columns were present; previously the cluster() call was
            # inside `if by:`, leaving `clusters` unassigned (NameError
            # below) when none of the columns existed.
            clusters = cluster(
                args.seqs, grp['seqname'], identity=args.id,
                prefix='{}-'.format(key), threads=args.threads)

        clusters['group'] = key
        frames.append(clusters)

    all_clusters = pd.concat(frames)

    if args.derep_map_out:
        all_clusters.columns = ['type', 'seqname', 'seed', 'group']
        all_clusters.to_csv(args.derep_map_out, header=True, index=False)

    if args.seq_info_out:
        # Keep only rows for the cluster seed sequences.
        seq_info = seq_info[seq_info['seqname'].isin(all_clusters['seed'])]
        seq_info.to_csv(args.seq_info_out, columns=info_cols, index=False)

    wrap.esl_sfetch(
        args.seqs, all_clusters['seed'].unique(), args.seqs_out)

    # finally - clean up .ssi file
    os.remove(args.seqs + '.ssi')