Esempio n. 1
0
def build_parser(parser):
    """Register this subcommand's command-line arguments on ``parser``.

    Arguments are added in this order: the two positionals (``seqs`` and
    ``seq_info``), the optional taxonomy input, the three output
    destinations, and finally the grouping / clustering options.
    """
    add = parser.add_argument

    # --- inputs ---------------------------------------------------------
    add('seqs', help="""Named sequences""")
    add('seq_info', help="""Sequence info file""")
    taxonomy_help = """Taxonomy as taxtable; optional
                        if a grouping term is available in seq_info"""
    add('-t', '--taxonomy', help=taxonomy_help)

    # --- outputs --------------------------------------------------------
    add('--seq-info-out', help='subset of original seq_info')
    derep_help = (
        'mapping of input sequences to dereplicated representatives. '
        '`group` column corresponds to the field identified by '
        '--group-on.')
    add('--derep-map-out', help=derep_help)
    # Sequences go to stdout unless a path is given on the command line.
    add('--seqs-out', type=util.file_opener('w'), default=sys.stdout)

    # --- grouping and clustering ----------------------------------------
    add('-g', '--group-on',
        default='species',
        help='Field in seq_info on which to group sequences')
    identity_help = """Clustering identity between 0 and 1
                        [default: %(default).3f]"""
    add('--id', type=float, default=1.0, help=identity_help)
    add('-i', '--include',
        type=util.file_opener('r'),
        help='Optional file containing list of group labels to include')
    threads_help = (
        'Number of threads to use for clustering each group '
        '[default is one thread per available CPU core]')
    # NOTE(review): no ``type`` is given, so the parsed value is a string.
    add('--threads', help=threads_help)
Esempio n. 2
0
def build_parser(p):
    """Add input files, output file names and filtering options to ``p``.

    The two positionals are converted with ``file_opener('r')``; the four
    output options are plain strings that default to fixed file names.
    """
    # Positional inputs.
    positionals = (
        ('fasta_file', """sequence file"""),
        ('seqinfo_file', """Sequence metadata"""),
    )
    for dest, description in positionals:
        p.add_argument(dest, help=description, type=file_opener('r'))

    # Output locations; the help text only echoes the default file name.
    output_defaults = (
        ('--named-seqs', 'named.seqs.fasta'),
        ('--named-info', 'named.seq_info.csv'),
        ('--unnamed-seqs', 'unnamed.seqs.fasta'),
        ('--unnamed-info', 'unnamed.seq_info.csv'),
    )
    for flag, filename in output_defaults:
        p.add_argument(flag, default=filename, help='[default %(default)s]')

    # Sequence quality filters live in their own help section.
    filters = p.add_argument_group('Filtering options')
    ambig_help = """Maximum proportion of characters in sequence which may be
                             ambiguous [default: %(default).2f]"""
    filters.add_argument(
        '-a', '--prop-ambig-cutoff', type=float, default=0.01,
        help=ambig_help)
    length_help = """Minimum sequence
            length [default: %(default)d]"""
    filters.add_argument(
        '-l', '--min-length', type=int, default=1200, help=length_help)
Esempio n. 3
0
def build_parser(p):
    """Define the four positionals and the ``--no-header`` switch on ``p``.

    Positionals, in order: ``infile``, ``database``, ``fasta_out`` and
    ``output`` (shown as ``tax_out`` in usage text).
    """
    fasta_out_help = """Path to write sequences in FASTA format.
                           Specify '.gz' or '.bz2' extension to compress."""
    tax_out_help = """Output path to write taxonomic
                           information in CSV format"""

    p.add_argument(
        'infile',
        help="""Input file, gzipped""",
        type=util.file_opener('r'))
    p.add_argument(
        'database',
        help="""Path to taxonomy database""")
    p.add_argument(
        'fasta_out',
        help=fasta_out_help,
        type=util.file_opener('w'))
    p.add_argument(
        'output',
        help=tax_out_help,
        metavar='tax_out',
        type=argparse.FileType('w'))
    # Stored as ``args.header``: True unless --no-header is passed.
    p.add_argument(
        '--no-header',
        help="""Don't write a header""",
        dest='header',
        action='store_false',
        default=True)
Esempio n. 4
0
def build_parser(p):
    """Add four positionals and the filtering options to parser ``p``.

    ``fasta_file`` and ``seqinfo_file`` are converted with
    ``file_opener('r')``; ``named_base`` and ``unnamed_base`` are left as
    plain strings.
    """
    add = p.add_argument
    add('fasta_file', type=file_opener('r'), help="""sequence file""")
    add('seqinfo_file', type=file_opener('r'), help="""Sequence metadata""")
    for dest in ('named_base', 'unnamed_base'):
        add(dest)

    # Sequence quality filters are listed under their own help heading.
    filters = p.add_argument_group('Filtering options')
    ambig_help = """Maximum proportion of characters in sequence which may be
            ambiguous [default: %(default).2f]"""
    length_help = """Minimum sequence
            length [default: %(default)d]"""
    filters.add_argument(
        '-a', '--prop-ambig-cutoff',
        type=float,
        default=0.01,
        help=ambig_help)
    filters.add_argument(
        '-l', '--min-length',
        type=int,
        default=1200,
        help=length_help)
Esempio n. 5
0
def build_parser(p):
    """Attach the input, database and output arguments to parser ``p``.

    All arguments except ``--no-header`` are positional and must be given
    in the order they are declared here.
    """
    add = p.add_argument

    # Inputs.
    add('infile', type=util.file_opener('r'), help="""Input file, gzipped""")
    add('database', help="""Path to taxonomy database""")

    # Outputs.
    add('fasta_out', type=util.file_opener('w'),
        help="""Path to write sequences in FASTA format.
                           Specify '.gz' or '.bz2' extension to compress.""")
    add('output', metavar='tax_out', type=argparse.FileType('w'),
        help="""Output path to write taxonomic
                           information in CSV format""")

    # The flag clears ``args.header``, which is True when the flag is absent.
    add('--no-header', dest='header', action='store_false', default=True,
        help="""Don't write a header""")
Esempio n. 6
0
def action(args):
    """Collapse identical sequences and write one representative per group.

    Reads ``args.seq_info`` (CSV indexed by ``seqname``) and the FASTA file
    ``args.sequences``, hashes every upper-cased sequence with SHA-1, and
    groups the seq_info rows by that hash plus any comma-separated columns
    named in ``args.group_by``.  Within each group the last row is kept
    (after sorting by ``args.prefer_columns`` when given) and receives a
    ``weight`` column holding the group size.

    The reduced table is written to ``args.out_info`` and the matching
    sequences are written in FASTA format to ``args.out``.
    """
    dtype = {'gi': str, 'tax_id': str, 'species': str}
    seq_info = pandas.read_csv(args.seq_info, dtype=dtype, index_col='seqname')

    log.info('reading sequences')
    seqhashes = {}
    with util.file_opener()(args.sequences) as sequences_in:
        for record in util.Counter(SeqIO.parse(sequences_in, 'fasta')):
            seq = str(record.seq).replace('\n', '').upper()
            # hashlib only accepts bytes on Python 3; passing the text
            # directly raises TypeError there.
            digest = hashlib.sha1(seq.encode('utf-8')).hexdigest()
            seqhashes[record.name] = digest

    seqhash = pandas.Series(data=seqhashes, name='seqhash')
    seqhash.index.name = 'seqname'
    seq_info = seq_info.join(seqhash)

    group_by = ['seqhash']
    if args.group_by:
        group_by.extend(args.group_by.split(','))

    # Parse the sort columns once instead of once per group.
    prefer_columns = (
        args.prefer_columns.split(',') if args.prefer_columns else None)

    def choose_rep(df):
        """Return the last (preferred) row of ``df`` with its group size."""
        if prefer_columns:
            df = df.sort_values(by=prefer_columns)
        # Copy before assigning: ``tail`` may return a view of the group,
        # and writing to it triggers SettingWithCopyWarning (or is lost
        # under copy-on-write).
        rep = df.tail(1).copy()
        rep['weight'] = len(df)
        return rep

    log.info('choosing seq_info representatives')
    seq_info = seq_info.groupby(
        by=group_by, group_keys=False).apply(choose_rep)
    seq_info = seq_info.drop('seqhash', axis=1)

    log.info('writing seqinfo')
    seq_info.to_csv(args.out_info, quoting=csv.QUOTE_NONNUMERIC)

    log.info('writing dedup file')
    # Second pass over the input: emit only the chosen representatives.
    with util.file_opener()(args.sequences) as sequences_in, \
            util.file_opener('w')(args.out) as sequences_out:
        for record in util.Counter(SeqIO.parse(sequences_in, 'fasta')):
            if record.name in seq_info.index:
                fasta_out = '>{}\n{}\n'.format(record.name, str(record.seq))
                sequences_out.write(fasta_out)
Esempio n. 7
0
def action(args):
    """Deduplicate sequences by content and write the surviving records.

    ``args.seq_info`` is read as a CSV with every column as ``str`` and
    indexed by ``seqname``.  Each record of the FASTA file
    ``args.sequences`` is upper-cased and hashed with SHA-1; seq_info rows
    are then grouped by that hash together with any comma-separated columns
    in ``args.group_by``.  One row per group is kept: the last one, after
    sorting by ``args.prefer_columns`` if provided.  The kept row gets a
    ``weight`` column equal to the number of rows in its group.

    Outputs: the reduced seq_info as CSV to ``args.out_info`` and the
    corresponding sequences as FASTA to ``args.out``.
    """
    seq_info = pandas.read_csv(args.seq_info, dtype=str, index_col='seqname')

    log.info('reading sequences')
    seqhashes = {}
    with util.file_opener()(args.sequences) as sequences_in:
        for record in util.Counter(SeqIO.parse(sequences_in, 'fasta')):
            seq = str(record.seq).replace('\n', '').upper()
            # sha1() needs bytes on Python 3 (a str raises TypeError), so
            # encode the sequence text before hashing.
            seqhashes[record.name] = hashlib.sha1(
                seq.encode('utf-8')).hexdigest()

    seqhash = pandas.Series(data=seqhashes, name='seqhash')
    seqhash.index.name = 'seqname'
    seq_info = seq_info.join(seqhash)

    group_by = ['seqhash']
    if args.group_by:
        group_by.extend(args.group_by.split(','))

    # The preference columns do not change between groups; split them once.
    sort_columns = None
    if args.prefer_columns:
        sort_columns = args.prefer_columns.split(',')

    def choose_rep(df):
        """Pick the representative row of one group and record its size."""
        if sort_columns:
            df = df.sort_values(by=sort_columns)
        # ``tail(1)`` is a slice of the group; take an explicit copy so the
        # new column is set on an independent frame rather than on a
        # possible view (avoids SettingWithCopyWarning).
        rep = df.tail(1).copy()
        rep['weight'] = len(df)
        return rep

    log.info('choosing seq_info representatives')
    seq_info = seq_info.groupby(
        by=group_by, group_keys=False).apply(choose_rep)
    seq_info = seq_info.drop('seqhash', axis=1)

    log.info('writing seqinfo')
    seq_info.to_csv(args.out_info, quoting=csv.QUOTE_NONNUMERIC)

    log.info('writing dedup file')
    # Re-read the input and keep only records that survived deduplication.
    with util.file_opener()(args.sequences) as sequences_in, \
            util.file_opener('w')(args.out) as sequences_out:
        for record in util.Counter(SeqIO.parse(sequences_in, 'fasta')):
            if record.name in seq_info.index:
                fasta_out = '>{}\n{}\n'.format(record.name, str(record.seq))
                sequences_out.write(fasta_out)
Esempio n. 8
0
def build_parser(p):
    """Declare the inputs, per-category outputs and filters on parser ``p``.

    Output options fall into two help groups, one for seq_info CSV files
    and one for FASTA files.  Every output option is converted with its own
    ``util.file_opener('w')`` and has no explicit default.
    """
    # inputs
    p.add_argument('fasta', metavar='FASTA', type=util.file_opener('r'),
                   help="""sequence file""")
    p.add_argument('seqinfo', metavar='CSV', help="""Sequence metadata""")
    p.add_argument('--references', metavar='CSV',
                   help='csv file with columns: version,pubmed_id')

    # outputs: (flag, help text) pairs, registered in the order listed
    info_options = (
        ('--named-info', 'taxid_classified column is True'),
        ('--unnamed-info', 'taxid_classified column is False'),
        ('--type-info', 'rows where is_type column is True'),
        ('--published-info',
         'requires references.csv. Return seq_info with pubmed_ids'),
        ('--references-info',
         'requires references.csv. '
         'Return columns [version, accession, pubmed_id]'),
    )
    info_group = p.add_argument_group("outputs for seq_info's")
    for flag, description in info_options:
        info_group.add_argument(
            flag, metavar='CSV', type=util.file_opener('w'),
            help=description)

    seq_options = (
        ('--named-seqs', 'where taxid_classified column is True'),
        ('--unnamed-seqs', 'where taxid_classified column is False'),
        ('--type-seqs', 'where is_type column is True'),
        ('--published-seqs',
         'requires references.csv. Return sequences with pubmed_ids'),
    )
    seq_group = p.add_argument_group('outputs for sequences')
    for flag, description in seq_options:
        seq_group.add_argument(
            flag, metavar='FASTA', type=util.file_opener('w'),
            help=description)

    # filtering switches (no defaults: unset options parse as None)
    filters = p.add_argument_group('filtering options')
    filters.add_argument(
        '-a', '--prop-ambig-cutoff', type=float,
        help='Maximum proportion of characters in sequence '
             'which may be ambiguous')
    filters.add_argument(
        '-l', '--min-length', type=int, help='Minimum sequence length')