Example #1
0
def action(args):
    """Recruit additional reference sequences for 'lonely' taxa.

    Loads the search taxtable and the chosen/search seq_info files, finds
    nodes at ``args.lonely_rank`` that are lonely (per ``is_lonely``), and
    fills each one concurrently via ``fill_lonely_worker``.  The chosen
    fasta/seq_info plus the recruited sequences are written to
    ``args.output`` / ``args.output_seqinfo``.
    """
    logging.info("Loading taxtable")
    with args.search_taxtable as fp:
        full_taxonomy = taxtable.read(fp)

    logging.info("Loading chosen sequence metadata")
    chosen_taxonomy = copy.deepcopy(full_taxonomy)
    chosen_taxonomy.populate_from_seqinfo(args.chosen_seqinfo)
    chosen_taxonomy.prune_unrepresented()

    logging.info("loading full sequence metadata")
    full_taxonomy.populate_from_seqinfo(args.search_seqinfo)

    # Find lonely nodes at the requested rank
    nodes = [i for i in chosen_taxonomy if i.rank == args.lonely_rank]
    lonely_nodes = [i for i in nodes if is_lonely(i)]
    additional_reps = set()
    futs = []
    with futures.ThreadPoolExecutor(args.threads) as executor:
        for node in lonely_nodes:
            futs.append(executor.submit(fill_lonely_worker,
                node.tax_id,
                node.at_rank(args.parent_rank).tax_id, full_taxonomy, args.search_fasta,
                n_reps=args.number_of_reps))
        while futs:
            try:
                done, pending = futures.wait(futs, 1, futures.FIRST_COMPLETED)
                futs = set(pending)
                for f in done:
                    # Cache the exception rather than calling f.exception() twice.
                    exc = f.exception()
                    if exc:
                        raise exc
                    additional_reps |= f.result()
                sys.stderr.write("{0:6d}/{1:6d} complete        \r".format(len(lonely_nodes) - len(pending),
                        len(lonely_nodes)))
            except futures.TimeoutError:
                # NOTE(review): futures.wait() returns (done, pending) on timeout
                # rather than raising, so this handler appears unreachable — kept
                # for safety; confirm before removing.
                pass  # Keep waiting
            except BaseException:
                # Explicit form of a bare except: also catch KeyboardInterrupt /
                # SystemExit so outstanding work is shut down before re-raising.
                logging.exception("Caught error in child thread - exiting")
                executor.shutdown(False)
                raise

    logging.info("%d additional references", len(additional_reps))
    with open(args.chosen_fasta) as fp, args.output as ofp:
        shutil.copyfileobj(fp, ofp)
        wrap.esl_sfetch(args.search_fasta, additional_reps, ofp)

    # Copy the chosen seq_info, then append rows for the recruited sequences.
    with args.chosen_seqinfo as fp, args.output_seqinfo as ofp, \
            args.search_seqinfo as sub_fp:
        fp.seek(0)
        r = csv.DictReader(fp)
        w = csv.DictWriter(ofp, r.fieldnames, quoting=csv.QUOTE_NONNUMERIC, lineterminator='\n')
        w.writeheader()
        w.writerows(r)

        args.search_seqinfo.seek(0)
        for row in csv.DictReader(sub_fp):
            if row['seqname'] in additional_reps:
                w.writerow(row)
Example #2
0
def action(args):
    """Add extra nodes from a CSV to an existing taxtable.

    Reads ``args.taxtable``, then for each row of ``args.extra_nodes_csv``
    (columns: tax_id, tax_name, rank, parent_id) attaches a new node under
    its parent, adding the rank to the taxonomy if needed.  Rows whose
    tax_id is already present are skipped with a warning.  Writes the
    result to ``args.out_file``.

    Raises:
        IOError: if the CSV is missing any required column.
        ValueError: if a row's parent_id is not in the taxtable.
    """
    with args.taxtable as fp:
        tax = taxtable.read(fp)

    with args.extra_nodes_csv:
        reader = csv.DictReader(args.extra_nodes_csv)
        missing_fields = frozenset(['tax_id', 'tax_name', 'rank', 'parent_id'
                                    ]) - frozenset(reader.fieldnames)
        if missing_fields:
            raise IOError("Missing expected fields: {0}".format(
                ','.join(missing_fields)))
        for row in reader:
            if row['tax_id'] in tax.index:
                # logging.warn is a deprecated alias (removed in Python 3.13)
                logging.warning(
                    "tax_id %s already represented in taxtable. [row %d]",
                    row['tax_id'], reader.line_num)
                continue

            parent_id = row['parent_id']
            rank = row['rank']
            try:
                parent_node = tax.get_node(parent_id)
            except ValueError:
                raise ValueError(
                    "Parent {parent_id} of {tax_id}[{tax_name}] not found.".
                    format(**row))
            if rank not in tax.ranks:
                add_rank(tax, parent_node, rank)
            node = taxtable.TaxNode(tax_id=row['tax_id'],
                                    name=row['tax_name'],
                                    rank=rank)
            parent_node.add_child(node)
            logging.info("Added %s %s[%s] below %s %s[%s]", node.rank,
                         node.tax_id, node.name, parent_node.rank,
                         parent_node.tax_id, parent_node.name)

    tax.write_taxtable(args.out_file)

    return 0
Example #3
0
def action(args):
    """Add extra nodes from a CSV to an existing taxtable.

    Reads ``args.taxtable``, then for each row of ``args.extra_nodes_csv``
    (columns: tax_id, tax_name, rank, parent_id) attaches a new node under
    its parent, adding the rank to the taxonomy if needed.  Rows whose
    tax_id is already present are skipped with a warning.  Writes the
    result to ``args.out_file``.

    Raises:
        IOError: if the CSV is missing any required column.
        ValueError: if a row's parent_id is not in the taxtable.
    """
    with args.taxtable as fp:
        tax = taxtable.read(fp)

    with args.extra_nodes_csv:
        reader = csv.DictReader(args.extra_nodes_csv)
        missing_fields = frozenset(
            ['tax_id', 'tax_name', 'rank', 'parent_id']) - frozenset(reader.fieldnames)
        if missing_fields:
            raise IOError("Missing expected fields: {0}".format(
                ','.join(missing_fields)))
        for row in reader:
            if row['tax_id'] in tax.index:
                # logging.warn is a deprecated alias (removed in Python 3.13)
                logging.warning("tax_id %s already represented in taxtable. [row %d]",
                                row['tax_id'], reader.line_num)
                continue

            parent_id = row['parent_id']
            rank = row['rank']
            try:
                parent_node = tax.get_node(parent_id)
            except ValueError:
                raise ValueError(
                    "Parent {parent_id} of {tax_id}[{tax_name}] not found.".format(**row))
            if rank not in tax.ranks:
                add_rank(tax, parent_node, rank)
            node = taxtable.TaxNode(
                tax_id=row['tax_id'], name=row['tax_name'], rank=rank)
            parent_node.add_child(node)
            logging.info(
                "Added %s %s[%s] below %s %s[%s]",
                node.rank, node.tax_id, node.name,
                parent_node.rank, parent_node.tax_id, parent_node.name)

    tax.write_taxtable(args.out_file)

    return 0
Example #4
0
def add_clusters_to_refpkg(refpkg, **kwargs):
    """Annotate a reference package with cluster tax ids.

    Loads the package's taxonomy and seq_info, applies
    ``add_cluster_taxids`` to them, then writes both back into the
    package inside a single refpkg transaction.  ``kwargs`` are passed
    through to ``add_cluster_taxids``.
    """
    # Pull the current taxonomy and sequence metadata out of the package.
    with refpkg.open_resource('taxonomy') as fp:
        taxonomy = taxtable.read(fp)
    with refpkg.open_resource('seq_info') as fp:
        seq_reader = csv.DictReader(fp)
        rows = list(seq_reader)

    # Annotate in place
    add_cluster_taxids(taxonomy, rows, **kwargs)

    # Write both resources to temp files, then swap them in atomically
    # via a refpkg transaction.
    with util.ntf(prefix='seq_info-', suffix='.csv') as si_tmp, \
         util.ntf(prefix='taxonomy-', suffix='.csv') as tax_tmp:
        writer = csv.DictWriter(si_tmp, seq_reader.fieldnames)
        writer.writeheader()
        writer.writerows(rows)
        si_tmp.close()

        taxonomy.write_taxtable(tax_tmp)
        tax_tmp.close()

        refpkg.start_transaction()
        refpkg.update_file('seq_info', si_tmp.name)
        refpkg.update_file('taxonomy', tax_tmp.name)
        refpkg.commit_transaction()
Example #5
0
def action(args):
    """Transfer taxonomic names onto a reference package's sequences.

    Searches the refpkg's (ungapped) aligned sequences against
    ``args.fasta_file`` with uclust; for each hit, the best hit's tax_id
    from ``args.seq_info`` is applied to the reference sequence when it is
    unnamed, or when ``args.conflict_action == 'replace'``.  The refpkg's
    taxonomy is extended with any newly referenced tax_ids, and the
    updated taxonomy/seq_info are committed back to the package.
    Optionally logs each naming decision as CSV to ``args.log``.
    """
    log_writer = None
    if args.log:
        log_writer = csv.DictWriter(args.log, ['seqname', 'orig_tax_id',
            'renamed_tax_id', 'renamed_tax_name', 'best_hit', 'pct_id',
            'applied'], quoting=csv.QUOTE_NONNUMERIC, lineterminator='\n')
        log_writer.writeheader()

    # Load all tax_ids
    with args.taxtable as fp:
        new_tax = taxtable.read(fp)

    with args.seq_info as fp:
        new_seq_info = {row['seqname']: row for row in csv.DictReader(fp)}

    with args.refpkg.open_resource('aln_fasta') as fp:
        ref_sequences = [ungap(i) for i in SeqIO.parse(fp, 'fasta')]
    with args.refpkg.open_resource('seq_info') as fp:
        ref_seq_info_reader = csv.DictReader(fp)
        ref_seq_info = {row['seqname']: row for row in ref_seq_info_reader}
    with args.refpkg.open_resource('taxonomy') as fp:
        ref_taxonomy = taxtable.read(fp)

    search = functools.partial(uclust.search,
            pct_id=args.percent_id,
            search_pct_id=0.9, quiet=True)

    # Search the sequences from the reference package against the input sequences
    with util.as_fasta(ref_sequences) as ref_fasta_path, util.ntf(prefix='uclust') as tf:
        search(args.fasta_file, ref_fasta_path, tf.name)
        input_records = uclust.parse_uclust_out(i for i in tf if i.startswith('H'))

        # Also search sequences from the reference package against themselves
        # TODO: decide if we want to use this
        #with util.ntf(prefix='uclust') as self_tf:
            #search(ref_fasta_path, ref_fasta_path, self_tf.name, maxaccepts=10)
            #ref_records = uclust.parse_uclust_out(i for i in self_tf if i.startswith('H'))
            ## Drop self-hits
            #ref_records = (i for i in ref_records if i.query_label != i.target_label)
            #grouped = itertools.groupby(ref_records, operator.attrgetter('query_label'))
            #best_hit_id = dict((g, max(i.pct_id for i in v)) for g, v in grouped)

        for record in input_records:

            ref_si = ref_seq_info[record.query_label]
            target_si = new_seq_info[record.target_label]
            #if record.pct_id > best_hit_id.get(record.query_label, 0.0):
            tax_id = target_si['tax_id']
            node = new_tax.get_node(tax_id)

            if log_writer:
                log_record = {'seqname': record.query_label,
                              'best_hit': record.target_label,
                              'pct_id': record.pct_id,
                              'orig_tax_id': ref_si['tax_id'],
                              'renamed_tax_id': node.tax_id,
                              'renamed_tax_name': node.name,
                              'applied': not ref_si['tax_id'] or args.conflict_action == 'replace'}
                log_writer.writerow(log_record)

            logging.info('Naming %s %s[%s,%s] based on %s (%.2f%%)', ref_si['seqname'],
                         node.name, node.tax_id, node.rank, record.target_label, record.pct_id)
            if ref_si['tax_id'] and ref_si['tax_id'] != tax_id:
                old_node = ref_taxonomy.get_node(ref_si['tax_id'])
                # logging.warn is a deprecated alias (removed in Python 3.13)
                logging.warning('Already named: %s[%s,%s]%s',
                                old_node.name, old_node.tax_id, old_node.rank,
                                ' - replacing' if args.conflict_action == 'replace' else '')
            if not ref_si['tax_id'] or args.conflict_action == 'replace':
                ref_si['tax_id'] = target_si['tax_id']
                if tax_id not in ref_taxonomy.index:
                    add_to_taxonomy(ref_taxonomy, node)

    # Write updated taxtable, seqinfo
    with util.ntf(prefix='taxonomy-', suffix='.csv') as new_tax, \
         util.ntf(prefix='seq_info-', suffix='.csv') as new_seq_info:
        ref_taxonomy.write_taxtable(new_tax)
        new_tax.close()

        w = csv.DictWriter(new_seq_info, ref_seq_info_reader.fieldnames)
        w.writeheader()
        w.writerows(ref_seq_info.values())
        new_seq_info.close()

        args.refpkg.start_transaction()
        args.refpkg.update_file('taxonomy', new_tax.name)
        args.refpkg.update_file('seq_info', new_seq_info.name)
        args.refpkg.commit_transaction()
Example #6
0
def action(args):
    """Recruit additional reference sequences for 'lonely' taxa.

    Like the basic fill-lonely action, but additionally honors
    ``args.exclude_taxids`` (subtrees removed from the search taxonomy
    before recruiting) and ``args.include_taxids`` (all sequences under
    these nodes are force-included).  Results are written to
    ``args.output`` / ``args.output_seqinfo``.
    """
    logging.info("Loading taxtable")
    with args.search_taxtable as fp:
        full_taxonomy = taxtable.read(fp)

    logging.info("Loading chosen sequence metadata")
    chosen_taxonomy = copy.deepcopy(full_taxonomy)
    chosen_taxonomy.populate_from_seqinfo(args.chosen_seqinfo)
    chosen_taxonomy.prune_unrepresented()

    logging.info("loading full sequence metadata")
    full_taxonomy.populate_from_seqinfo(args.search_seqinfo)

    # Drop excluded subtrees before any recruiting happens
    if args.exclude_taxids:
        for e in args.exclude_taxids:
            e = e.strip()
            logging.info('ignoring tax_id {}'.format(e))
            full_taxonomy.get_node(e).remove_subtree()

    # Find lonely nodes at the requested rank
    nodes = [i for i in chosen_taxonomy if i.rank == args.lonely_rank]
    lonely_nodes = [i for i in nodes if is_lonely(i)]
    additional_reps = set()
    futs = []
    with futures.ThreadPoolExecutor(args.threads) as executor:
        for node in lonely_nodes:
            futs.append(
                executor.submit(fill_lonely_worker,
                                node.tax_id,
                                node.at_rank(args.parent_rank).tax_id,
                                full_taxonomy,
                                args.search_fasta,
                                n_reps=args.number_of_reps))

        while futs:
            try:
                done, pending = futures.wait(futs, 1, futures.FIRST_COMPLETED)
                futs = set(pending)
                for f in done:
                    # Cache the exception rather than calling f.exception() twice.
                    exc = f.exception()
                    if exc:
                        raise exc
                    additional_reps |= f.result()
                sys.stderr.write("{0:6d}/{1:6d} complete        \r".format(
                    len(lonely_nodes) - len(pending), len(lonely_nodes)))
            except futures.TimeoutError:
                # NOTE(review): futures.wait() returns (done, pending) on timeout
                # rather than raising, so this handler appears unreachable — kept
                # for safety; confirm before removing.
                pass  # Keep waiting
            except BaseException:
                # Explicit form of a bare except: also catch KeyboardInterrupt /
                # SystemExit so outstanding work is shut down before re-raising.
                logging.exception("Caught error in child thread - exiting")
                executor.shutdown(False)
                raise

    # Force-include every sequence under the requested tax_ids
    if args.include_taxids:
        for t in args.include_taxids:
            t = t.strip()
            logging.info('including tax_id {}'.format(t))
            for s in set(full_taxonomy.get_node(t).subtree_sequence_ids()):
                logging.info('sequence {}'.format(s))
                additional_reps.add(s)

    logging.info("%d additional references", len(additional_reps))
    with open(args.chosen_fasta) as fp, args.output as ofp:
        shutil.copyfileobj(fp, ofp)
        wrap.esl_sfetch(args.search_fasta, additional_reps, ofp)

    # Copy the chosen seq_info, then append rows for the recruited sequences.
    with args.chosen_seqinfo as fp, args.output_seqinfo as ofp, \
            args.search_seqinfo as sub_fp:
        fp.seek(0)
        r = csv.DictReader(fp)
        w = csv.DictWriter(ofp,
                           r.fieldnames,
                           quoting=csv.QUOTE_NONNUMERIC,
                           lineterminator='\n')
        w.writeheader()
        w.writerows(r)

        args.search_seqinfo.seek(0)
        for row in csv.DictReader(sub_fp):
            if row['seqname'] in additional_reps:
                w.writerow(row)
Example #7
0
def action(args):
    """Transfer taxonomic names onto a reference package's sequences.

    Searches the refpkg's (ungapped) aligned sequences against
    ``args.fasta_file`` with uclust; for each hit, the best hit's tax_id
    from ``args.seq_info`` is applied to the reference sequence when it is
    unnamed, or when ``args.conflict_action == 'replace'``.  The refpkg's
    taxonomy is extended with any newly referenced tax_ids, and the
    updated taxonomy/seq_info are committed back to the package.
    Optionally logs each naming decision as CSV to ``args.log``.
    """
    log_writer = None
    if args.log:
        log_writer = csv.DictWriter(args.log, [
            'seqname', 'orig_tax_id', 'renamed_tax_id', 'renamed_tax_name',
            'best_hit', 'pct_id', 'applied'
        ],
                                    quoting=csv.QUOTE_NONNUMERIC,
                                    lineterminator='\n')
        log_writer.writeheader()

    # Load all tax_ids
    with args.taxtable as fp:
        new_tax = taxtable.read(fp)

    with args.seq_info as fp:
        new_seq_info = {row['seqname']: row for row in csv.DictReader(fp)}

    with args.refpkg.open_resource('aln_fasta') as fp:
        ref_sequences = [ungap(i) for i in SeqIO.parse(fp, 'fasta')]
    with args.refpkg.open_resource('seq_info') as fp:
        ref_seq_info_reader = csv.DictReader(fp)
        ref_seq_info = {row['seqname']: row for row in ref_seq_info_reader}
    with args.refpkg.open_resource('taxonomy') as fp:
        ref_taxonomy = taxtable.read(fp)

    search = functools.partial(uclust.search,
                               pct_id=args.percent_id,
                               search_pct_id=0.9,
                               quiet=True)

    # Search the sequences from the reference package against the input sequences
    with util.as_fasta(ref_sequences) as ref_fasta_path, util.ntf(
            prefix='uclust') as tf:
        search(args.fasta_file, ref_fasta_path, tf.name)
        input_records = uclust.parse_uclust_out(i for i in tf
                                                if i.startswith('H'))

        # Also search sequences from the reference package against themselves
        # TODO: decide if we want to use this
        #with util.ntf(prefix='uclust') as self_tf:
        #search(ref_fasta_path, ref_fasta_path, self_tf.name, maxaccepts=10)
        #ref_records = uclust.parse_uclust_out(i for i in self_tf if i.startswith('H'))
        ## Drop self-hits
        #ref_records = (i for i in ref_records if i.query_label != i.target_label)
        #grouped = itertools.groupby(ref_records, operator.attrgetter('query_label'))
        #best_hit_id = dict((g, max(i.pct_id for i in v)) for g, v in grouped)

        for record in input_records:

            ref_si = ref_seq_info[record.query_label]
            target_si = new_seq_info[record.target_label]
            #if record.pct_id > best_hit_id.get(record.query_label, 0.0):
            tax_id = target_si['tax_id']
            node = new_tax.get_node(tax_id)

            if log_writer:
                log_record = {
                    'seqname': record.query_label,
                    'best_hit': record.target_label,
                    'pct_id': record.pct_id,
                    'orig_tax_id': ref_si['tax_id'],
                    'renamed_tax_id': node.tax_id,
                    'renamed_tax_name': node.name,
                    'applied':
                        not ref_si['tax_id'] or args.conflict_action == 'replace',
                }
                log_writer.writerow(log_record)

            logging.info('Naming %s %s[%s,%s] based on %s (%.2f%%)',
                         ref_si['seqname'], node.name, node.tax_id, node.rank,
                         record.target_label, record.pct_id)
            if ref_si['tax_id'] and ref_si['tax_id'] != tax_id:
                old_node = ref_taxonomy.get_node(ref_si['tax_id'])
                # logging.warn is a deprecated alias (removed in Python 3.13)
                logging.warning(
                    'Already named: %s[%s,%s]%s', old_node.name,
                    old_node.tax_id, old_node.rank, ' - replacing'
                    if args.conflict_action == 'replace' else '')
            if not ref_si['tax_id'] or args.conflict_action == 'replace':
                ref_si['tax_id'] = target_si['tax_id']
                if tax_id not in ref_taxonomy.index:
                    add_to_taxonomy(ref_taxonomy, node)

    # Write updated taxtable, seqinfo
    with util.ntf(prefix='taxonomy-', suffix='.csv') as new_tax, \
         util.ntf(prefix='seq_info-', suffix='.csv') as new_seq_info:
        ref_taxonomy.write_taxtable(new_tax)
        new_tax.close()

        w = csv.DictWriter(new_seq_info, ref_seq_info_reader.fieldnames)
        w.writeheader()
        w.writerows(ref_seq_info.values())
        new_seq_info.close()

        args.refpkg.start_transaction()
        args.refpkg.update_file('taxonomy', new_tax.name)
        args.refpkg.update_file('seq_info', new_seq_info.name)
        args.refpkg.commit_transaction()