Beispiel #1
0
def build_index_refpkg(hrefpkg_paths, sequence_file, seqinfo, taxonomy,
        dest='index.refpkg', **meta):
    """
    Build an index.refpkg from a set of hrefpkgs

    The taxonomy is deep-copied before use, so the caller's object is not
    modified. Every node's ``sequence_ids`` is reset and then re-populated
    from the ``seq_info`` resource of each hrefpkg; lineages left without
    sequences are pruned.

    :param hrefpkg_paths: iterable of paths to existing reference packages
    :param sequence_file: sequence source handed to ``wrap.esl_sfetch``
    :param seqinfo: indexable, non-empty sequence of dict rows containing at
        least a ``seqname`` key; the first row supplies the CSV header
    :param taxonomy: taxonomy object to copy and subset
    :param dest: path of the reference package to create
    :param meta: extra key/value pairs stored via ``update_metadata``
    :returns: tuple of (the new ``Refpkg``, frozenset of sequence ids kept)
    """

    # Work on a private copy and start without any sequence assignments
    taxonomy = copy.deepcopy(taxonomy)
    for node in taxonomy:
        node.sequence_ids = set()

    # Add seqinfo from every hrefpkg, opening one package at a time
    for path in hrefpkg_paths:
        f = Refpkg(path, create=False).open_resource('seq_info')
        with f:
            taxonomy.populate_from_seqinfo(f)

    # Remove lineages without sequences
    taxonomy.prune_unrepresented()

    sequence_ids = frozenset(taxonomy.subtree_sequence_ids())

    with util.ntf(prefix='aln_fasta', suffix='.fasta') as tf, \
         util.ntf(prefix='seq_info', suffix='.csv') as seq_info_fp, \
         util.ntf(prefix='taxonomy', suffix='.csv') as tax_fp:
        # Each handle is closed explicitly before its path is handed to
        # update_file below, so the content is flushed to disk.
        wrap.esl_sfetch(sequence_file, sequence_ids, tf)
        tf.close()

        # Seqinfo file: only rows for sequences present in the taxonomy
        kept_rows = (row for row in seqinfo if row['seqname'] in sequence_ids)
        writer = csv.DictWriter(seq_info_fp, seqinfo[0].keys(),
                lineterminator='\n', quoting=csv.QUOTE_NONNUMERIC)
        writer.writeheader()
        writer.writerows(kept_rows)
        seq_info_fp.close()

        taxonomy.write_taxtable(tax_fp)
        tax_fp.close()

        rp = Refpkg(dest, create=True)
        rp.start_transaction()
        rp.update_file('aln_fasta', tf.name)
        rp.update_file('seq_info', seq_info_fp.name)
        rp.update_file('taxonomy', tax_fp.name)
        rp.update_file('profile', wrap.CM)

        for k, v in meta.items():
            rp.update_metadata(k, v)

        rp.commit_transaction()

    return rp, sequence_ids
Beispiel #2
0
def tax_id_refpkg(tax_id, full_tax, seqinfo, sequence_file,
        output_dir='.',
        index_rank='order', train_file=None, test_file=None):
    """
    Build a reference package containing all descendants of tax_id from an
    index reference package.

    :param tax_id: identifier of the root node; also used (as a string) to
        name the output directory ``<tax_id>.refpkg``
    :param full_tax: taxonomy providing ``get_node``
    :param seqinfo: indexable, non-empty sequence of dict rows with
        ``tax_id`` and ``seqname`` keys; the first row supplies the CSV header
    :param sequence_file: sequence source handed to ``wrap.esl_sfetch``
    :param output_dir: directory in which the reference package is created
    :param index_rank: forwarded to ``choose_sequence_ids``
    :param train_file: if given, receives the training sequences
    :param test_file: if given, receives the test sequences
    :returns: path of the new reference package, or ``None`` when fewer than
        two sequences, or only one unique sequence string, were fetched
    """
    with util.ntf(prefix='taxonomy', suffix='.csv') as tax_fp, \
         util.ntf(prefix='aln_sto', suffix='.sto') as sto_fp, \
         util.ntf(prefix='aln_fasta', suffix='.fasta') as fasta_fp, \
         util.ntf(prefix='tree', suffix='.tre') as tree_fp, \
         util.ntf(prefix='tree', suffix='.stats') as stats_fp, \
         util.ntf(prefix='seq_info', suffix='.csv') as seq_info_fp:

        # Subset taxonomy
        n = full_tax.get_node(tax_id)
        descendants = set(i.tax_id for i in n)
        assert descendants
        n.write_taxtable(tax_fp)
        tax_fp.close()

        # Subset seq_info
        w = csv.DictWriter(seq_info_fp, seqinfo[0].keys(),
                quoting=csv.QUOTE_NONNUMERIC)
        w.writeheader()
        rows = [i for i in seqinfo if i['tax_id'] in descendants]
        sinfo = {i['seqname']: i for i in rows}

        # Choose sequences, divide into train and test sets
        chosen = choose_sequence_ids(n, rows, index_rank=index_rank)
        keep_seq_ids = set()
        train_seq_ids = set()
        test_seq_ids = set()

        for keep, rest in chosen:
            keep_seq_ids.update(keep)
            n_rest = len(rest)
            # Only groups with enough left-over sequences contribute to the
            # train/test sets.
            if n_rest >= 2 * PER_TAXON:
                # Floor division keeps the slice index an integer on both
                # Python 2 and Python 3; for odd counts the extra sequence
                # lands in the test half.
                half = n_rest // 2
                train_seq_ids.update(rest[:half])
                test_seq_ids.update(rest[half:])

        # Picked
        rows = [sinfo[i] for i in keep_seq_ids]
        w.writerows(rows)
        seq_info_fp.close()

        # Fetch sequences
        with tempfile.NamedTemporaryFile() as tf:
            wrap.esl_sfetch(sequence_file,
                            keep_seq_ids, tf)
            # Rewind
            tf.seek(0)
            sequences = list(SeqIO.parse(tf, 'fasta'))
        logging.info("Tax id %s: %d sequences", tax_id, len(sequences))

        if len(set(str(i.seq) for i in sequences)) == 1:
            logging.warning("Skipping %s: only 1 unique sequence string", tax_id)
            return None

        # No sense in building with one sequence
        if len(sequences) < 2:
            logging.warning("Skipping: %d sequences.", len(sequences))
            return None

        # Extract training & test seqs
        if train_file:
            logging.info("%d training sequences", len(train_seq_ids))
            wrap.esl_sfetch(sequence_file, train_seq_ids, train_file)
        if test_file:
            logging.info("%d test sequences", len(test_seq_ids))
            wrap.esl_sfetch(sequence_file, test_seq_ids, test_file)

        # Cmalign
        aligned = wrap.cmalign(sequences, output=sto_fp)
        aligned = list(aligned)
        assert aligned
        # Tree
        wrap.fasttree(aligned, log_path=stats_fp.name, output_fp=tree_fp, threads=1, gtr=True)
        # Handles are closed before their paths are handed to update_file
        tree_fp.close()
        sto_fp.close()
        SeqIO.write(aligned, fasta_fp, 'fasta')
        fasta_fp.close()

        rp = Refpkg(os.path.join(output_dir, tax_id + '.refpkg'), create=True)
        rp.start_transaction()
        rp.update_file('aln_sto', sto_fp.name)
        rp.update_file('aln_fasta', fasta_fp.name)
        rp.update_file('tree', tree_fp.name)
        rp.update_file('seq_info', seq_info_fp.name)
        rp.update_file('taxonomy', tax_fp.name)
        try:
            rp.update_phylo_model('FastTree', stats_fp.name)
        except:
            # Dump the FastTree log to help diagnose the failure, then
            # re-raise the original error unchanged.
            stats_log = stats_fp.read()
            if not isinstance(stats_log, str):
                # Binary-mode handle on Python 3 returns bytes
                stats_log = stats_log.decode('utf-8', 'replace')
            sys.stderr.write(stats_log)
            sys.stderr.write('\n')
            raise
        rp.update_file('profile', wrap.CM)
        rp.commit_transaction()

        util.require_executable('rppr')
        rp.reroot()

        return rp.path
Beispiel #3
0
def tax_id_refpkg(tax_id,
                  full_tax,
                  seqinfo,
                  sequence_file,
                  output_dir='.',
                  index_rank='order',
                  train_file=None,
                  test_file=None):
    """
    Build a reference package containing all descendants of tax_id from an
    index reference package.

    Sequences picked by ``choose_sequence_ids`` are fetched, aligned with
    ``wrap.cmalign`` and used to build a tree with ``wrap.fasttree``; the
    results are stored in ``<output_dir>/<tax_id>.refpkg``.

    :param tax_id: identifier of the root node (a string, since it is
        concatenated into the package directory name)
    :param full_tax: taxonomy providing ``get_node``
    :param seqinfo: indexable, non-empty sequence of dict rows with
        ``tax_id`` and ``seqname`` keys
    :param sequence_file: sequence source handed to ``wrap.esl_sfetch``
    :param output_dir: directory in which the reference package is created
    :param index_rank: forwarded to ``choose_sequence_ids``
    :param train_file: optional destination for the training sequences
    :param test_file: optional destination for the test sequences
    :returns: ``rp.path`` of the new package, or ``None`` if the fetched
        sequences number fewer than two or share a single sequence string
    """
    with util.ntf(prefix='taxonomy', suffix='.csv') as tax_fp, \
         util.ntf(prefix='aln_sto', suffix='.sto') as sto_fp, \
         util.ntf(prefix='aln_fasta', suffix='.fasta') as fasta_fp, \
         util.ntf(prefix='tree', suffix='.tre') as tree_fp, \
         util.ntf(prefix='tree', suffix='.stats') as stats_fp, \
         util.ntf(prefix='seq_info', suffix='.csv') as seq_info_fp:

        # Subset taxonomy
        n = full_tax.get_node(tax_id)
        descendants = set(i.tax_id for i in n)
        assert descendants
        n.write_taxtable(tax_fp)
        tax_fp.close()

        # Subset seq_info
        w = csv.DictWriter(seq_info_fp,
                           seqinfo[0].keys(),
                           quoting=csv.QUOTE_NONNUMERIC)
        w.writeheader()
        rows = [i for i in seqinfo if i['tax_id'] in descendants]
        sinfo = {i['seqname']: i for i in rows}

        # Choose sequences, divide into train and test sets
        chosen = choose_sequence_ids(n, rows, index_rank=index_rank)
        keep_seq_ids = set()
        train_seq_ids = set()
        test_seq_ids = set()

        for keep, rest in chosen:
            keep_seq_ids |= frozenset(keep)
            rest_count = len(rest)
            if rest_count >= 2 * PER_TAXON:
                # Integer (floor) division: a float midpoint is not a valid
                # slice index under true division.
                midpoint = rest_count // 2
                train_seq_ids |= frozenset(rest[:midpoint])
                test_seq_ids |= frozenset(rest[midpoint:])

        # Picked
        rows = [sinfo[i] for i in keep_seq_ids]
        w.writerows(rows)
        seq_info_fp.close()

        # Fetch sequences
        with tempfile.NamedTemporaryFile() as tf:
            wrap.esl_sfetch(sequence_file, keep_seq_ids, tf)
            # Rewind
            tf.seek(0)
            sequences = list(SeqIO.parse(tf, 'fasta'))
        logging.info("Tax id %s: %d sequences", tax_id, len(sequences))

        if len(set(str(i.seq) for i in sequences)) == 1:
            logging.warning("Skipping %s: only 1 unique sequence string",
                            tax_id)
            return None

        # No sense in building with one sequence
        if len(sequences) < 2:
            logging.warning("Skipping: %d sequences.", len(sequences))
            return None

        # Extract training & test seqs
        if train_file:
            logging.info("%d training sequences", len(train_seq_ids))
            wrap.esl_sfetch(sequence_file, train_seq_ids, train_file)
        if test_file:
            logging.info("%d test sequences", len(test_seq_ids))
            wrap.esl_sfetch(sequence_file, test_seq_ids, test_file)

        # Cmalign
        aligned = wrap.cmalign(sequences, output=sto_fp)
        aligned = list(aligned)
        assert aligned
        # Tree
        wrap.fasttree(aligned,
                      log_path=stats_fp.name,
                      output_fp=tree_fp,
                      threads=1,
                      gtr=True)
        # Close the handles so their paths can be handed to update_file
        tree_fp.close()
        sto_fp.close()
        SeqIO.write(aligned, fasta_fp, 'fasta')
        fasta_fp.close()

        rp = Refpkg(os.path.join(output_dir, tax_id + '.refpkg'), create=True)
        rp.start_transaction()
        rp.update_file('aln_sto', sto_fp.name)
        rp.update_file('aln_fasta', fasta_fp.name)
        rp.update_file('tree', tree_fp.name)
        rp.update_file('seq_info', seq_info_fp.name)
        rp.update_file('taxonomy', tax_fp.name)
        try:
            rp.update_phylo_model('FastTree', stats_fp.name)
        except:
            # Show the FastTree log on stderr for diagnosis; the original
            # exception is re-raised untouched.
            stats_text = stats_fp.read()
            if not isinstance(stats_text, str):
                # A binary-mode handle yields bytes on Python 3
                stats_text = stats_text.decode('utf-8', 'replace')
            sys.stderr.write(stats_text)
            sys.stderr.write('\n')
            raise
        rp.update_file('profile', wrap.CM)
        rp.commit_transaction()

        util.require_executable('rppr')
        rp.reroot()

        return rp.path
Beispiel #4
0
def build_index_refpkg(hrefpkg_paths,
                       sequence_file,
                       seqinfo,
                       taxonomy,
                       dest='index.refpkg',
                       **meta):
    """
    Build an index.refpkg from a set of hrefpkgs

    A deep copy of ``taxonomy`` is cleared of sequence assignments,
    re-populated from the ``seq_info`` resource of every hrefpkg and pruned
    of unrepresented lineages. The resulting sequences, sequence info and
    taxonomy table are stored in a new reference package at ``dest``.

    :param hrefpkg_paths: iterable of paths to existing reference packages
    :param sequence_file: sequence source handed to ``wrap.esl_sfetch``
    :param seqinfo: indexable, non-empty sequence of dict rows with a
        ``seqname`` key; the keys of the first row become the CSV header
    :param taxonomy: taxonomy object; copied, never modified in place
    :param dest: path of the reference package to create
    :param meta: additional metadata recorded with ``update_metadata``
    :returns: ``(refpkg, sequence_ids)`` where ``sequence_ids`` is a
        frozenset of the sequence ids retained in the taxonomy
    """

    # Clear taxonomy (on a copy, so the caller's object is left alone)
    taxonomy = copy.deepcopy(taxonomy)
    for node in taxonomy:
        node.sequence_ids = set()

    # Add seqinfo, one hrefpkg at a time
    for hrefpkg_path in hrefpkg_paths:
        hrefpkg = Refpkg(hrefpkg_path, create=False)
        f = hrefpkg.open_resource('seq_info')
        with f:
            taxonomy.populate_from_seqinfo(f)

    # Remove lineages without sequences
    taxonomy.prune_unrepresented()

    sequence_ids = frozenset(taxonomy.subtree_sequence_ids())

    with util.ntf(prefix='aln_fasta', suffix='.fasta') as tf, \
         util.ntf(prefix='seq_info', suffix='.csv') as seq_info_fp, \
         util.ntf(prefix='taxonomy', suffix='.csv') as tax_fp:
        # Handles are closed as soon as they are written so that the paths
        # passed to update_file below refer to complete files.
        wrap.esl_sfetch(sequence_file, sequence_ids, tf)
        tf.close()

        # Seqinfo file, restricted to the retained sequences
        selected = (row for row in seqinfo if row['seqname'] in sequence_ids)
        writer = csv.DictWriter(seq_info_fp,
                                seqinfo[0].keys(),
                                lineterminator='\n',
                                quoting=csv.QUOTE_NONNUMERIC)
        writer.writeheader()
        writer.writerows(selected)
        seq_info_fp.close()

        taxonomy.write_taxtable(tax_fp)
        tax_fp.close()

        rp = Refpkg(dest, create=True)
        rp.start_transaction()
        rp.update_file('aln_fasta', tf.name)
        rp.update_file('seq_info', seq_info_fp.name)
        rp.update_file('taxonomy', tax_fp.name)
        rp.update_file('profile', wrap.CM)

        for k, v in meta.items():
            rp.update_metadata(k, v)

        rp.commit_transaction()

    return rp, sequence_ids