Esempio n. 1
0
import re
import string
import sys
from json import load
from os.path import join

from qp_deblur import get_data

# Load the JSON reverse name map (file located via get_data()) once, when the
# module is imported; relabel_newick() looks escaped node names up in it.
with open(get_data(join('sepp', 'tmpl_gg13.8-99-revnamemap.json'))) as f:
    revnamemap = load(f)


def relabel_newick(newick_string):
    """Replace escaped node names in a Newick string by their mapped names

    Parameters
    ----------
    newick_string : str
        Newick text in which the names to replace are tokens that start
        with the prefix 'UQrYOlnDN'.

    Returns
    -------
    str
        The Newick text in which every matched token that is a key of the
        module level `revnamemap` is replaced by its mapped name. Tokens
        without an entry in `revnamemap` are left untouched.
        A mapped name containing punctuation or whitespace is wrapped in
        single quotes; single quotes inside such a name are doubled, which
        is how a quote is escaped within a quoted Newick label.
    """
    pattern = re.compile(r"(UQrYOlnDN[^(,:)<>]+)")
    invalid_chars = set(string.punctuation).union(string.whitespace)

    def replace_func(m):
        token = m.group(1)
        if token not in revnamemap:
            # unknown token: keep the text exactly as it was matched
            return token

        repl = revnamemap[token]
        if any(char in invalid_chars for char in repl):
            # Without doubling, an embedded quote would terminate the
            # quoted label early and corrupt the tree string.
            repl = "'%s'" % repl.replace("'", "''")
        return repl

    return pattern.sub(replace_func, newick_string)
Esempio n. 2
0
def generate_insertion_trees(placements, out_dir,
                             reference_template=None,
                             reference_rename=None):
    """Generates phylogenetic trees by inserting placements into a reference

    Parameters
    ----------
    placements : dict of strings
        keys are the seqs, values are the new placements as JSON strings
    out_dir : str
        The job output directory
    reference_template : str, optional
        Filepath to the reference placement json file.
        This file can be produced via _generate_template_rename() and should be
        stored in the plugin package, because it can be reused.
        If None, it falls back to the Greengenes 13.8 99% reference.
    reference_rename : str, optional
        Similar to reference_template, but a filepath to the generated python
        renaming script to undo the name escaping post guppy.
        If None, it falls back to the Greengenes 13.8 99% reference.

    Returns
    -------
    str
        The filepath of the phylogenetic insertion tree in Newick format.

    Raises
    ------
    ValueError
        If a) the given reference_template or reference_rename files do not
        exist
        b) or the guppy binary exits with non-zero return code
        c) or the given rename script exits with non-zero return code.
    """
    # Local import: only needed to quote paths for the shell commands below.
    from shlex import quote

    # test if reference file for rename script actually exists.
    if reference_rename is not None:
        file_ref_rename = reference_rename
    else:
        file_ref_rename = qp_deblur.get_data(
            join('sepp', 'tmpl_gg13.8-99_rename-json.py'))
    if not exists(file_ref_rename):
        raise ValueError("Reference rename script '%s' does not exits!" %
                         file_ref_rename)

    # create a valid placement.json file as input for guppy
    if reference_template is not None:
        file_ref_template = reference_template
    else:
        file_ref_template = qp_deblur.get_data(
            join('sepp', 'tmpl_gg13.8-99_placement.json'))
    if not exists(file_ref_template):
        raise ValueError("Reference template '%s' does not exits!" %
                         file_ref_template)
    with open(file_ref_template, 'r') as f:
        plcmnts = json.load(f)

    # append one placement entry per sequence to the ones of the template
    plcmnts['placements'].extend(
        {'p': placement, 'nm': [[sequence, 1]]}
        for sequence, placement in placements.items())

    file_placements = join(out_dir, 'placements.json')
    with open(file_placements, 'w') as f:
        json.dump(plcmnts, f)

    # execute guppy
    # The commands are shell command lines (the second one relies on a pipe
    # and a redirection), so every path is quoted to survive whitespace or
    # shell metacharacters.
    file_tree_escaped = join(out_dir, 'insertion_tree.tre')
    std_out, std_err, return_value = system_call(
        'guppy tog %s -o %s' % (quote(file_placements),
                                quote(file_tree_escaped)))
    if return_value != 0:
        error_msg = ("Error running guppy:\nStd out: %s\nStd err: %s"
                     % (std_out, std_err))
        raise ValueError(error_msg)

    # execute node name re-labeling (to revert the escaping of names necessary
    # for guppy)
    file_tree = join(out_dir, 'insertion_tree.relabelled.tre')
    std_out, std_err, return_value = system_call(
        'cat %s | python %s > %s' %
        (quote(file_tree_escaped), quote(file_ref_rename), quote(file_tree)))
    if return_value != 0:
        error_msg = (("Error running %s:\n"
                      "Std out: %s\nStd err: %s")
                     % (file_ref_rename, std_out, std_err))
        raise ValueError(error_msg)

    # making sure that all branches in the generated tree have branch lengths
    tree = TreeNode.read(file_tree)
    for node in tree.preorder(include_self=False):
        if node.length is None:
            node.length = 0.0
    tree.write(file_tree)

    return file_tree
Esempio n. 3
0
def deblur(qclient, job_id, parameters, out_dir):
    """Execute the deblur workflow for a job and collect its artifacts

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        Client used to talk to the Qiita server
    job_id : str
        Identifier of the job being processed
    parameters : dict
        The parameter values to run deblur. The 'Demultiplexed sequences'
        entry is removed from this dict.
    out_dir : str
        The job's output directory; everything is written below its
        'deblur_out' sub-directory

    Returns
    -------
    boolean, list, str
        Success flag, the list of ArtifactInfo objects (None on failure) and
        an error message (empty string on success)

    Notes
    -----
    If the artifact provides a preprocessed_demux file it is split into per
    sample files and those are used, otherwise the preprocessed_fastq files
    are used. The preprocessed_demux is preferred as running time will be
    greatly improved
    """
    out_dir = join(out_dir, 'deblur_out')

    # Step 1: gather what is required to launch deblur
    qclient.update_job_step(job_id, "Step 1 of 4: Collecting information")
    # the input artifact must not become part of the final command, thus it
    # is taken out of the parameters right away
    artifact_id = parameters.pop('Demultiplexed sequences')

    # filepaths that belong to the artifact
    fps = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)['files']

    # Step 2: compose the deblur command
    if 'preprocessed_demux' not in fps:
        qclient.update_job_step(
            job_id, "Step 2 of 4: Generating deblur command")
        cmd = generate_deblur_workflow_commands(
            fps['preprocessed_fastq'], out_dir, parameters)
    else:
        qclient.update_job_step(
            job_id, "Step 2 of 4: Generating per sample from demux (1/2)")

        split_out_dir = join(out_dir, 'split')
        for directory in (out_dir, split_out_dir):
            if not exists(directory):
                mkdir(directory)

        # split with as many parallel jobs as the command itself will use
        n_jobs = int(parameters['Jobs to start'])
        # only a single demux file is expected, therefore index 0
        to_per_sample_files(
            fps['preprocessed_demux'][0],
            out_dir=split_out_dir, n_jobs=n_jobs)

        qclient.update_job_step(
            job_id, "Step 2 of 4: Generating per sample from demux (2/2)")
        out_dir = join(out_dir, 'deblured')
        cmd = generate_deblur_workflow_commands(
            [split_out_dir], out_dir, parameters)

    # Step 3: run deblur
    qclient.update_job_step(job_id, "Step 3 of 4: Executing deblur job")
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        return False, None, (
            "Error running deblur:\nStd out: %s\nStd err: %s"
            % (std_out, std_err))

    # filepaths of the files that make up the artifacts
    final_biom = join(out_dir, 'all.biom')
    final_seqs = join(out_dir, 'all.seqs.fa')
    final_biom_hit = join(out_dir, 'reference-hit.biom')
    final_seqs_hit = join(out_dir, 'reference-hit.seqs.fa')

    if not exists(final_biom_hit):
        # Qiita needs a valid BIOM file, so an empty table is written when
        # deblur did not produce one
        empty_table = Table([], [], [])
        with biom_open(final_biom_hit, 'w') as f:
            empty_table.to_hdf5(f, 'qp-deblur generated')

    if not exists(final_seqs_hit):
        # likewise, provide an empty sequence file
        with open(final_seqs_hit, 'w') as f:
            f.write("")

    # Step 4: consult the archive and compute missing placements
    qclient.update_job_step(
        job_id, "Step 4 of 4 (1/4): Retrieving observations information")
    features = list(load_table(final_biom_hit).ids(axis='observation'))

    new_placements = None
    fp_phylogeny = None
    if features:
        observations = qclient.post(
            "/qiita_db/archive/observations/",
            data={'job_id': job_id, 'features': features})
        novel_fragments = list(set(features) - set(observations.keys()))

        qclient.update_job_step(
            job_id, "Step 4 of 4 (2/4): Generating %d new placements"
            % len(novel_fragments))

        # Once alternative reference phylogenies for SEPP are supported,
        # the reference name has to be translated here into filepaths of
        # the matching reference alignment and reference tree. When left
        # as None, the Greengenes 13.8 reference shipped with the
        # fragment-insertion conda package will be used.
        fp_reference_alignment = None
        fp_reference_phylogeny = None
        fp_reference_template = None
        fp_reference_rename = None
        if parameters.get('Reference phylogeny for SEPP') == 'tiny':
            fp_reference_alignment = qp_deblur.get_data(
                join('sepp', 'reference_alignment_tiny.fasta'))
            fp_reference_phylogeny = qp_deblur.get_data(
                join('sepp', 'reference_phylogeny_tiny.nwk'))
            fp_reference_template = qp_deblur.get_data(
                join('sepp', 'tmpl_tiny_placement.json'))
            fp_reference_rename = qp_deblur.get_data(
                join('sepp', 'tmpl_tiny_rename-json.py'))

        try:
            new_placements = generate_sepp_placements(
                novel_fragments, out_dir, parameters['Threads per sample'],
                reference_alignment=fp_reference_alignment,
                reference_phylogeny=fp_reference_phylogeny)
        except ValueError as e:
            return False, None, str(e)

        qclient.update_job_step(
            job_id, "Step 4 of 4 (3/4): Archiving %d new placements"
            % len(novel_fragments))
        # the archive expects the values to be json strings
        for fragment, placement in new_placements.items():
            new_placements[fragment] = json.dumps(placement)

        # Fragments rejected by SEPP are absent from the placement file.
        # The rejection itself is worth archiving (as empty string), since
        # it spares re-computing those fragments in the future.
        for fragment in novel_fragments:
            new_placements.setdefault(fragment, "")
        if new_placements:
            qclient.patch(url="/qiita_db/archive/observations/", op="add",
                          path=job_id, value=json.dumps(new_placements))

        # fetch all fragments again and build the actual tree
        qclient.update_job_step(
            job_id,
            "Step 4 of 4 (4/4): Composing phylogenetic insertion tree")
        placements = qclient.post(
            "/qiita_db/archive/observations/",
            data={'job_id': job_id, 'features': features})
        # drop fragments rejected by SEPP (placement is the empty string)
        # and decode the remaining placements from their json strings
        placements = {fragment: json.loads(placement)
                      for fragment, placement in placements.items()
                      if placement != ''}
        try:
            fp_phylogeny = generate_insertion_trees(
                placements, out_dir,
                reference_template=fp_reference_template,
                reference_rename=fp_reference_rename)
        except ValueError as e:
            return False, None, str(e)

    final_files = [(final_biom, 'biom'),
                   (final_seqs, 'preprocessed_fasta')]
    ainfo = [ArtifactInfo('deblur final table', 'BIOM', final_files)]
    if fp_phylogeny is not None:
        # the reference hit artifact is only reported along with a tree
        hit_files = [(final_biom_hit, 'biom'),
                     (final_seqs_hit, 'preprocessed_fasta'),
                     (fp_phylogeny, 'plain_text')]
        ainfo.append(ArtifactInfo('deblur reference hit table', 'BIOM',
                                  hit_files, new_placements))

    return True, ainfo, ""
Esempio n. 4
0
import re
import string
import sys
from json import load
from os.path import join

from qp_deblur import get_data

# Load the JSON reverse name map (file located via get_data()) once, when the
# module is imported; relabel_newick() looks escaped node names up in it.
with open(get_data(join('sepp', 'tmpl_tiny-revnamemap.json'))) as f:
    revnamemap = load(f)


def relabel_newick(newick_string):
    """Replace escaped node names in a Newick string by their mapped names

    Parameters
    ----------
    newick_string : str
        Newick text in which the names to replace are tokens that start
        with the prefix 'UQrYOlnDN'.

    Returns
    -------
    str
        The Newick text in which every matched token that is a key of the
        module level `revnamemap` is replaced by its mapped name. Tokens
        without an entry in `revnamemap` are left untouched.
        A mapped name containing punctuation or whitespace is wrapped in
        single quotes; single quotes inside such a name are doubled, which
        is how a quote is escaped within a quoted Newick label.
    """
    pattern = re.compile(r"(UQrYOlnDN[^(,:)<>]+)")
    invalid_chars = set(string.punctuation).union(string.whitespace)

    def replace_func(m):
        token = m.group(1)
        if token not in revnamemap:
            # unknown token: keep the text exactly as it was matched
            return token

        repl = revnamemap[token]
        if any(char in invalid_chars for char in repl):
            # Without doubling, an embedded quote would terminate the
            # quoted label early and corrupt the tree string.
            repl = "'%s'" % repl.replace("'", "''")
        return repl

    return pattern.sub(replace_func, newick_string)