Example #1
def _summary_FASTA_preprocessed(artifact_type, filepaths, out_dir):
    """Generates the HTML summary for Demultiplexed artifacts

    Parameters
    ----------
    artifact_type : str
        The artifact type
    filepaths : dict of {str: list of str}
        A dictionary of the artifact files, keyed by filepath type
    out_dir : str
        The output folder

    Returns
    -------
    list of str or str
        The lines of the QUAST report on success, or an error message string
        otherwise
    """
    files = filepaths.get('preprocessed_fasta')
    cmd = f"quast %s -o {out_dir}/quast" % ' '.join(files)
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        artifact_information = (
            "Std out: %s\nStd err: %s\n\nCommand run was:\n%s" %
            (std_out, std_err, cmd))
    else:
        with open(f'{out_dir}/quast/report.html', 'r') as f:
            artifact_information = f.readlines()

    return artifact_information
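# A hypothetical invocation of the summary generator above; the artifact type
# and paths are made up, and QUAST must be available on the PATH.
html = _summary_FASTA_preprocessed(
    'FASTA_preprocessed',
    {'preprocessed_fasta': ['/data/seqs1.fasta', '/data/seqs2.fasta']},
    '/data/summary_out')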
Example #2
def _generate_alpha_vector_summary(files, metadata, out_dir):
    # Magic number [0] -> there is only one plain text file and it is the
    # alpha vector
    alpha_vector_fp = files['plain_text'][0]
    alpha_qza = join(out_dir, 'alpha_vectors.qza')
    alpha_qzv = join(out_dir, 'alpha_vectors.qzv')
    metadata_fp = join(out_dir, 'sample-metadata.tsv')

    # Get the SampleData[AlphaDiversity] qiime2 artifact
    cmd = ('qiime tools import --input-path %s --output-path %s '
           '--type "SampleData[AlphaDiversity]"' %
           (alpha_vector_fp, alpha_qza))
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = "Error converting the alpha vectors file to Q2 artifact"
        raise RuntimeError(error_msg)

    # Generate the metadata file
    metadata = pd.DataFrame.from_dict(metadata, orient='index')
    metadata.to_csv(metadata_fp,
                    index_label='#SampleID',
                    na_rep='',
                    sep='\t',
                    encoding='utf-8')

    # Execute alpha group significance
    cmd = ('qiime diversity alpha-group-significance --i-alpha-diversity %s '
           '--m-metadata-file %s --o-visualization %s' %
           (alpha_qza, metadata_fp, alpha_qzv))
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        raise RuntimeError(
            "Error executing alpha-group-significance for the summary:\n%s" %
            std_err)

    # Extract the Q2 visualization to use it as html_summary
    q2vis = Visualization.load(alpha_qzv)
    html_dir = join(out_dir, 'support_files')
    html_fp = join(out_dir, 'index.html')

    q2vis.export_data(html_dir)
    index_paths = q2vis.get_index_paths()
    index_name = basename(index_paths['html'])
    with open(html_fp, 'w') as f:
        f.write(Q2_INDEX % index_name)

    return html_fp, html_dir
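# Q2_INDEX is assumed above but not defined in this snippet. A minimal sketch
# of what such a %-template could look like; the real constant lives in the
# plugin package and may differ:
Q2_INDEX = """<!DOCTYPE html>
<html>
  <body>
    <iframe src="./support_files/%s" width="100%%" height="850"
            frameborder="0"></iframe>
  </body>
</html>"""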
Example #3
def _run_commands(qclient, job_id, commands, msg):
    for i, cmd in enumerate(commands):
        qclient.update_job_step(job_id, msg % i)
        std_out, std_err, return_value = system_call(cmd)
        if return_value != 0:
            error_msg = ("Error running HUMANn2:\nStd out: %s\nStd err: %s" %
                         (std_out, std_err))
            return False, error_msg

    return True, ""
Example #4
def _run_commands(qclient, job_id, commands, msg, cmd_name):
    for i, cmd in enumerate(commands):
        qclient.update_job_step(job_id, msg % i)
        std_out, std_err, return_value = system_call(cmd)
        if return_value != 0:
            error_msg = ("Error running %s:\nStd out: %s\nStd err: %s"
                         "\n\nCommand run was:\n%s" %
                         (cmd_name, std_out, std_err, cmd))
            return False, error_msg

    return True, ""
Example #5
def _generate_feature_data(files, metadata, out_dir):
    # Magic number [0] -> there is only one plain text file and it is the
    # feature data
    fdt_fp = files['plain_text'][0]

    if 'qza' not in files:
        fdt_qza = join(out_dir, 'taxonomy.qza')
        # Get the FeatureData[Taxonomy] qiime2 artifact
        cmd = ('qiime tools import --input-path %s --output-path %s '
               '--type "FeatureData[Taxonomy]"' % (fdt_fp, fdt_qza))
        std_out, std_err, return_value = system_call(cmd)
        if return_value != 0:
            error_msg = ("Error converting the file to Q2 artifact")
            raise RuntimeError(error_msg)
    else:
        fdt_qza = files['qza'][0]

    fdt_qzv = join(out_dir, 'feature-data.qzv')
    cmd = ('qiime metadata tabulate --m-input-file %s --o-visualization %s' %
           (fdt_qza, fdt_qzv))
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = "Error tabulating Q2 artifact"
        raise RuntimeError(error_msg)

    # Extract the Q2 visualization to use it as html_summary
    q2vis = Visualization.load(fdt_qzv)
    html_dir = join(out_dir, 'support_files')
    html_fp = join(out_dir, 'index.html')

    q2vis.export_data(html_dir)
    index_paths = q2vis.get_index_paths()
    index_name = basename(index_paths['html'])
    with open(html_fp, 'w') as f:
        f.write(Q2_INDEX % index_name)

    return html_fp, html_dir
Example #6
def _gzip_file(filepath, test=False):
    """gzip the given filepath if needed

    Parameters
    ----------
    filepath : string
        The filepath to verify or compress
    test : bool
        If True, do not compress but only change the filename; used for unit
        testing

    Returns
    -------
    str
        The new gz filepath (the original filepath if compression failed)
    str or None
        The error message, None on success
    """
    error = None
    return_fp = filepath
    if test:
        return_fp = '%s.gz' % filepath
    else:
        is_gzip = False
        try:
            with gopen(filepath, 'rb') as f:
                f.read(1)
            is_gzip = True
        except (OSError, IOError):
            pass

        if not is_gzip:
            gz_cmd = 'pigz -p 5 -c {0} > {0}.gz'.format(filepath)

            std_out, std_err, return_value = system_call(gz_cmd)
            if return_value != 0:
                error = ("Std out: %s\nStd err: %s\n\nCommand run was:\n%s" %
                         (std_out, std_err, gz_cmd))
            else:
                # removing non gz file
                remove(filepath)
                return_fp = '%s.gz' % filepath
    return return_fp, error
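# Hypothetical usage relying on test mode, which only rewrites the filename
# and never touches the file system:
new_fp, error = _gzip_file('/tmp/reads.fastq', test=True)
assert new_fp == '/tmp/reads.fastq.gz'
assert error is None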
Example #7
def beta_group_significance(qclient, job_id, parameters, out_dir):
    """generate beta group significance calculations

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values for beta group significance
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job
    """
    out_dir = join(out_dir, 'beta_group_significance')
    if not exists(out_dir):
        mkdir(out_dir)

    qclient.update_job_step(job_id, "Step 1 of 3: Collecting information")
    artifact_id = parameters['Distance matrix']
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    dm_fp = artifact_info['files']['plain_text'][0]
    dm_qza = join(out_dir, 'q2-distance.qza')
    analysis_id = artifact_info['analysis']
    metadata = qclient.get(
        "/qiita_db/analysis/%s/metadata/" % str(analysis_id))
    metadata = pd.DataFrame.from_dict(metadata, orient='index')
    metadata_fp = join(out_dir, 'metadata.txt')
    metadata.to_csv(metadata_fp, sep='\t')
    m_metadata_category = parameters['Metadata category']
    p_method = BETA_GROUP_SIG_METHODS[parameters['Method']]
    p_permutations = parameters['Number of permutations']
    p_pairwise = BETA_GROUP_SIG_TYPE[parameters['Comparison type']]
    o_visualization = join(out_dir, 'beta_group_significance.qzv')

    qclient.update_job_step(
        job_id, "Step 2 of 3: Converting Qiita artifacts to Q2 artifact")
    cmd = ('qiime tools import --input-path %s --output-path %s '
           '--type "DistanceMatrix"' % (dm_fp, dm_qza))
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error converting distance matrix:\nStd out: %s\n"
                     "Std err: %s" % (std_out, std_err))
        return False, None, error_msg

    qclient.update_job_step(
        job_id, "Step 3 of 3: Calculating beta group significance")
    cmd = ('qiime diversity beta-group-significance --i-distance-matrix %s '
           '--m-metadata-file %s --m-metadata-category %s --p-method %s '
           '--p-permutations %s --o-visualization %s --%s' % (
               dm_qza, metadata_fp, m_metadata_category, p_method,
               p_permutations, o_visualization, p_pairwise))
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error in beta group significance\nStd out: %s\n"
                     "Std err: %s" % (std_out, std_err))
        return False, None, error_msg

    ainfo = [ArtifactInfo('Beta group significance visualization',
                          'q2_visualization',
                          [(o_visualization, 'qzv')])]
    return True, ainfo, ""
Example #8
 def test_system_call_error(self):
     obs_out, obs_err, obs_val = system_call("IHopeThisCommandDoesNotExist")
     self.assertEqual(obs_out, "")
     self.assertTrue("not found" in obs_err)
     self.assertEqual(obs_val, 127)
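# system_call itself is not shown in these examples. A minimal sketch that
# satisfies the (stdout, stderr, return value) contract exercised by the
# tests; the real Qiita helper may differ:
import subprocess


def system_call(cmd):
    """Run cmd through the shell; return (std_out, std_err, return_value)."""
    proc = subprocess.Popen(cmd, shell=True, universal_newlines=True,
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    std_out, std_err = proc.communicate()
    return std_out, std_err, proc.returncode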
Example #9
def deblur(qclient, job_id, parameters, out_dir):
    """Run deblur with the given parameters

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to run deblur
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job

    Notes
    -----
    The code will check if the artifact has a preprocessed_demux element, if
    not it will use the preprocessed_fastq. We prefer to work with the
    preprocessed_demux as running time will be greatly improved
    """
    out_dir = join(out_dir, 'deblur_out')
    # Step 1 get the rest of the information needed to run deblur
    qclient.update_job_step(job_id, "Step 1 of 4: Collecting information")
    artifact_id = parameters['Demultiplexed sequences']
    # removing input from parameters so it's not part of the final command
    del parameters['Demultiplexed sequences']

    # Get the artifact filepath information
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    fps = artifact_info['files']

    # Step 2 generating command deblur
    if 'preprocessed_demux' in fps:
        qclient.update_job_step(job_id, "Step 2 of 4: Generating per sample "
                                "from demux (1/2)")

        if not exists(out_dir):
            mkdir(out_dir)
        split_out_dir = join(out_dir, 'split')
        if not exists(split_out_dir):
            mkdir(split_out_dir)

        # using the same number of parallel jobs as defined by the command
        n_jobs = int(parameters['Jobs to start'])
        # [0] because there should be only 1 file
        to_per_sample_files(fps['preprocessed_demux'][0],
                            out_dir=split_out_dir, n_jobs=n_jobs)

        qclient.update_job_step(job_id, "Step 2 of 4: Generating per sample "
                                "from demux (2/2)")
        out_dir = join(out_dir, 'deblured')
        cmd = generate_deblur_workflow_commands([split_out_dir],
                                                out_dir, parameters)
    else:
        qclient.update_job_step(job_id, "Step 2 of 4: Generating deblur "
                                "command")
        cmd = generate_deblur_workflow_commands(fps['preprocessed_fastq'],
                                                out_dir, parameters)

    # Step 3 execute deblur
    qclient.update_job_step(job_id, "Step 3 of 4: Executing deblur job")
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error running deblur:\nStd out: %s\nStd err: %s"
                     % (std_out, std_err))
        return False, None, error_msg

    # Generating artifact
    pb = partial(join, out_dir)

    # Generate the filepaths
    final_biom = pb('all.biom')
    final_seqs = pb('all.seqs.fa')
    final_biom_hit = pb('reference-hit.biom')
    final_seqs_hit = pb('reference-hit.seqs.fa')

    if not exists(final_biom_hit):
        # Create an empty table. We need to send something to Qiita that is
        # a valid BIOM, so we are going to create an empty table
        t = Table([], [], [])
        with biom_open(final_biom_hit, 'w') as f:
            t.to_hdf5(f, 'qp-deblur generated')

    if not exists(final_seqs_hit):
        # Same as before, create an empty sequence file so we can send it
        with open(final_seqs_hit, 'w') as f:
            f.write("")

    # Step 4, communicate with archive to check and generate placements
    qclient.update_job_step(job_id, "Step 4 of 4 (1/4): Retrieving "
                            "observations information")
    features = list(load_table(final_biom_hit).ids(axis='observation'))

    fp_phylogeny = None
    if features:
        observations = qclient.post(
            "/qiita_db/archive/observations/", data={'job_id': job_id,
                                                     'features': features})
        novel_fragments = list(set(features) - set(observations.keys()))

        qclient.update_job_step(job_id, "Step 4 of 4 (2/4): Generating %d new "
                                "placements" % len(novel_fragments))

        # Once we support alternative reference phylogenies for SEPP in the
        # future, we need to translate the reference name here into
        # filepaths pointing to the correct reference alignment and
        # reference tree. If left 'None' the Greengenes 13.8 reference
        # shipped with the fragment-insertion conda package will be used.
        fp_reference_alignment = None
        fp_reference_phylogeny = None
        fp_reference_template = None
        fp_reference_rename = None
        if 'Reference phylogeny for SEPP' in parameters:
            if parameters['Reference phylogeny for SEPP'] == 'tiny':
                fp_reference_alignment = qp_deblur.get_data(join(
                    'sepp', 'reference_alignment_tiny.fasta'))
                fp_reference_phylogeny = qp_deblur.get_data(join(
                    'sepp', 'reference_phylogeny_tiny.nwk'))
                fp_reference_template = qp_deblur.get_data(join(
                    'sepp', 'tmpl_tiny_placement.json'))
                fp_reference_rename = qp_deblur.get_data(join(
                    'sepp', 'tmpl_tiny_rename-json.py'))
        try:
            new_placements = generate_sepp_placements(
                novel_fragments, out_dir, parameters['Threads per sample'],
                reference_alignment=fp_reference_alignment,
                reference_phylogeny=fp_reference_phylogeny)
        except ValueError as e:
            return False, None, str(e)

        qclient.update_job_step(job_id, "Step 4 of 4 (3/4): Archiving %d "
                                "new placements" % len(novel_fragments))
        # values need to be json strings as well
        for fragment in new_placements.keys():
            new_placements[fragment] = json.dumps(new_placements[fragment])

        # fragments that get rejected by a SEPP run don't show up in
        # the placement file; however, being rejected is valuable
        # information and should be stored in the archive as well.
        # Thus, we avoid re-computation for rejected fragments in the
        # future.
        for fragment in novel_fragments:
            if fragment not in new_placements:
                new_placements[fragment] = ""
        if len(new_placements.keys()) > 0:
            qclient.patch(url="/qiita_db/archive/observations/", op="add",
                          path=job_id, value=json.dumps(new_placements))

        # retrieve all fragments and create the actual tree
        qclient.update_job_step(job_id, "Step 4 of 4 (4/4): Composing "
                                "phylogenetic insertion tree")
        placements = qclient.post(
            "/qiita_db/archive/observations/", data={'job_id': job_id,
                                                     'features': features})
        # remove fragments that have been rejected by SEPP, i.e. whose
        # placement is the empty string, and convert all other placements
        # from string to json
        placements = {frag: json.loads(plc)
                      for frag, plc
                      in placements.items()
                      if plc != ''}
        try:
            fp_phylogeny = generate_insertion_trees(
                placements, out_dir,
                reference_template=fp_reference_template,
                reference_rename=fp_reference_rename)
        except ValueError as e:
            return False, None, str(e)
    else:
        new_placements = None

    ainfo = [ArtifactInfo('deblur final table', 'BIOM',
                          [(final_biom, 'biom'),
                           (final_seqs, 'preprocessed_fasta')])]
    if fp_phylogeny is not None:
        ainfo.append(ArtifactInfo('deblur reference hit table', 'BIOM',
                     [(final_biom_hit, 'biom'),
                      (final_seqs_hit, 'preprocessed_fasta'),
                      (fp_phylogeny, 'plain_text')], new_placements))

    return True, ainfo, ""
Example #10
def _generate_template_rename(file_reference_phylogeny,
                              file_reference_alignment,
                              out_dir):
    """Produces placement template and rename script for reference phylogeny.

    Parameters
    ----------
    file_reference_phylogeny : str
        A filepath to an alternative reference phylogeny for SEPP.
    file_reference_alignment : str
        A filepath to an alternative reference alignment for SEPP.
    out_dir : str
        The job output directory

    Returns
    -------
    (str, str)
        Filepaths of the reference_template json file and the
        reference_rename python script.

    Raises
    ------
    ValueError
        If a) the given out_dir directory does not exist,
        b) the given reference phylogeny or alignment does not exist,
        or c) the run-sepp.sh wrapper script fails for any reason.

    Notes
    -----
    This function only needs to be called once per reference phylogeny/
    alignment, i.e. if we update Greengenes or extend SEPP for Silva or other
    reference phylogenies. I am including this function for easier
    maintenance in the future.
    """
    if not exists(out_dir):
        raise ValueError("Output directory '%s' does not exist!" % out_dir)
    if not exists(file_reference_phylogeny):
        raise ValueError("Reference phylogeny file '%s' does not exits!" %
                         file_reference_phylogeny)
    if not exists(file_reference_alignment):
        raise ValueError("Reference alignment file '%s' does not exits!" %
                         file_reference_alignment)

    # create a dummy sequence input file
    file_input = '%s/input.fasta' % out_dir
    with open(file_input, 'w') as f:
        f.write('>dummySeq\n')
        f.write('TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGATGGA'
                'CAAGTCTGATGTGAAAGGCTGGGGCCCAACCCCGGGACTGCATTGGAAACTGCCCGTCTT'
                'GAGTG\n')
    std_out, std_err, return_value = system_call(
        'cd %s; run-sepp.sh %s dummy -x 1 -a %s -t %s' %
        (out_dir, file_input, file_reference_alignment,
         file_reference_phylogeny))
    if return_value != 0:
        error_msg = ("Error running SEPP:\nStd out: %s\nStd err: %s"
                     % (std_out, std_err))
        raise ValueError(error_msg)

    # take resulting placement.json and turn it into the template by
    # clearing the list of placements
    file_template = '%s/tmpl_dummy_placement.json' % out_dir
    with open('%s/dummy_placement.json' % out_dir, 'r') as f:
        placements = json.loads(f.read())
        placements['placements'] = []
        with open(file_template, 'w') as fw:
            json.dump(placements, fw)

    # Another file produced by SEPP is xxx_rename-json.py, where xxx is the
    # name of the run, here "dummy". SEPP needs to escape node names before the
    # reference tree is given to guppy which can only handle a limited name
    # format. Thus, after guppy, the result needs to be back translated to
    # original names with the rename-json.py script that is generated by SEPP.
    return (file_template, '%s/dummy_rename-json.py' % out_dir)
Example #11
 def test_system_call_error(self):
     obs_out, obs_err, obs_val = system_call("IHopeThisCommandDoesNotExist")
     self.assertEqual(obs_out, "")
     self.assertTrue("not found" in obs_err)
     self.assertEqual(obs_val, 127)
Example #12
def beta_diversity(qclient, job_id, parameters, out_dir):
    """generate beta diversity calculations

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values for beta diversity
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job
    """
    out_dir = join(out_dir, 'beta_diversity')
    if not exists(out_dir):
        mkdir(out_dir)

    qclient.update_job_step(job_id, "Step 1 of 4: Collecting information")
    artifact_id = parameters['BIOM table']
    metric = BETA_DIVERSITY_METRICS[parameters['Diversity metric']]
    tree = parameters['Phylogenetic tree']
    if tree == 'None':
        tree = None
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    biom_fpi = artifact_info['files']['biom'][0]
    biom_qza = join(out_dir, 'q2-biom.qza')
    num_jobs = parameters['Number of jobs']

    qclient.update_job_step(
        job_id, "Step 2 of 4: Converting Qiita artifacts to Q2 artifact")
    # converting biom
    cmd = ('qiime tools import --input-path %s --output-path %s '
           '--type "FeatureTable[Frequency]' % (biom_fpi, biom_qza))
    b = load_table(biom_fpi)
    counts = list(map(sum, b.iter_data()))
    if min(counts) == max(counts):
        cmd += " % Properties(['uniform-sampling'])\""
    else:
        cmd += '"'
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error converting biom:\nStd out: %s\nStd err: %s"
                     % (std_out, std_err))
        return False, None, error_msg
    # converting tree
    if tree is not None:
        qza_tree = join(out_dir, 'tree.qza')
        cmd = ('qiime tools import --input-path %s --type Phylogeny[Rooted] '
               '--output-path %s' % (tree, qza_tree))
        tree = qza_tree
        std_out, std_err, return_value = system_call(cmd)
        if return_value != 0:
            error_msg = ("Error converting tree:\nStd out: %s\nStd err: %s"
                         % (std_out, std_err))
            return False, None, error_msg

    qclient.update_job_step(
        job_id, "Step 3 of 4: Calculating beta diversity: %s" % (metric))
    if tree is not None and metric in STATE_UNIFRAC_METRICS:
        su_metric = STATE_UNIFRAC_METRICS[metric]
        dtx_fp = join(out_dir, '%s.qza' % su_metric)
        cmd = ('qiime diversity beta-phylogenetic-alt --p-metric %s '
               '--i-table %s --i-phylogeny %s --o-distance-matrix %s '
               '--p-n-jobs %s'
               % (su_metric, biom_qza, tree, dtx_fp, num_jobs))
        if parameters['Adjust variance (phylogenetic only)']:
            cmd += ' --p-variance-adjusted'
        if parameters['Bypass tips (phylogenetic only)']:
            cmd += ' --p-bypass-tips'
        if su_metric == 'generalized_unifrac':
            cmd += ' --p-alpha %s' % parameters[
                'Alpha value (Generalized Unifrac only)']
    elif metric not in STATE_UNIFRAC_METRICS and tree is None:
        dtx_fp = join(out_dir, '%s.qza' % metric)
        cmd = ('qiime diversity beta --i-table %s --p-metric %s '
               '--o-distance-matrix %s --p-n-jobs %s'
               % (biom_qza, metric, dtx_fp, num_jobs))
    else:
        return False, None, ('Phylogenetic metric %s selected but no tree '
                             'exists' % metric)

    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error in beta div %s:\nStd out: %s\nStd err: %s"
                     % (metric, std_out, std_err))
        return False, None, error_msg

    qclient.update_job_step(
        job_id, "Step 4 of 4: Converting Q2 to Qiita artifacts")
    fdir = join(out_dir, 'dtx')
    ffp = join(fdir, 'distance-matrix.tsv')
    cmd = "qiime tools export --output-dir %s %s" % (fdir, dtx_fp)
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error in Q2 -> Qiita conversion:\nStd out: "
                     "%s\nStd err: %s" % (std_out, std_err))
        return False, None, error_msg

    ainfo = [ArtifactInfo('Distance matrix', 'distance_matrix',
                          [(ffp, 'plain_text')])]
    return True, ainfo, ""
Example #13
def shogun(qclient, job_id, parameters, out_dir):
    """Run Shogun with the given parameters

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to run Shogun
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    bool, list, str
        The results of the job
    """
    # Step 1 get the rest of the information needed to run Shogun
    qclient.update_job_step(job_id, "Step 1 of 6: Collecting information")
    artifact_id = parameters['input']
    del parameters['input']

    # Get the artifact filepath information
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    fps = artifact_info['files']

    # Get the artifact metadata
    prep_info = qclient.get('/qiita_db/prep_template/%s/' %
                            artifact_info['prep_information'][0])
    qiime_map = prep_info['qiime-map']

    # Step 2 converting to fna
    qclient.update_job_step(job_id,
                            "Step 2 of 6: Converting to FNA for Shogun")

    rs = fps['raw_reverse_seqs'] if 'raw_reverse_seqs' in fps else []
    samples = make_read_pairs_per_sample(fps['raw_forward_seqs'], rs,
                                         qiime_map)

    # Combining files
    comb_fp = generate_fna_file(out_dir, samples)

    # Formatting parameters
    parameters = _format_params(parameters, SHOGUN_PARAMS)

    # Step 3 align
    align_cmd = generate_shogun_align_commands(comb_fp, out_dir, parameters)
    sys_msg = "Step 3 of 6: Aligning FNA with Shogun (%d/{0})".format(
        len(align_cmd))
    success, msg = _run_commands(qclient, job_id, align_cmd, sys_msg,
                                 'Shogun Align')

    if not success:
        return False, None, msg

    # Step 4 taxonomic profile
    assign_cmd, profile_fp = generate_shogun_assign_taxonomy_commands(
        out_dir, parameters)
    sys_msg = "Step 4 of 6: Taxonomic profile with Shogun (%d/{0})".format(
        len(assign_cmd))
    success, msg = _run_commands(qclient, job_id, assign_cmd, sys_msg,
                                 'Shogun taxonomy assignment')
    if not success:
        return False, None, msg

    sys_msg = "Step 5 of 6: Compressing and converting alignment to BIOM"
    qclient.update_job_step(job_id, msg)
    alignment_fp = join(
        out_dir, 'alignment.%s.%s' %
        (parameters['aligner'], ALN2EXT[parameters['aligner']]))
    xz_cmd = 'xz -9 -T%s %s' % (parameters['threads'], alignment_fp)
    std_out, std_err, return_value = system_call(xz_cmd)
    if return_value != 0:
        error_msg = ("Error during %s:\nStd out: %s\nStd err: %s"
                     "\n\nCommand run was:\n%s" %
                     (sys_msg, std_out, std_err, xz_cmd))
        return False, None, error_msg
    output = run_shogun_to_biom(profile_fp, [None, None, None, True], out_dir,
                                'profile')

    ainfo = [
        ArtifactInfo('Shogun Alignment Profile', 'BIOM',
                     [(output, 'biom'), ('%s.xz' % alignment_fp, 'log')])
    ]

    # Step 6 redistribute profile
    sys_msg = "Step 6 of 6: Redistributed profile with Shogun (%d/{0})"
    levels = ['phylum', 'genus', 'species']
    redist_fps = []
    for level in levels:
        redist_cmd, output = generate_shogun_redist_commands(
            profile_fp, out_dir, parameters, level)
        redist_fps.append(output)
        success, msg = _run_commands(qclient, job_id, redist_cmd,
                                     sys_msg.format(len(redist_cmd)),
                                     'Shogun redistribute')
        if not success:
            return False, None, msg
    # Converting redistributed files to biom
    for redist_fp, level in zip(redist_fps, levels):
        biom_in = ["redist", None, '', True]
        output = run_shogun_to_biom(redist_fp, biom_in, out_dir, level,
                                    'redist')
        aname = 'Taxonomic Predictions - %s' % level
        ainfo.append(ArtifactInfo(aname, 'BIOM', [(output, 'biom')]))

    return True, ainfo, ""
Example #14
def _summary_not_demultiplexed(artifact_type, filepaths):
    """Generates the HTML summary for non Demultiplexed artifacts

    Parameters
    ----------
    artifact_type : str
        The artifact type
    filepaths : dict of {str: list of str}
        A dictionary of the artifact files, keyed by filepath type

    Returns
    -------
    str
        An HTML table summarizing the artifact files
    """
    # loop over each of the fps/fps_type pairs
    artifact_information = []
    errors = []
    df = None
    for fps_type, fps in sorted(filepaths.items()):
        if fps_type in {'html_summary'}:
            continue
        # generate the per-file information for the HTML summary
        # md5, from http://stackoverflow.com/a/3431838
        for i, fp in enumerate(fps):
            fn = basename(fp)
            with open(fp, "rb") as f:
                hash_md5 = md5()
                for chunk in iter(lambda: f.read(4096), b""):
                    hash_md5.update(chunk)
            data = {
                'filename': fn,
                'md5': hash_md5.hexdigest(),
                'file_type': fps_type
            }

            if artifact_type not in FILEPATH_TYPE_NO_FQTOOLS:
                # check if the validate summary is present
                if i == 0:
                    fdata = f'{dirname(fp)}/qtp-sequencing-validate-data.csv'
                    if exists(fdata):
                        df = pd.read_csv(fdata, index_col=None)

                if df is None:
                    cmd = f'fqtools count {fp}'
                    std_out, std_err, return_value = system_call(cmd)
                    if std_err or return_value != 0:
                        errors.append(f'{fn}: {std_err}')
                        reads = None
                    else:
                        reads = int(std_out)
                else:
                    reads = df[(df.filename == fn)
                               & (df.file_type == fps_type)]
                    # [0] there is only one value
                    reads = reads.reads.values[0]
                data['reads'] = reads

            artifact_information.append(data)

    if errors:
        raise ValueError('Found errors:\n%s' % '\n'.join(errors))

    df = pd.DataFrame(artifact_information)
    order = ['file_type', 'reads'] if 'reads' in df.columns else ['file_type']
    df.sort_values(order, inplace=True)

    return df.to_html(index=False)
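# The chunked md5 pattern above, extracted into a standalone helper for
# clarity (pattern from http://stackoverflow.com/a/3431838); the helper name
# is ours, not the plugin's.
from hashlib import md5


def file_md5(fp, chunk_size=4096):
    """Compute the md5 checksum of a file without loading it into memory."""
    hash_md5 = md5()
    with open(fp, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()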
Example #15
def pcoa(qclient, job_id, parameters, out_dir):
    """generate pcoa calculations

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values for pcoa
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job
    """
    out_dir = join(out_dir, 'pcoa')
    if not exists(out_dir):
        mkdir(out_dir)

    qclient.update_job_step(job_id, "Step 1 of 4: Collecting information")
    artifact_id = parameters['Distance matrix']
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    dm_fp = artifact_info['files']['plain_text'][0]
    dm_qza = join(out_dir, 'q2-distance.qza')
    pcoa_qza = join(out_dir, 'q2-pcoa.qza')

    qclient.update_job_step(
        job_id, "Step 2 of 4: Converting Qiita artifacts to Q2 artifact")
    cmd = ('qiime tools import --input-path %s --output-path %s '
           '--type "DistanceMatrix"' % (dm_fp, dm_qza))
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error converting distance matrix:\nStd out: %s\n"
                     "Std err: %s" % (std_out, std_err))
        return False, None, error_msg

    qclient.update_job_step(
        job_id, "Step 3 of 4: Calculating pcoa")
    cmd = ('qiime diversity pcoa --i-distance-matrix %s --o-pcoa %s' % (
        dm_qza, pcoa_qza))

    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error in PCoA\nStd out: %s\nStd err: %s"
                     % (std_out, std_err))
        return False, None, error_msg

    qclient.update_job_step(
        job_id, "Step 4 of 4: Converting Q2 to Qiita artifacts")
    fdir = join(out_dir, 'pcoa')
    ffp = join(fdir, 'ordination.txt')
    cmd = "qiime tools export --output-dir %s %s" % (fdir, pcoa_qza)
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error in Q2 -> Qiita conversion:\nStd out: "
                     "%s\nStd err: %s" % (std_out, std_err))
        return False, None, error_msg

    ainfo = [ArtifactInfo('Ordination results', 'ordination_results',
                          [(ffp, 'plain_text')])]
    return True, ainfo, ""
Example #16
 def test_system_call(self):
     obs_out, obs_err, obs_val = system_call("pwd")
     self.assertEqual(obs_out, "%s\n" % getcwd())
     self.assertEqual(obs_err, "")
     self.assertEqual(obs_val, 0)
Example #17
def alpha_diversity(qclient, job_id, parameters, out_dir):
    """generate alpha diversity calculations

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values for alpha diversity
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job
    """
    out_dir = join(out_dir, 'alpha_diversity')
    if not exists(out_dir):
        mkdir(out_dir)

    qclient.update_job_step(job_id, "Step 1 of 4: Collecting information")
    artifact_id = parameters['BIOM table']
    metric = ALPHA_DIVERSITY_METRICS[parameters['Diversity metric']]
    tree = parameters['Phylogenetic tree']
    if tree == 'None':
        tree = None
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    biom_fpi = artifact_info['files']['biom'][0]
    biom_qza = join(out_dir, 'q2-biom.qza')

    qclient.update_job_step(
        job_id, "Step 2 of 4: Converting Qiita artifacts to Q2 artifact")
    # converting biom
    cmd = ('qiime tools import --input-path %s --output-path %s '
           '--type "FeatureTable[Frequency]' % (biom_fpi, biom_qza))
    b = load_table(biom_fpi)
    counts = list(map(sum, b.iter_data()))
    if min(counts) == max(counts):
        cmd += " % Properties(['uniform-sampling'])\""
    else:
        cmd += '"'
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error converting biom:\nStd out: %s\nStd err: %s"
                     % (std_out, std_err))
        return False, None, error_msg
    # converting tree
    if tree is not None:
        qza_tree = join(out_dir, 'tree.qza')
        cmd = ('qiime tools import --input-path %s --type Phylogeny[Rooted] '
               '--output-path %s' % (tree, qza_tree))
        tree = qza_tree
        std_out, std_err, return_value = system_call(cmd)
        if return_value != 0:
            error_msg = ("Error converting tree:\nStd out: %s\nStd err: %s"
                         % (std_out, std_err))
            return False, None, error_msg

    qclient.update_job_step(
        job_id, "Step 3 of 4: Calculating alpha diversity: %s" % (metric))
    alpha_fp = join(out_dir, '%s.qza' % metric)
    if tree is not None and metric in ALPHA_PHYLOGENETIC_METRICS:
        cmd = 'qiime diversity alpha-phylogenetic --i-phylogeny %s ' % tree
    elif metric not in ALPHA_PHYLOGENETIC_METRICS and tree is None:
        cmd = 'qiime diversity alpha '
    else:
        return False, None, ('Phylogenetic metric %s selected but no tree '
                             'exists' % metric)
    cmd += '--i-table %s --p-metric %s --o-alpha-diversity %s' % (
        biom_qza, metric, alpha_fp)

    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error in alpha div %s:\nStd out: %s\nStd err: %s"
                     % (metric, std_out, std_err))
        return False, None, error_msg

    qclient.update_job_step(
        job_id, "Step 4 of 4: Converting Q2 to Qiita artifacts")
    fdir = join(out_dir, 'alpha')
    ffp = join(fdir, 'alpha-diversity.tsv')
    cmd = "qiime tools export --output-dir %s %s" % (fdir, alpha_fp)
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error in Q2 -> Qiita conversion:\nStd out: "
                     "%s\nStd err: %s" % (std_out, std_err))
        return False, None, error_msg

    ainfo = [ArtifactInfo('Alpha vectors', 'alpha_vector',
                          [(ffp, 'plain_text')])]
    return True, ainfo, ""
Example #18
def deblur(qclient, job_id, parameters, out_dir):
    """Run deblur with the given parameters

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to run deblur
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job

    Notes
    -----
    The code will check if the artifact has a preprocessed_demux element, if
    not it will use the preprocessed_fastq. We prefer to work with the
    preprocessed_demux as running time will be greatly improved
    """
    out_dir = join(out_dir, 'deblur_out')
    # Step 1 get the rest of the information needed to run deblur
    qclient.update_job_step(job_id, "Step 1 of 3: Collecting information")
    artifact_id = parameters['seqs-fp']
    # removing input from parameters so it's not part of the final command
    del parameters['seqs-fp']

    # Get the artifact filepath information
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    fps = artifact_info['files']

    # Step 2 generating command deblur
    if 'preprocessed_demux' in fps:
        qclient.update_job_step(
            job_id, "Step 2 of 3: Generating per sample "
            "from demux (1/2)")

        if not exists(out_dir):
            mkdir(out_dir)
        split_out_dir = join(out_dir, 'split')
        if not exists(split_out_dir):
            mkdir(split_out_dir)

        # using the same number of parallel jobs as defined by the command
        n_jobs = parameters['jobs-to-start']
        # [0] because there should be only 1 file
        to_per_sample_files(fps['preprocessed_demux'][0],
                            out_dir=split_out_dir,
                            n_jobs=n_jobs)

        qclient.update_job_step(
            job_id, "Step 2 of 3: Generating per sample "
            "from demux (2/2)")
        out_dir = join(out_dir, 'deblured')
        cmd = generate_deblur_workflow_commands([split_out_dir], out_dir,
                                                parameters)
    else:
        qclient.update_job_step(job_id, "Step 2 of 3: Generating deblur "
                                "command")
        cmd = generate_deblur_workflow_commands(fps['preprocessed_fastq'],
                                                out_dir, parameters)

    # Step 3 execute deblur
    qclient.update_job_step(job_id, "Step 3 of 3: Executing deblur job")
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error running deblur:\nStd out: %s\nStd err: %s" %
                     (std_out, std_err))
        return False, None, error_msg

    # Generating artifact
    pb = partial(join, out_dir)

    # Generate the filepaths
    final_biom = pb('final.biom')
    final_seqs = pb('final.seqs.fa')
    final_biom_16s = pb('final.only-16s.biom')
    final_seqs_na = pb('final.seqs.fa.no_artifacts')

    if not exists(final_biom_16s):
        # Create an empty table. We need to send something to Qiita that is
        # a valid BIOM, so we are going to create an empty table
        t = Table([], [], [])
        with biom_open(final_biom_16s, 'w') as f:
            t.to_hdf5(f, 'qp-deblur generated')

    if not exists(final_seqs_na):
        # Same as before, create an empty sequence file so we can send it
        with open(final_seqs_na, 'w') as f:
            f.write("")

    ainfo = [
        ArtifactInfo('deblur final table',
                     'BIOM', [(final_biom, 'biom'),
                              (final_seqs, 'preprocessed_fasta')]),
        ArtifactInfo('deblur 16S only table', 'BIOM',
                     [(final_biom_16s, 'biom'),
                      (final_seqs_na, 'preprocessed_fasta')])
    ]

    return True, ainfo, ""
Example #19
def alpha_correlation(qclient, job_id, parameters, out_dir):
    """generate alpha correlation calculations

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values for alpha correlation
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job
    """
    out_dir = join(out_dir, 'alpha_correlation')
    if not exists(out_dir):
        mkdir(out_dir)

    qclient.update_job_step(job_id, "Step 1 of 3: Collecting information")
    artifact_id = parameters['Alpha vectors']
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    dm_fp = artifact_info['files']['plain_text'][0]
    dm_qza = join(out_dir, 'q2-alpha-diversity.qza')
    analysis_id = artifact_info['analysis']
    metadata = qclient.get(
        "/qiita_db/analysis/%s/metadata/" % str(analysis_id))
    metadata = pd.DataFrame.from_dict(metadata, orient='index')
    metadata_fp = join(out_dir, 'metadata.txt')
    metadata.to_csv(metadata_fp, sep='\t')
    p_method = ALPHA_CORRELATION_METHODS[parameters['Correlation method']]
    o_visualization = join(out_dir, 'alpha_correlation.qzv')

    qclient.update_job_step(
        job_id, "Step 2 of 3: Converting Qiita artifacts to Q2 artifact")
    cmd = ('qiime tools import --input-path %s --output-path %s '
           '--type "SampleData[AlphaDiversity]"' % (dm_fp, dm_qza))
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error converting distance matrix:\nStd out: %s\n"
                     "Std err: %s" % (std_out, std_err))
        return False, None, error_msg

    qclient.update_job_step(
        job_id, "Step 3 of 3: Calculating alpha correlation")
    cmd = ('qiime diversity alpha-correlation --i-alpha-diversity %s '
           '--m-metadata-file %s --p-method %s --o-visualization %s' % (
               dm_qza, metadata_fp, p_method, o_visualization))

    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error in Alpha Correlation\nStd out: %s\nStd err: %s"
                     % (std_out, std_err))
        return False, None, error_msg

    ainfo = [ArtifactInfo('Alpha correlation visualization',
                          'q2_visualization', [(o_visualization, 'qzv')])]
    return True, ainfo, ""
Example #20
def generate_insertion_trees(placements, out_dir,
                             reference_template=None,
                             reference_rename=None):
    """Generates phylogenetic trees by inserting placements into a reference

    Parameters
    ----------
    placements : dict of strings
        keys are the seqs, values are the new placements as JSON strings
    out_dir : str
        The job output directory
    reference_template : str, optional
        Filepath to the reference placement json file.
        This file can be produced via _generate_template_rename() and should
        be stored in the plugin package, because it can be reused.
        If None, it falls back to the Greengenes 13.8 99% reference.
    reference_rename : str, optional
        Similar to reference_template, but a filepath to the generated python
        renaming script to undo the name escaping post guppy.
        If None, it falls back to the Greengenes 13.8 99% reference.

    Returns
    -------
    str
        The filepath of the phylogenetic insertion tree in Newick format.

    Raises
    ------
    ValueError
        If a) the given reference_template or reference_rename files do not
        exist,
        b) the guppy binary exits with a non-zero return code,
        or c) the given rename script exits with a non-zero return code.
    """
    # test if reference file for rename script actually exists.
    file_ref_rename = qp_deblur.get_data(
        join('sepp', 'tmpl_gg13.8-99_rename-json.py'))
    if reference_rename is not None:
        file_ref_rename = reference_rename
    if not exists(file_ref_rename):
        raise ValueError("Reference rename script '%s' does not exits!" %
                         file_ref_rename)

    # create a valid placement.json file as input for guppy
    file_ref_template = qp_deblur.get_data(
        join('sepp', 'tmpl_gg13.8-99_placement.json'))
    if reference_template is not None:
        file_ref_template = reference_template
    if not exists(file_ref_template):
        raise ValueError("Reference template '%s' does not exits!" %
                         file_ref_template)
    with open(file_ref_template, 'r') as f:
        plcmnts = json.loads(f.read())

    plcmnts['placements'].extend(
        [{'p': placement, 'nm': [[sequence, 1]]}
         for sequence, placement
         in placements.items()])

    file_placements = '%s/placements.json' % out_dir
    with open(file_placements, 'w') as f:
        json.dump(plcmnts, f)

    # execute guppy
    file_tree_escaped = join(out_dir, 'insertion_tree.tre')
    std_out, std_err, return_value = system_call(
        'guppy tog %s -o %s' % (file_placements, file_tree_escaped))
    if return_value != 0:
        error_msg = ("Error running guppy:\nStd out: %s\nStd err: %s"
                     % (std_out, std_err))
        raise ValueError(error_msg)

    # execute node name re-labeling (to revert the escaping of names necessary
    # for guppy)
    file_tree = join(out_dir, 'insertion_tree.relabelled.tre')
    std_out, std_err, return_value = system_call(
        'cat %s | python %s > %s' %
        (file_tree_escaped, file_ref_rename, file_tree))
    if return_value != 0:
        error_msg = (("Error running %s:\n"
                      "Std out: %s\nStd err: %s")
                     % (file_ref_rename, std_out, std_err))
        raise ValueError(error_msg)

    # making sure that all branches in the generated tree have branch lengths
    tree = TreeNode.read(file_tree)
    for node in tree.preorder(include_self=False):
        if node.length is None:
            node.length = 0.0
    tree.write(file_tree)

    return file_tree
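# A hypothetical placements argument matching the shape consumed above: keys
# are fragment sequences, values are parsed jplace 'p' entries. The field
# order inside 'p' is an assumption; it depends on the 'fields' key of the
# template placement file.
placements = {
    'TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAG':
        [[2100, -15678.4, 0.89, 0.021, 0.003]],
}
# tree_fp = generate_insertion_trees(placements, '/tmp/out')  # needs guppy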
Example #21
def taxa_barplot(qclient, job_id, parameters, out_dir):
    """Generate taxa barplot calculations

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values for taxa barplot
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job
    """
    out_dir = join(out_dir, 'taxa_barplot')
    if not exists(out_dir):
        mkdir(out_dir)

    qclient.update_job_step(job_id, "Step 1 of 4: Collecting information")
    artifact_id = int(parameters['BIOM table'])
    artifact_info = qclient.get("/qiita_db/artifacts/%d/" % artifact_id)
    analysis_id = artifact_info['analysis']
    metadata = qclient.get(
        "/qiita_db/analysis/%s/metadata/" % str(analysis_id))
    metadata = pd.DataFrame.from_dict(metadata, orient='index')
    metadata_fp = join(out_dir, 'metadata.txt')
    metadata.to_csv(metadata_fp, sep='\t')

    biom_qza = join(out_dir, 'q2-biom.qza')
    taxonomy_txt = join(out_dir, 'q2-taxonomy.txt')
    taxonomy_qza = join(out_dir, 'q2-taxonomy.qza')
    taxa_plot_qzv = join(out_dir, 'taxa-barplot.qzv')

    # getting the biom table so we can check for taxonomies
    biom_fp = artifact_info['files']['biom'][0]
    bt = load_table(biom_fp)
    with open(taxonomy_txt, 'w') as fp:
        fp.write('Feature ID\tTaxon\n')
        for otu_id in bt.ids('observation'):
            tax = bt.metadata(id=otu_id, axis='observation')
            if tax is None:
                error_msg = ("biom table doesn't have taxonomy")
                return False, None, error_msg
            taxonomy = '; '.join(tax['taxonomy'])
            fp.write("%s\t%s\n" % (otu_id, taxonomy))

    qclient.update_job_step(
        job_id, "Step 2 of 4: Converting Qiita artifacts to Q2 artifact: BIOM")
    cmd = ('qiime tools import --input-path %s --output-path %s '
           '--type "FeatureTable[Frequency]' % (biom_fp, biom_qza))

    counts = list(map(sum, bt.iter_data()))
    if min(counts) == max(counts):
        cmd += " % Properties(['uniform-sampling'])\""
    else:
        cmd += '"'
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error converting biom:\nStd out: %s\nStd err: %s"
                     % (std_out, std_err))
        return False, None, error_msg

    qclient.update_job_step(job_id, "Step 3 of 4: Converting Qiita artifacts "
                                    "to Q2 artifact: Taxonomy")
    cmd = ('qiime tools import --input-path %s --output-path %s '
           '--type "FeatureData[Taxonomy]"' % (taxonomy_txt, taxonomy_qza))
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error converting taxonomy:\nStd out: %s\n"
                     "Std err: %s" % (std_out, std_err))
        return False, None, error_msg

    qclient.update_job_step(job_id, "Step 4 of 4: Generating summary")
    cmd = ('qiime taxa barplot --i-table %s --i-taxonomy %s '
           '--m-metadata-file %s --o-visualization %s' % (
               biom_qza, taxonomy_qza, metadata_fp, taxa_plot_qzv))
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error generating taxonomy summary:\nStd out: %s\n"
                     "Std err: %s" % (std_out, std_err))
        return False, None, error_msg

    ainfo = [ArtifactInfo('Taxa summaries visualization', 'q2_visualization',
                          [(taxa_plot_qzv, 'qzv')])]
    return True, ainfo, ""
Example #22
def generate_sepp_placements(seqs, out_dir, threads, reference_phylogeny=None,
                             reference_alignment=None):
    """Generates the sepp commands

    Parameters
    ----------
    seqs : list of str
        A list of sequences for which to generate placements
    out_dir : str
        The job output directory
    threads : int
        Number of CPU cores to use
    reference_phylogeny : str, optional
        A filepath to an alternative reference phylogeny for SEPP.
        If None, the default phylogeny is used, which is Greengenes 13.8
        99% id.
    reference_alignment : str, optional
        A filepath to an alternative reference alignment for SEPP.
        If None, the default alignment is used, which is Greengenes 13.8
        99% id.

    Returns
    -------
    dict of strings
        keys are the seqs, values are the new placements as JSON strings

    Raises
    ------
    ValueError
        If run-sepp.sh does not produce the expected placement file, which
        indicates that something failed.
    """
    # return an empty dict if no sequences have been passed to the function
    if len(seqs) < 1:
        return {}

    # Create a multiple fasta file for all input seqs
    file_input = "%s/input.fasta" % out_dir
    with open(file_input, 'w') as fh_input:
        for seq in seqs:
            fh_input.write(">%s\n%s\n" % (seq, seq))

    # execute SEPP
    run_name = 'qiita'
    param_phylogeny = ''
    if reference_phylogeny is not None:
        param_phylogeny = ' -t %s ' % reference_phylogeny
    param_alignment = ''
    if reference_alignment is not None:
        param_alignment = ' -a %s ' % reference_alignment
    # SEPP writes output into the current working directory (cwd); therefore
    # we first need to store the cwd, then move into the output directory,
    # run SEPP, and move back to the stored cwd for a clean state
    curr_pwd = environ['PWD']
    std_out, std_err, return_value = system_call(
        'cd %s && run-sepp.sh %s %s -x %s %s %s; cd %s' %
        (out_dir, file_input, run_name, threads,
         param_phylogeny, param_alignment, curr_pwd))

    # parse placements from SEPP results
    file_placements = '%s/%s_placement.json' % (out_dir, run_name)
    if exists(file_placements):
        with open(file_placements, 'r') as fh_placements:
            plcmnts = json.loads(fh_placements.read())
            return {p['nm'][0][0]: p['p'] for p in plcmnts['placements']}
    else:
        # due to the wrapper style of run-sepp.sh the actual exit code is
        # never returned and we have no way of finding out which sub-command
        # failed. Therefore, we can only assume that something went wrong by
        # not observing the expected output file.
        # If the main SEPP program fails, it reports some information in two
        # files, whose content we can read and report
        file_stderr = '%s/sepp-%s-err.log' % (out_dir, run_name)
        if exists(file_stderr):
            with open(file_stderr, 'r') as fh_stderr:
                std_err = fh_stderr.readlines()
        file_stdout = '%s/sepp-%s-out.log' % (out_dir, run_name)
        if exists(file_stdout):
            with open(file_stdout, 'r') as fh_stdout:
                std_out = fh_stdout.readlines()
        error_msg = ("Error running run-sepp.sh:\nStd out: %s\nStd err: %s"
                     % (std_out, std_err))
        raise ValueError(error_msg)
Example #23
def filter_samples(qclient, job_id, parameters, out_dir):
    """Filter samples from a table

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values for filter samples
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job
    """
    out_dir = join(out_dir, 'filter_samples')
    if not exists(out_dir):
        mkdir(out_dir)

    qclient.update_job_step(job_id, "Step 1 of 4: Collecting information")
    artifact_id = int(parameters['BIOM table'])
    p_max_frequency = int(
        parameters['Maximum feature frequency across samples'])
    p_max_features = int(parameters['Maximum features per sample'])
    p_min_frequency = int(
        parameters['Minimum feature frequency across samples'])
    p_min_features = int(parameters['Minimum features per sample'])
    p_where = parameters['SQLite WHERE-clause']

    artifact_info = qclient.get("/qiita_db/artifacts/%d/" % artifact_id)
    analysis_id = artifact_info['analysis']
    metadata = qclient.get(
        "/qiita_db/analysis/%s/metadata/" % str(analysis_id))
    metadata = pd.DataFrame.from_dict(metadata, orient='index')
    metadata_fp = join(out_dir, 'metadata.txt')
    metadata.to_csv(metadata_fp, sep='\t')

    # get just the biom file; the [0] is safe because there should be
    # exactly one
    biom_ifp = artifact_info['files']['biom'][0]
    biom_ofp = join(out_dir, 'biom.qza')

    qclient.update_job_step(
        job_id, "Step 2 of 4: Converting Qiita artifacts to Q2 artifact")
    # converting biom; note that the --type string is deliberately left
    # unterminated here: if every sample has the same total count, the type
    # is tagged with % Properties(['uniform-sampling']) before closing the
    # quote
    cmd = ('qiime tools import --input-path %s --output-path %s '
           '--type "FeatureTable[Frequency]' % (biom_ifp, biom_ofp))
    b = load_table(biom_ifp)
    counts = list(map(sum, b.iter_data()))
    if min(counts) == max(counts):
        cmd += " % Properties(['uniform-sampling'])\""
    else:
        cmd += '"'
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error converting biom:\nStd out: %s\nStd err: %s"
                     % (std_out, std_err))
        return False, None, error_msg

    qclient.update_job_step(job_id, "Step 3 of 4: Filtering")
    filter_ofp = join(out_dir, 'biom_filtered.qza')
    cmd = ('qiime feature-table filter-samples --m-metadata-file %s '
           '--o-filtered-table %s --p-max-frequency %d --p-max-features %d '
           '--p-min-frequency %d --p-min-features %d --i-table %s' % (
               metadata_fp, filter_ofp, p_max_frequency, p_max_features,
               p_min_frequency, p_min_features, biom_ofp))
    if p_where != '':
        cmd += ' --p-where "%s"' % p_where
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error in filtering samples in biom\nStd out: %s\n"
                     "Std err: %s" % (std_out, std_err))
        return False, None, error_msg

    qclient.update_job_step(
        job_id, "Step 4 of 4: Converting Q2 to Qiita artifacts")
    fdir = join(out_dir, 'filter_samples')
    ffp = join(fdir, 'feature-table.biom')
    cmd = "qiime tools export --output-dir %s %s" % (fdir, filter_ofp)
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error in Q2 -> Qiita conversion:\nStd out: "
                     "%s\nStd err: %s" % (std_out, std_err))
        return False, None, error_msg

    # After calling Qiime2, the taxonomy has been dropped from the BIOM
    # table; re-add it here
    orig = load_table(biom_ifp)
    res = load_table(ffp)

    metadata = {i: orig.metadata(i, axis='observation')
                for i in res.ids(axis='observation')}
    res.add_metadata(metadata, axis='observation')

    res_fp = join(out_dir, 'filtered.biom')
    with biom_open(res_fp, 'w') as bf:
        res.to_hdf5(bf, "Qiita's Qiime2 plugin")

    ainfo = [ArtifactInfo('o-table', 'BIOM', [(res_fp, 'biom')])]
    return True, ainfo, ""
Example #24
0
def spades_to_array(directory, output_dir, prefix_to_name, url, job_id,
                    params):
    environment = environ["ENVIRONMENT"]
    ppn = params["threads"]
    memory = params["memory"]

    # 1. create file list
    num_samples = len(prefix_to_name)
    if num_samples > 1024:
        raise ValueError('This preparation has more than 1024 samples, '
                         'which is the limit; please split in multiple.')

    files = []
    for prefix, sample_name in prefix_to_name.items():
        fps = sorted(glob(join(directory, prefix + '*')))
        # this should never occur but better to confirm
        if len(fps) != 2:
            error_msg = f'Expected two files to match "{prefix}"'
            raise ValueError(error_msg)
        files.append('\t'.join([fps[0], fps[1], prefix]))
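    # each line of files_to_process.txt (written below) is a tab-separated
    # triple that the array job splits with awk: <fwd_fp> <rev_fp> <prefix>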

    # 2. format the main command
    command = (f'spades.py --{params["type"]} -t {ppn} -m {memory} '
               f'-k {params["k-mers"]} -o $OUTDIR/$SNAME')
    if params['merging'].startswith('flash '):
        # get the read length quickly; note that we assume (1) the forward
        # and reverse reads have the same length and (2) all file pairs have
        # the same length, so we only calculate it once
        fp = glob(join(directory, list(prefix_to_name)[0] + '*'))[0]
        std_out, std_err, return_value = system_call(
            f'zcat -c {fp} | head -n 2')
        if return_value != 0:
            error_msg = (f"Error uncompressing: {fp}\n"
                         f"Std out: {std_out}\nStd err: {std_err}\n")
            raise ValueError(error_msg)
        read_length = len(std_out.split('\n')[1])
        percentage = int(params['merging'][6:-1]) / 100
        overlap = int(read_length * percentage)
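        # worked example (illustrative values): with 'flash 65%' and 150 bp
        # reads, percentage = 65 / 100 = 0.65 and overlap = int(97.5) = 97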

        command = (
            # flash
            f'flash --threads {ppn} --max-overlap={overlap} '
            '--output-directory $OUTDIR '
            '--output-prefix="$SNAME" ${FWD} ${REV} '
            '--max-mismatch-density=0.1 > $OUTDIR/${SNAME}.flash.log 2>&1'
            ' && '
            # spades
            f'{command} '
            '--merge $OUTDIR/${SNAME}.extendedFrags.fastq '
            '-1 $OUTDIR/${SNAME}.notCombined_1.fastq '
            '-2 $OUTDIR/${SNAME}.notCombined_2.fastq')
    else:
        command = '%s -1 ${FWD} -2 ${REV}' % command

    # 3. create qsub for array submission
    mqsub = [
        '#!/bin/bash', '#PBS -M [email protected]', f'#PBS -N {job_id}',
        f'#PBS -l nodes=1:ppn={ppn}', f'#PBS -l walltime={WALLTIME}',
        f'#PBS -l mem={memory}g',
        f'#PBS -o {output_dir}/{job_id}' + '_${PBS_ARRAYID}.log',
        f'#PBS -e {output_dir}/{job_id}' + '_${PBS_ARRAYID}.err',
        f'#PBS -t 1-{num_samples}%{MAX_RUNNING}',
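        # -t turns this into an array job (one task per sample) and the
        # %MAX_RUNNING suffix throttles how many tasks run concurrently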
        '#PBS -l epilogue=/home/qiita/qiita-epilogue.sh', f'cd {output_dir}',
        f'{environment}', f'OUTDIR={output_dir}/', 'date', 'hostname',
        'echo ${PBS_JOBID} ${PBS_ARRAYID}', 'offset=${PBS_ARRAYID}',
        'args=$(head -n $offset ${OUTDIR}/files_to_process.txt| tail -n 1)',
        "FWD=$(echo -e $args | awk '{ print $1 }')",
        "REV=$(echo -e $args | awk '{ print $2 }')",
        "SNAME=$(echo -e $args | awk '{ print $3 }')", f'{command}', 'date'
    ]

    # 4. create qsub to finish job in Qiita
    fqsub = [
        '#!/bin/bash', '#PBS -M [email protected]',
        f'#PBS -N merge-{job_id}', '#PBS -l nodes=1:ppn=1',
        f'#PBS -l walltime={FINISH_WALLTIME}', f'#PBS -l mem={FINISH_MEMORY}',
        f'#PBS -o {output_dir}/finish-{job_id}.log',
        f'#PBS -e {output_dir}/finish-{job_id}.err',
        '#PBS -l epilogue=/home/qiita/qiita-epilogue.sh', f'cd {output_dir}',
        f'{environment}', 'date', 'hostname', 'echo $PBS_JOBID',
        f'finish_qp_spades {url} {job_id} {output_dir}\n'
        "date"
    ]

    # write files
    with open(join(output_dir, 'files_to_process.txt'), 'w') as f:
        f.write('\n'.join(files))
    main_qsub_fp = join(output_dir, f'{job_id}.qsub')
    with open(main_qsub_fp, 'w') as job:
        job.write('\n'.join(mqsub))
        job.write('\n')
    finish_qsub_fp = join(output_dir, f'{job_id}.finish.qsub')
    with open(finish_qsub_fp, 'w') as job:
        job.write('\n'.join(fqsub))
        job.write('\n')

    return main_qsub_fp, finish_qsub_fp
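# hypothetical usage sketch (arguments are assumptions, not from the source):
#   main_fp, finish_fp = spades_to_array('/seqs', '/out', prefix_to_name,
#                                        url, 'my-job-1', params)
# the returned scripts are then submitted to the scheduler, with the finish
# script depending on the completion of the main array job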
Example #25
0
def emperor(qclient, job_id, parameters, out_dir):
    """generate emperor plot calculations

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values for pcoa
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job
    """
    out_dir = join(out_dir, 'emperor')
    if not exists(out_dir):
        mkdir(out_dir)

    qclient.update_job_step(job_id, "Step 1 of 4: Collecting information")
    artifact_id = parameters['Ordination results']
    p_custom_axis = parameters['Custom axis']
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    pcoa_fp = artifact_info['files']['plain_text'][0]

    analysis_id = artifact_info['analysis']
    metadata = qclient.get(
        "/qiita_db/analysis/%s/metadata/" % str(analysis_id))
    metadata = pd.DataFrame.from_dict(metadata, orient='index')
    metadata_fp = join(out_dir, 'metadata.txt')
    metadata.to_csv(metadata_fp, sep='\t')

    pcoa_qza = join(out_dir, 'q2-pcoa.qza')
    emperor_qzv = join(out_dir, 'q2-emperor.qzv')

    qclient.update_job_step(
        job_id, "Step 2 of 4: Converting Qiita artifacts to Q2 artifact")
    cmd = ('qiime tools import --input-path %s --output-path %s '
           '--type "PCoAResults"' % (pcoa_fp, pcoa_qza))
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error converting distance matrix:\nStd out: %s\n"
                     "Std err: %s" % (std_out, std_err))
        return False, None, error_msg

    qclient.update_job_step(
        job_id, "Step 3 of 4: Generating Emperor plot")

    cmd = ('qiime emperor plot --i-pcoa %s --o-visualization %s '
           '--m-metadata-file %s' % (pcoa_qza, emperor_qzv, metadata_fp))
    if p_custom_axis is not None and p_custom_axis not in ['None', '']:
        cmd += ' --p-custom-axis "%s"' % p_custom_axis

    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error in PCoA\nStd out: %s\nStd err: %s"
                     % (std_out, std_err))
        return False, None, error_msg

    ainfo = [ArtifactInfo('Emperor visualization', 'q2_visualization',
                          [(emperor_qzv, 'qzv')])]
    return True, ainfo, ""
Example #26
0
def _validate_multiple(qclient, job_id, prep_info, files, atype, test=False):
    """Validate and fix a new 'SFF', 'FASTQ', 'FASTA' or 'FASTA_Sanger' artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    files : dict of {str: list of str}
        The files to add to the new artifact, keyed by filepath type
    atype : str
        The type of the artifact
    test : bool, optional
        If True, this function is being called from a test

    Returns
    -------
    dict
        The results of the job
    """
    qclient.update_job_step(job_id, "Step 2: Validating '%s' files" % atype)
    req_fp_types, opt_fp_types = FILEPATH_TYPE_DICT[atype]
    all_fp_types = req_fp_types | opt_fp_types

    # Check if there is any filepath type that is not supported
    unsupported_fp_types = set(files) - all_fp_types
    if unsupported_fp_types:
        error_msg = ("Filepath type(s) %s not supported by artifact "
                     "type %s. Supported filepath types: %s" %
                     (', '.join(unsupported_fp_types), atype, ', '.join(
                         sorted(all_fp_types))))
        return False, None, error_msg

    # Check if the run_prefix column is present in the prep info
    offending = {}
    types_seen = set()
    if 'run_prefix' in prep_info[next(iter(prep_info))]:
        # We can potentially have more than one lane in the prep information
        # so check that the provided files are prefixed with the values in
        # the run_prefix column
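        # e.g. (illustrative) a run_prefix value 's1_L001' matches a file
        # named 's1_L001_R1.fastq.gz' through the startswith() check below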
        run_prefixes = set(v['run_prefix'] for k, v in prep_info.items())
        num_prefixes = len(run_prefixes)

        # Check those filepath types that are required
        for ftype, t_files in files.items():
            # SFF is a special case because we can have multiple files with
            # the same prefix
            if num_prefixes != len(t_files) and atype != 'SFF':
                offending[ftype] = (
                    "The number of provided files (%d) doesn't match the "
                    "number of run prefix values in the prep info (%d): %s" %
                    (len(t_files), num_prefixes, ', '.join(
                        basename(f) for f in t_files)))
            else:
                rps = []
                fps = []
                for fp in t_files:
                    bn = basename(fp)
                    found = [rp for rp in run_prefixes if bn.startswith(rp)]
                    if found:
                        rps.extend(found)
                    else:
                        fps.append(bn)
                if fps:
                    offending[ftype] = (
                        "The provided files do not match the run prefix "
                        "values in the prep information: %s" % ', '.join(fps))
                else:
                    rps = run_prefixes - set(rps)
                    if rps:
                        offending[ftype] = (
                            "The following run prefixes in the prep "
                            "information file do not match any file: %s" %
                            ', '.join(rps))

            types_seen.add(ftype)
    else:
        # If the run prefix column is not provided, we only allow a single
        # lane, so check that we have a single file for each provided
        # filepath type
        for ftype, t_files in files.items():
            if len(t_files) != 1:
                offending[ftype] = (
                    "Only one file per type is allowed. Please provide the "
                    "column 'run_prefix' if you need more than one file per "
                    "type: %s" % ', '.join(basename(fp) for fp in t_files))

            types_seen.add(ftype)

    # Check that all required filepath types were present
    missing = req_fp_types - types_seen
    if missing:
        error_msg = ("Missing required filepath type(s): %s" %
                     ', '.join(missing))
        return False, None, error_msg

    # Check if there was any offending file
    if offending:
        error_list = ["%s: %s" % (k, v) for k, v in offending.items()]
        error_msg = ("Error creating artifact. Offending files:\n%s" %
                     '\n'.join(error_list))
        return False, None, error_msg

    # Everything is ok
    filepaths = []
    for fps_type, fps in files.items():
        for fp in fps:
            if fps_type in MUST_GZ:
                fp, error_msg = _gzip_file(fp, test)
                if error_msg is not None:
                    return False, None, error_msg
            filepaths.append((fp, fps_type))

    # let's count sequences; this is basically the last check
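    # fqtools' count sub-command prints a single integer (the number of
    # reads in the file) to stdout, which is parsed below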
    errors = []
    artifact_information = []
    if atype not in FILEPATH_TYPE_NO_FQTOOLS:
        for fp, fpt in filepaths:
            cmd = f'fqtools count {fp}'
            std_out, std_err, return_value = system_call(cmd)
            fn = basename(fp)
            if std_err or return_value != 0:
                errors.append(f'{fn}: {std_err}')
            else:
                reads = int(std_out)
                artifact_information.append({
                    'filename': fn,
                    'reads': reads,
                    'file_type': fpt
                })

        if errors:
            raise ValueError('Found errors:\n%s' % '\n'.join(errors))
        # the summary is stored next to the last validated file
        dname = dirname(fp)
        pd.DataFrame(artifact_information).to_csv(
            f'{dname}/qtp-sequencing-validate-data.csv', index=False)

    return True, [ArtifactInfo(None, atype, filepaths)], ""
Example #27
0
def test_system_call(self):
    obs_out, obs_err, obs_val = system_call("pwd")
    self.assertEqual(obs_out, "%s\n" % getcwd())
    self.assertEqual(obs_err, "")
    self.assertEqual(obs_val, 0)