Example #1
def check_flowgram_ali_exe():
    """Check if we have a working FlowgramAligner"""
    ali_exe = get_flowgram_ali_exe()

    if which(ali_exe) is None:
        raise ApplicationNotFoundError("The alignment program %s is not "
                                       "accessible via the PATH environment "
                                       "variable." % ali_exe)

    # test if it's callable and actually works
    command = "%s -h" % ali_exe
    proc = Popen(command,
                 shell=True,
                 universal_newlines=True,
                 stdout=PIPE,
                 stderr=STDOUT)

    if proc.wait() != 0:
        raise ApplicationError(
            "Calling %s failed. Check permissions and that it is in fact an executable."
            % ali_exe)

    result = proc.stdout.read()
    # check that the help output looks as expected
    if not result.startswith("Usage"):
        raise ApplicationError(
            "Calling %s failed. The help output did not start with 'Usage'; "
            "check that the right program is on your PATH." % ali_exe)
    return True
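
The same probe pattern generalizes to other tools. A minimal self-contained sketch, assuming Python 3's shutil.which; the check_exe name and the "Usage" banner are illustrative, not part of the original module:

from shutil import which
from subprocess import Popen, PIPE, STDOUT

def check_exe(exe, banner="Usage"):
    """Raise if `exe` is missing from PATH or its -h output looks wrong."""
    if which(exe) is None:
        raise RuntimeError("%s is not on the PATH" % exe)
    proc = Popen("%s -h" % exe, shell=True, universal_newlines=True,
                 stdout=PIPE, stderr=STDOUT)
    if proc.wait() != 0:
        raise RuntimeError("calling %s failed" % exe)
    if not proc.stdout.read().startswith(banner):
        raise RuntimeError("%s printed unexpected help output" % exe)
    return True
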
    def _get_result_paths(self, data):
        """ Set the result paths """

        result = {}

        inp_file_name = str(self.Parameters['--query_NAST'].Value)
        inp_file_name = inp_file_name.strip('"')

        exec_dir = self.Parameters['--exec_dir']
        if exec_dir.isOn():
            exec_dir = str(exec_dir.Value).strip('"')

            if not inp_file_name.startswith('/'):
                # anchor the relative input path at exec_dir
                inp_file_name = exec_dir + "/" + inp_file_name

        if not exists(inp_file_name + ".CPS.CPC"):
            raise ApplicationError("Calling ChimeraSlayer failed.")

        result['CPS'] = ResultPath(Path=inp_file_name + ".CPS.CPC",
                                   IsWritten=True)
        return result
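
The quote stripping and path joining above can also be written with standard os.path helpers. A behavior-equivalent sketch; resolve_input is an illustrative name, not part of the original class:

import os.path

def resolve_input(inp_file_name, exec_dir=None):
    # strip surrounding double quotes, then anchor relative paths at exec_dir
    inp_file_name = inp_file_name.strip('"')
    if exec_dir is not None and not os.path.isabs(inp_file_name):
        inp_file_name = os.path.join(exec_dir.strip('"'), inp_file_name)
    return inp_file_name

# e.g. resolve_input('"seqs.fasta"', '/tmp/run1') -> '/tmp/run1/seqs.fasta'
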
Example #3
    def _get_base_command(self):
        if self._subcommand is None:
            raise ApplicationError('_subcommand has not been set.')
        # avoid appending the subcommand more than once
        if not self._command.endswith(self._subcommand):
            self._command = self._command_delimiter.join(
                [self._command, self._subcommand])
        return super()._get_base_command()
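
A minimal sketch of the pattern this override implements; the SubcommandApp class below is illustrative and stands in for the real application wrapper base class:

class SubcommandApp(object):
    """Toy command wrapper that appends its subcommand exactly once."""
    _command_delimiter = ' '

    def __init__(self, command, subcommand=None):
        self._command = command
        self._subcommand = subcommand

    def _get_base_command(self):
        if self._subcommand is None:
            raise ValueError('_subcommand has not been set.')
        # endswith() guards against appending the subcommand twice
        if not self._command.endswith(self._subcommand):
            self._command = self._command_delimiter.join(
                [self._command, self._subcommand])
        return self._command

# SubcommandApp('git', 'status')._get_base_command() -> 'git status'
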
Example #4
def main():
    """run denoiser on input flowgrams"""
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sff_files = opts.sff_fps

    for f in sff_files:
        if not exists(f):
            option_parser.error(('Flowgram file path does not exist:\n %s \n' +
                                 'Pass a valid one via -i.') % f)
    outdir = opts.output_dir

    create_dir(outdir, fail_on_exist=not opts.force)

    log_fh = None

    if not (opts.primer or opts.map_fname):
        raise ApplicationError("Either mapping file or primer required")
    # Read primer from Meta data file if not set on command line
    if not opts.primer:
        mapping_data, header, comments = \
            parse_mapping_file(open(opts.map_fname, "U"))

        index = header.index("LinkerPrimerSequence")
        all_primers = set(array(mapping_data)[:, index])

        if len(all_primers) != 1:
            raise ValueError(
                "Currently only data sets with one primer are allowed.\n"
                "Make separate mapping files with only one primer, re-run "
                "split_libraries and denoise with each split_library output "
                "separately.")
        primer = list(all_primers)[0]
        last_char = primer[-1]
        if last_char not in "ACGT":
            raise ValueError("We currently do not support primers with "
                             "degenerate bases at their 3' end.")

    else:
        primer = opts.primer

    centroids, cluster_mapping = fast_denoiser(opts.sff_fps,
                                               opts.fasta_fp,
                                               outdir,
                                               opts.num_cpus,
                                               primer,
                                               titanium=opts.titanium)

    # store mapping file and centroids
    result_otu_path = '%s/denoised_clusters.txt' % outdir
    of = open(result_otu_path, 'w')
    for i, cluster in cluster_mapping.iteritems():
        of.write('%s\t%s\n' % (str(i), '\t'.join(cluster)))
    of.close()

    result_fasta_path = '%s/denoised_seqs.fasta' % outdir
    oh = open(result_fasta_path, 'w')
    write_Fasta_from_name_seq_pairs(centroids, oh)
    oh.close()
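
Note that cluster_mapping.iteritems() above is Python 2. A Python 3 sketch of the same cluster dump, using .items() and a context manager so the handle is closed automatically:

with open(result_otu_path, 'w') as of:
    for i, cluster in cluster_mapping.items():
        of.write('%s\t%s\n' % (i, '\t'.join(cluster)))
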
    def _input_as_parameters(self, data):
        """ Set the input paths (a NAST aligned fasta filepath)
        """
        # The list of values which can be passed on a per-run basis
        allowed_values = ['--query_NAST', '--db_NAST', '--db_FASTA', '-R']

        unsupported_parameters = set(data.keys()) - set(allowed_values)
        if unsupported_parameters:
            raise ApplicationError(
                "Unsupported parameter(s) passed when calling ChimeraSlayer: %s" %
                ' '.join(unsupported_parameters))

        return ''
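
For illustration, the whitelist check above is plain set arithmetic ('--bogus' is a made-up parameter):

allowed_values = ['--query_NAST', '--db_NAST', '--db_FASTA', '-R']
data = {'--query_NAST': 'q.fasta', '--bogus': 1}
print(set(data.keys()) - set(allowed_values))  # prints the unsupported name, '--bogus'
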
    def setUp(self):
        """ """
        self.files_to_remove = []
        self.dirs_to_remove = []

        tmp_dir = get_qiime_temp_dir()
        self.test_out = mkdtemp(
            dir=tmp_dir,
            prefix='qiime_parallel_taxonomy_assigner_tests_',
            suffix='')
        self.dirs_to_remove.append(self.test_out)

        # Temporary input file
        fd, self.tmp_seq_filepath = mkstemp(
            dir=self.test_out,
            prefix='qiime_parallel_taxonomy_assigner_tests_input',
            suffix='.fasta')
        close(fd)
        seq_file = open(self.tmp_seq_filepath, 'w')
        seq_file.write(rdp_test_seqs)
        seq_file.close()
        self.files_to_remove.append(self.tmp_seq_filepath)

        self.id_to_taxonomy_file = NamedTemporaryFile(
            prefix='qiime_parallel_taxonomy_assigner_tests_id_to_taxonomy',
            suffix='.txt',
            dir=tmp_dir)
        self.id_to_taxonomy_file.write(rdp_id_to_taxonomy)
        self.id_to_taxonomy_file.seek(0)

        self.reference_seqs_file = NamedTemporaryFile(
            prefix='qiime_parallel_taxonomy_assigner_tests_ref_seqs',
            suffix='.fasta',
            dir=tmp_dir)
        self.reference_seqs_file.write(rdp_reference_seqs)
        self.reference_seqs_file.seek(0)

        jar_fp = getenv('RDP_JAR_PATH')
        if jar_fp is None:
            raise ApplicationError("RDP_JAR_PATH is not set.")
        jar_basename = basename(jar_fp)
        if '2.2' not in jar_basename:
            raise ApplicationError(
                "RDP_JAR_PATH does not point to version 2.2 of the "
                "RDP Classifier.")

        initiate_timeout(60)
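
The files_to_remove/dirs_to_remove bookkeeping implies a matching tearDown. A typical sketch; remove_files and disable_timeout are assumed QIIME test helpers and may be named differently:

    def tearDown(self):
        """Clean up everything setUp created."""
        disable_timeout()  # assumed counterpart of initiate_timeout()
        remove_files(self.files_to_remove)  # assumed helper
        # remove directories last, after the files inside them are gone
        for d in self.dirs_to_remove:
            if exists(d):
                rmtree(d)
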
def denoise_seqs(sff_fps,
                 fasta_fp,
                 tmpoutdir,
                 preprocess_fp=None,
                 cluster=False,
                 num_cpus=1,
                 squeeze=True,
                 percent_id=0.97,
                 bail=1,
                 primer="",
                 low_cutoff=3.75,
                 high_cutoff=4.5,
                 log_fp="denoiser.log",
                 low_memory=False,
                 verbose=False,
                 error_profile=DENOISER_DATA_DIR + 'FLX_error_profile.dat',
                 max_num_rounds=None,
                 titanium=False,
                 checkpoint_fp=None):
    """The main routine to denoise flowgrams"""

    # abort if binary is missing
    check_flowgram_ali_exe()

    if verbose:
        # switch off buffering for the log file
        log_fh = open(tmpoutdir + "/" + log_fp, "w", 0)
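        # NOTE: fully unbuffered text mode ("w", 0) is Python 2 only; under
        # Python 3, buffering=0 requires binary mode, and the closest
        # text-mode equivalent is line buffering: open(path, "w", buffering=1)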
    else:
        log_fh = None

    # overwrite settings if titanium is set
    # This flag is only used from qiime. Remove after qiime integration
    if titanium:
        error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
        low_cutoff = 4
        high_cutoff = 5

    if verbose:
        log_fh.write("Denoiser version: %s\n" % __version__)
        log_fh.write("SFF files: %s\n" % ', '.join(sff_fps))
        log_fh.write("Fasta file: %s\n" % fasta_fp)
        log_fh.write("Preprocess dir: %s\n" % preprocess_fp)
        if checkpoint_fp:
            log_fh.write("Resuming denoiser from %s\n" % checkpoint_fp)
        log_fh.write("Primer sequence: %s\n" % primer)
        log_fh.write("Running on cluster: %s\n" % cluster)
        log_fh.write("Num CPUs: %d\n" % num_cpus)
        log_fh.write("Squeeze Seqs: %s\n" % squeeze)
        log_fh.write("tmpdir: %s\n" % tmpoutdir)
        log_fh.write("percent_id threshold: %.2f\n" % percent_id)
        log_fh.write("Minimal sequence coverage for first phase: %d\n" % bail)
        log_fh.write("Low cut-off: %.2f\n" % low_cutoff)
        log_fh.write("High cut-off: %.2f\n" % high_cutoff)
        log_fh.write("Error profile: %s\n" % error_profile)
        log_fh.write("Maximal number of iteration: %s\n\n" % max_num_rounds)

    # here we go ...
    # Phase I - clean up and truncate input sff
    if checkpoint_fp:
        if preprocess_fp:
            # resuming from a checkpoint reuses previously preprocessed data
            (deprefixed_sff_fp, l, mapping,
             seqs) = read_preprocessed_data(preprocess_fp)
        else:
            raise ApplicationError(
                "Resuming from checkpoint requires --preprocess option")

    else:
        if preprocess_fp:
            # we already have preprocessed data, so use it
            (deprefixed_sff_fp, l, mapping,
             seqs) = read_preprocessed_data(preprocess_fp)
        elif cluster:
            preprocess_on_cluster(sff_fps,
                                  log_fp,
                                  fasta_fp=fasta_fp,
                                  out_fp=tmpoutdir,
                                  verbose=verbose,
                                  squeeze=squeeze,
                                  primer=primer)
            (deprefixed_sff_fp, l, mapping,
             seqs) = read_preprocessed_data(tmpoutdir)
        else:
            (deprefixed_sff_fp, l, mapping, seqs) = \
                preprocess(
                    sff_fps, log_fh, fasta_fp=fasta_fp, out_fp=tmpoutdir,
                    verbose=verbose, squeeze=squeeze, primer=primer)

        # the preprocessor writes to the same log file, so reopen it in append mode
        if verbose:
            log_fh.close()
            log_fh = open(tmpoutdir + "/" + log_fp, "a", 0)

    # Phase II:
    # use prefix map based clustering as initial centroids and greedily
    # add flowgrams to clusters with a low threshold

    (new_sff_file, bestscores, mapping) = \
        greedy_clustering(deprefixed_sff_fp, seqs, mapping, tmpoutdir, l,
                          log_fh, num_cpus=num_cpus, on_cluster=cluster,
                          bail_out=bail, pair_id_thresh=percent_id,
                          threshold=low_cutoff, verbose=verbose,
                          fast_method=not low_memory,
                          error_profile=error_profile,
                          max_num_rounds=max_num_rounds,
                          checkpoint_fp=checkpoint_fp)

    # Phase III:
    # Assign seqs to nearest existing centroid with high threshold
    secondary_clustering(new_sff_file,
                         mapping,
                         bestscores,
                         log_fh,
                         verbose=verbose,
                         threshold=high_cutoff)
    remove(new_sff_file)
    if verbose:
        log_fh.write("Finished clustering\n")
        log_fh.write("Writing Clusters\n")
        log_fh.write(make_stats(mapping) + "\n")
    store_clusters(mapping, deprefixed_sff_fp, tmpoutdir)
    store_mapping(mapping, tmpoutdir, "denoiser")
def filter_with_flowgram(id,
                         flowgram,
                         flowgrams,
                         header,
                         ids,
                         num_flows,
                         bestscores,
                         log_fh,
                         outdir="/tmp/",
                         threshold=3.75,
                         num_cpus=32,
                         fast_method=True,
                         on_cluster=False,
                         mapping=None,
                         spread=[],
                         verbose=False,
                         pair_id_thresh=0.97,
                         client_sockets=[],
                         error_profile=DENOISER_DATA_DIR +
                         'FLX_error_profile.dat'):
    """Filter all files in flows_filename with flowgram and split according to threshold.

    id: The flowgram identifier of the master flowgram of this round

    flowgram: This flowgram is used to filter all the other flowgrams

    flowgrams: iterator containing the flowgrams to be filtered

    header: a valid sff.txt header

    ids: this list marks the active flowgrams, i.e. flowgrams that are unclustered

    num_flows: Number of flows remaining in the current round

    bestscores: dictionary that stores for each unclustered flowgram the best
                score it has to one of the centroids previously seen
                and the id of the centroid. Used in the second denoising phase.

    outdir: directory where intermediate and result files go

    threshold: Filtering threshold

    num_cpus: number of cpus to run on, if on_cluster == True

    fast_method: Boolean value for fast denoising with lots of memory

    on_cluster: Boolean flag for local vs cluster

    mapping: the current cluster mapping

    spread: worker processing throughput

    error_profile: Path to error profile *.dat file


    Implementation detail:
    The iterator behind 'flowgrams' is big and thus we want to keep its traversals
    to a minimum. The naive implementation of this filter function would traverse the
    iterator once to create the input file for the alignment routine, then a second
    time to do the actual filtering. To get rid of the second run through the iterator,
    we keep a list (in fact a dict) of active 'ids' and do the filtering only in the next
    round. A cleaner but still fast solution would be great, as this definitely poses a
    pitfall for future modifications.

    Returns filename of file containing all non-filtered flows and the number of flows
    """
    if verbose:
        log_fh.write("Filtering with %s: %d flowgrams\n" % (id, num_flows))

    # set up the flowgram storage
    if not fast_method:
        fc = FlowgramContainerFile(header, outdir)
    else:
        fc = FlowgramContainerArray()

    # calculate distance scores
    if on_cluster:
        (scores, names, flowgrams) =\
            get_flowgram_distances_on_cluster(
                id, flowgram, flowgrams, fc, ids, num_cpus,
                num_flows, spread=spread, client_sockets=client_sockets)
    else:
        (scores, names, flowgrams) =\
            get_flowgram_distances(
                id, flowgram, flowgrams, fc, ids, outdir=outdir,
                error_profile=error_profile)

    # shortcut for non-matching flowgrams
    survivors = filter(
        lambda a_b: a_b[0] < threshold or a_b[1] >= pair_id_thresh, scores)
    if len(survivors) == 0:
        # put it in its own cluster
        # and remove it from any further searches
        if id in bestscores:
            del bestscores[id]
        del ids[id]
        return (flowgrams, num_flows - 1)

    # Do the filtering
    non_clustered_ctr = 0
    for ((score, pair_id), name) in zip(scores, names):
        if score < threshold or name == id or pair_id >= pair_id_thresh:
            # make sure the original flowgram gets into this cluster
            del ids[name]
            if name in bestscores:
                del bestscores[name]
            if id != name:
                # update the mapping information
                mapping[id].extend(mapping[name])
                mapping[id].append(name)
                # delete the old cluster from the mapping
                del mapping[name]
        else:
            non_clustered_ctr += 1
            # keep track of this flowgram's best match to any centroid
            if name not in bestscores or score < bestscores[name][1]:
                bestscores[name] = (id, score)

    # Some extra safety that we are not missing anything
    if len(ids) != non_clustered_ctr or len(bestscores) != non_clustered_ctr:
        raise ApplicationError("filter_with_flowgram failed")

    return (flowgrams, non_clustered_ctr)
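
The "active ids" bookkeeping described in the docstring, reduced to a standalone sketch. All names are illustrative; score stands in for the flowgram alignment step:

def cluster_round(centroid_id, flows, active, mapping, score, threshold):
    """One filtering pass that defers removal to the membership test."""
    survivors = 0
    for name, flow in flows:
        if name not in active:
            continue  # filtered in an earlier round; no file rewrite needed
        if score(flow) < threshold:
            del active[name]                   # absorbed into the centroid
            mapping[centroid_id].append(name)
        else:
            survivors += 1
    return survivors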