Example #1
0
def set_sff_trimpoints_with_sfftools(
        sff_dir, technical_lengths, sffinfo_path='sffinfo', sfffile_path='sfffile',
        debug=False):
    """Set trimpoints to end of technical read for all SFF files in directory.

    This function essentially provides the reference implementation.
    It uses the official sfftools from Roche to process the SFF files.

    sff_dir: directory passed to get_per_lib_sff_fps, which yields
     (library id, SFF filepath) pairs

    technical_lengths: mapping of library id to the length of the technical
     read; libraries without an entry are skipped

    sffinfo_path, sfffile_path: name or path of the sfftools executables

    debug: if True, print each external command before running it

    Every processed SFF file is replaced by its trimmed version.  The
    '<sff_fp>.trim' file holding the trim points is left next to it.

    Raises ApplicationNotFoundError if either executable cannot be found.
    """
    if not (exists(sffinfo_path) or which(sffinfo_path)):
        raise ApplicationNotFoundError(
            'sffinfo executable not found. Is it installed and in your $PATH?')
    if not (exists(sfffile_path) or which(sfffile_path)):
        raise ApplicationNotFoundError(
            'sfffile executable not found. Is it installed and in your $PATH?')

    for lib_id, sff_fp in get_per_lib_sff_fps(sff_dir):
        try:
            readlength = technical_lengths[lib_id]
        except KeyError:
            continue

        sffinfo_args = [sffinfo_path, '-s', sff_fp]
        if debug:
            print("Running sffinfo command %s" % sffinfo_args)

        # Collect the length reported in each FASTA header line, e.g.
        # ">read_id length=123 ...".  The temporary file is always closed,
        # even if sffinfo fails.
        seqlengths = {}
        sffinfo_output_file = TemporaryFile()
        try:
            check_call(sffinfo_args, stdout=sffinfo_output_file)
            sffinfo_output_file.seek(0)
            for line in sffinfo_output_file:
                if line.startswith('>'):
                    fields = line[1:].split()
                    seqlengths[fields[0]] = fields[1].split('=')[1]
        finally:
            sffinfo_output_file.close()

        # Sfftools use 1-based index
        left_trim = readlength + 1

        trim_fp = sff_fp + '.trim'
        with open(trim_fp, 'w') as trim_file:
            for id_, length in seqlengths.items():
                curr_length = int(length)
                # Key sequence not included in FASTA length
                right_trim = curr_length + 4
                if curr_length > left_trim:
                    trim_file.write(
                        "%s\t%s\t%s\n" % (id_, left_trim, right_trim))
                else:
                    # One message per line, reporting both trim points.
                    stderr.write(
                        'Rejected read %s with trim points %s and %s (orig '
                        'length %s)\n' % (id_, left_trim, right_trim, length))

        trimmed_sff_fp = sff_fp + '.trimmed'
        sfffile_args = [
            sfffile_path, '-t', trim_fp, '-o', trimmed_sff_fp, sff_fp]
        if debug:
            print("Running sfffile command: %s" % (sfffile_args,))
        with open(devnull, 'w') as null_out:
            check_call(sfffile_args, stdout=null_out)
        remove(sff_fp)
        rename(trimmed_sff_fp, sff_fp)
Example #2
0
 def _error_on_missing_application(self, params):
     """Check that RNAforester and RNAshapes can be located.

     Raises ApplicationNotFoundError for the first of the two that
     app_path does not find.
     """
     requirements = (
         ('RNAforester',
          "Cannot find RNAforester. Is it installed? Is it in your path?"),
         ('RNAshapes',
          "Cannot find RNAshapes. Is it installed? Is it in your path?"),
     )
     for executable, complaint in requirements:
         if not app_path(executable):
             raise ApplicationNotFoundError(complaint)
Example #3
0
 def _error_on_missing_application(self, params):
     """Check that the jar file returned by _get_jar_fp exists.

     Raises ApplicationNotFoundError when that path does not exist.
     """
     jar_fp = self._get_jar_fp()
     if exists(jar_fp):
         return
     raise ApplicationNotFoundError(
         "Cannot find jar file. Is it installed? Is $RDP_JAR_PATH"
         " set correctly?")
Example #4
0
def raise_gdata_not_found_error(*args, **kwargs):
    """Raise ApplicationNotFoundError explaining that gdata is missing.

    Any positional or keyword arguments are accepted and ignored.
    """
    message = (
        "gdata cannot be found.\nIs it installed? "
        "Is it in your $PYTHONPATH?\nThis is an optional QIIME "
        "dependency, but is required if you plan to use QIIME's remote "
        "mapping file features. For more information, please see "
        "http://qiime.org/install/install.html.")
    raise ApplicationNotFoundError(message)
Example #5
0
def check_flowgram_ali_exe():
    """Check if we have a working FlowgramAligner.

    Returns True when the executable named by get_flowgram_ali_exe() is
    found by `which`, exits with status 0 when called with -h, and prints
    output starting with "Usage".

    Raises ApplicationNotFoundError if `which` cannot locate the executable.
    Raises ApplicationError if the -h call exits non-zero or its output does
    not start with "Usage".
    """
    ali_exe = get_flowgram_ali_exe()

    if which(ali_exe) is None:
        raise ApplicationNotFoundError("The alignment program %s is not "
                                       "accessible via the PATH environment "
                                       "variable." % ali_exe)

    # test if its callable and actually works
    command = "%s -h" % ali_exe
    proc = Popen(command,
                 shell=True,
                 universal_newlines=True,
                 stdout=PIPE,
                 stderr=STDOUT)

    # communicate() drains the pipe while waiting for the child.  Calling
    # wait() before reading can block forever once the pipe buffer fills.
    result, _ = proc.communicate()

    # Both a failing exit status and an unexpected help string are reported
    # with the same message.
    if proc.returncode != 0 or not result.startswith("Usage"):
        raise ApplicationError(
            "Calling %s failed. Check permissions and that it is in fact an executable."
            % ali_exe)
    return True
Example #6
0
    def _error_on_missing_application(self,params):
        """Verify that both the java runtime and the RDP jar file are present.

        Raises ApplicationNotFoundError if java is neither an existing path
        nor found by app_path, if _get_jar_fp returns None, or if the jar
        path it returns does not exist.
        """
        java_found = os.path.exists('java') or app_path('java')
        if not java_found:
            raise ApplicationNotFoundError(
                "Cannot find java runtime. Is it installed? Is it in your "
                "path?")

        jar_fp = self._get_jar_fp()
        if jar_fp is None:
            raise ApplicationNotFoundError(
                "JAR file not found in current directory and the RDP_JAR_PATH "
                "environment variable is not set.  Please set RDP_JAR_PATH to "
                "the full pathname of the JAR file.")

        if os.path.exists(jar_fp):
            return
        raise ApplicationNotFoundError(
            "JAR file %s does not exist." % jar_fp)
Example #7
0
    def test_blastall_fp(self):
        """blastall_fp is set to a valid path"""
        configured = self.config["blastall_fp"]
        is_absolute = self.config["blastall_fp"].startswith("/")
        if not is_absolute:
            # A relative value is resolved through app_path and the config
            # entry is replaced with the result.
            resolved = app_path(configured)
            if not resolved:
                raise ApplicationNotFoundError("blastall_fp set to %s, but is not in your PATH. Either use an absolute path to or put it in your PATH." % configured)
            self.config["blastall_fp"] = resolved

        test_qiime_config_variable("blastall_fp", self.config, self, X_OK)
Example #8
0
def submit_jobs(commands, prefix):
    """submit jobs using exe pointed to by cluster_jobs_fp.

    commands: List of commands (strings) that should be executed

    prefix: A uniq prefix used to name submit script

    The commands are written, one per line, to '<prefix>_commands.txt' in
    the QIIME temp dir.  That file is passed to the cluster_jobs_fp script
    with the -ms option and removed afterwards.

    Raises ApplicationNotFoundError if cluster_jobs_fp is unset, or is
    neither an existing path nor found by `which`.
    """
    qiime_config = load_qiime_config()
    cluster_jobs_fp = qiime_config['cluster_jobs_fp']

    if not cluster_jobs_fp:
        raise ApplicationNotFoundError(
            "cluster_jobs_fp not set in config file!")
    if not (exists(cluster_jobs_fp) or which(cluster_jobs_fp)):
        raise ApplicationNotFoundError(
            "cluster_jobs_fp not in $PATH or provided as full path!")

    outfilename = join(get_qiime_temp_dir(), "%s_commands.txt" % prefix)
    # The with-block closes the handle even if the write fails.
    with open(outfilename, "w") as fh:
        fh.write("\n".join(commands))

    cmd = '%s -ms %s %s' % (cluster_jobs_fp, outfilename, prefix)
    try:
        system(cmd)
    finally:
        # Remove the command file even if the call is interrupted.
        remove(outfilename)
Example #9
0
def wait_for_cluster_ids(ids, interval=10):
    """Block until qstat no longer reports any of the given jobs.

    ids:  list of ids to wait for

    interval: time to sleep in seconds

    A job counts as still present while the output of "qstat <id>" starts
    with "Job".  Raises ApplicationNotFoundError if qstat is not found.

    NOT USED ANYMORE
    """
    if not which("qstat"):
        raise ApplicationNotFoundError("qstat not available. Is it installed?\n" +
                                       "This test may fail if not run on a cluster.")

    for job_id in ids:
        while True:
            status = getoutput("qstat %s" % job_id)
            if not status.startswith("Job"):
                break
            sleep(interval)
Example #10
0
def submit_jobs(filenames, verbose=False):
    """Submit jobs in filenames.

    filenames: list of prepared qsub job scripts, ready to be submitted

    verbose: a binary verbose flag; when set, the combined stdout/stderr
     of each qsub call is printed

    Raises ApplicationNotFoundError if qsub is not found by `which`.
    """
    if not which("qsub"):
        raise ApplicationNotFoundError("qsub not found. Can't submit jobs.")

    for job_fp in filenames:
        command = 'qsub %s' % job_fp
        proc = Popen(command, shell=True, universal_newlines=True,
                     stdout=PIPE, stderr=STDOUT)
        # communicate() reads all output, closes the pipe and waits for the
        # child, so no unreaped process or open descriptor is left behind.
        result = proc.communicate()[0]
        if verbose:
            print(result)
Example #11
0
def run_pyfeast(data, labels, features, method='MIM', n_select=15):
    """
        run_pyfeast(data, labels, method)
        @data - numpy data (dense)
        @labels - vector of class labels (discrete)
        @features - list of feature names
        @method - name of the feature selection function looked up on
            the feast module
        @n_select - number of features to select

        Returns the entries of @features picked out by the indices that
        the selected feast function returns.

        Feature selection is delegated to PyFeast, which is based on the
        FEAST C variable selection toolbox.

        Reference:
        Gavin Brown, Adam Pocock, Ming-Jie Zhao, and Mikel Lujan, 
            "Conditional Likelihood Maximisation: A Unifying Framework 
            for Information Theoretic Feature Selection," Journal of 
            Machine Learning Research, vol. 13, pp. 27--66, 2012.
            (http://jmlr.csail.mit.edu/papers/v13/brown12a.html)
    """

    # PyFeast is imported lazily so that its absence only matters here.
    try:
        import feast
    except ImportError:
        raise ApplicationNotFoundError(
            "Error loading the PyFeast module. It is likely that you do not have PyFeast installed locally."
        )

    try:
        selector = getattr(feast, method)
    except AttributeError:
        raise AttributeError(
            "Unknown feature selection method is being specified for PyFeast. Make sure the feature selection method being selected is a valid one. "
        )

    transposed_len = len(data.transpose())
    if transposed_len < n_select:
        raise ValueError(
            "n_select must be less than the number of observations.")
    if n_select <= 0:
        raise ValueError("n_select cannot be less than or equal to zero.")

    chosen = selector(data, labels, n_select)
    return [features[int(index)] for index in chosen]
Example #12
0
def get_clusters_from_fasta_filepath(fasta_filepath,
                                     original_fasta_path,
                                     percent_ID=0.97,
                                     max_accepts=1,
                                     max_rejects=8,
                                     stepwords=8,
                                     word_length=8,
                                     optimal=False,
                                     exact=False,
                                     suppress_sort=False,
                                     output_dir=None,
                                     enable_rev_strand_matching=False,
                                     subject_fasta_filepath=None,
                                     suppress_new_clusters=False,
                                     return_cluster_maps=False,
                                     stable_sort=False,
                                     save_uc_files=True,
                                     HALT_EXEC=False):
    """ Main convenience wrapper for using uclust to generate cluster files

    A source fasta file is required for the fasta_filepath.  Unless
    suppress_sort is set, it is first sorted with
    uclust_fasta_sort_from_filepath.  The sorted file is a temporary that
    is removed again once clustering has finished or failed.  The
    (sorted) fasta file is then clustered into a uclust (.uc) file, which
    is parsed with clusters_from_uc_file.

    The percent_ID parameter specifies the percent identity for a clusters,
    i.e., if 99% were the parameter, all sequences that were 99% identical
    would be grouped as a cluster.

    If save_uc_files is true, the .uc file is written to the path given by
    get_output_filepaths(output_dir, original_fasta_path).  Otherwise the
    cluster result's cleanUp() is called after parsing.

    Returns (clusters, failures, seeds).  With return_cluster_maps the
    clusters mapping from clusters_from_uc_file is returned unchanged.
    Otherwise only its values() are returned.

    Raises ApplicationNotFoundError or ApplicationError, with a
    uclust-specific message, when the corresponding error occurs while
    sorting or clustering.
    """
    if output_dir and not output_dir.endswith('/'):
        output_dir += '/'

    if save_uc_files:
        uc_save_filepath = get_output_filepaths(output_dir,
                                                original_fasta_path)
    else:
        uc_save_filepath = None

    # Temporary files created here.  They are removed in the finally clause
    # whether or not the app controllers succeed.
    files_to_remove = []
    try:
        if not suppress_sort:
            # Sort fasta input file from largest to smallest sequence
            sort_fasta = uclust_fasta_sort_from_filepath(
                fasta_filepath, output_filepath=None)

            # Get sorted fasta name from application wrapper
            sorted_fasta_filepath = sort_fasta['Output'].name
            files_to_remove.append(sorted_fasta_filepath)
        else:
            sorted_fasta_filepath = fasta_filepath

        # Generate uclust cluster file (.uc format)
        uclust_cluster = uclust_cluster_from_sorted_fasta_filepath(
            sorted_fasta_filepath,
            uc_save_filepath,
            percent_ID=percent_ID,
            max_accepts=max_accepts,
            max_rejects=max_rejects,
            stepwords=stepwords,
            word_length=word_length,
            optimal=optimal,
            exact=exact,
            suppress_sort=suppress_sort,
            enable_rev_strand_matching=enable_rev_strand_matching,
            subject_fasta_filepath=subject_fasta_filepath,
            suppress_new_clusters=suppress_new_clusters,
            stable_sort=stable_sort,
            HALT_EXEC=HALT_EXEC)
    except ApplicationNotFoundError:
        # Handled before ApplicationError.  If ApplicationNotFoundError is a
        # subclass of ApplicationError (NOTE(review): it is in
        # cogent.app.util - confirm for this import), the generic handler
        # would otherwise catch it and hide this more specific message.
        raise ApplicationNotFoundError('uclust not found, is it properly '
                                       'installed?')
    except ApplicationError:
        raise ApplicationError(
            'Error running uclust. Possible causes are '
            'unsupported version (current supported version is v1.2.22) is installed or '
            'improperly formatted input file was provided')
    finally:
        remove_files(files_to_remove)

    # Parse the .uc cluster file
    clusters, failures, seeds = \
        clusters_from_uc_file(uclust_cluster['ClusterFile'])

    # Remove temp files unless user specifies output filepath
    if not save_uc_files:
        uclust_cluster.cleanUp()

    if return_cluster_maps:
        return clusters, failures, seeds
    else:
        return clusters.values(), failures, seeds
Example #13
0
def assign_taxonomy(dataPath, reference_sequences_fp, id_to_taxonomy_fp, read_1_seqs_fp, read_2_seqs_fp, single_ok=False, no_single_ok_generic=False,
                    header_id_regex=None, read_id_regex = r"\S+\s+(\S+)", amplicon_id_regex = r"(\S+)\s+(\S+?)\/",
                    output_fp=None, log_path=None, HALT_EXEC=False, base_tmp_dir = '/tmp'):
    """Assign taxonomy to each sequence in data with the RTAX classifier

        dataPath: path to a fasta file of the sequences to classify

        reference_sequences_fp, id_to_taxonomy_fp: passed to RTAX as its
         -r and -t parameters

        read_1_seqs_fp: fasta file passed to RTAX as -a; also scanned here
         to map amplicon IDs to read_1 IDs

        read_2_seqs_fp: passed to RTAX as -b unless it is None

        single_ok, no_single_ok_generic: switch on the RTAX -f and -g flags

        header_id_regex: passed to RTAX as -i

        read_id_regex: regex whose group 1 extracts the read_1 ID from each
         header in dataPath

        amplicon_id_regex: regex whose groups 1 and 2 extract the read_1 ID
         and the amplicon ID from each header in read_1_seqs_fp

        output_fp: path to write output; if not provided, result will be
         returned in a dict of {seq_id:(taxonomy_assignment,confidence)}

        log_path: if given, the StdErr of the RTAX run is appended to it

        base_tmp_dir: directory in which a scratch directory is created;
         the scratch directory is removed again before returning

        Raises ApplicationNotFoundError if usearch cannot be found.
    """

    usearch_command = "usearch"
    if not (exists(usearch_command) or app_path(usearch_command)):
        raise ApplicationNotFoundError("Cannot find %s. Is it installed? Is it in your path?"\
         % usearch_command)

    my_tmp_dir = get_tmp_filename(tmp_dir=base_tmp_dir,prefix='rtax_',suffix='',result_constructor=str)
    os.makedirs(my_tmp_dir)

    try:
        # RTAX classifier doesn't necessarily preserve identifiers
        # it reports back only the id extracted as $1 using header_id_regex
        # since rtax takes the original unclustered sequence files as input,
        # the usual case is that the regex extracts the amplicon ID from the second field

        # Use lookup table
        read_1_id_to_orig_id = {}
        readIdExtractor = re.compile(read_id_regex)  # OTU clustering produces ">clusterID read_1_id"
        with open(dataPath, 'r') as data:
            for seq_id, seq in MinimalFastaParser(data):
                # apply the regex
                extract = readIdExtractor.match(seq_id)
                if extract is None:
                    stderr.write("Matched no ID with read_id_regex " + read_id_regex +" in '" + seq_id + "' from file " + dataPath + "\n")
                else:
                    read_1_id_to_orig_id[extract.group(1)] = seq_id

        # make list of amplicon IDs to pass to RTAX
        id_list_path = my_tmp_dir + "/ampliconIdsToClassify"

        # Establish mapping of amplicon IDs to read_1 IDs
        # simultaneously write the amplicon ID file for those IDs found in the input mapping above
        amplicon_to_read_1_id = {}
        ampliconIdExtractor = re.compile(amplicon_id_regex)  # split_libraries produces >read_1_id ampliconID/1 ...  // see also assign_taxonomy 631
        with open(id_list_path, "w") as id_list_fp:
            # The read_1 handle is closed by this with-block.
            with open(read_1_seqs_fp, 'r') as read_1_data:
                for seq_id, seq in MinimalFastaParser(read_1_data):
                    # apply the regex
                    extract = ampliconIdExtractor.match(seq_id)
                    if extract is None:
                        stderr.write("Matched no ID with amplicon_id_regex " + amplicon_id_regex + " in '" + seq_id + "' from file " + read_1_seqs_fp + "\n")
                        continue
                    read_1_id = extract.group(1)
                    amplicon_id = extract.group(2)
                    # The mapping is recorded for every matching header,
                    # but only IDs seen in dataPath are sent to RTAX.
                    amplicon_to_read_1_id[amplicon_id] = read_1_id
                    if read_1_id in read_1_id_to_orig_id:
                        id_list_fp.write('%s\n' % (amplicon_id))

        app = Rtax(HALT_EXEC=HALT_EXEC)

        # Only the name of this temporary file is handed to RTAX.
        temp_output_file = tempfile.NamedTemporaryFile(
            prefix='RtaxAssignments_', suffix='.txt')
        app.Parameters['-o'].on(temp_output_file.name)
        app.Parameters['-r'].on(reference_sequences_fp)
        app.Parameters['-t'].on(id_to_taxonomy_fp)
        app.Parameters['-l'].on(id_list_path)  # these are amplicon IDs
        app.Parameters['-a'].on(read_1_seqs_fp)
        if read_2_seqs_fp is not None:
            app.Parameters['-b'].on(read_2_seqs_fp)
        app.Parameters['-i'].on(header_id_regex)
        app.Parameters['-m'].on(my_tmp_dir)
        if single_ok:
            app.Parameters['-f'].on()
        if no_single_ok_generic:
            app.Parameters['-g'].on()

        app_result = app()

        if log_path:
            errString = ''.join(app_result['StdErr'].readlines()) + '\n'
            with open(log_path, 'a') as f:
                f.write(errString)

        assignments = {}

        # restore original sequence IDs with spaces

        for line in app_result['Assignments']:
            toks = line.strip().split('\t')
            rtax_id = toks.pop(0)
            if toks:
                toks.pop(0)  # best percent identity, ignored
            lineage = toks

            # RTAX does not provide a measure of confidence.  We could pass one in,
            # based on the choice of primers, or even look it up on the fly in the tables
            # from the "optimal primers" paper; but it would be the same for every
            # query sequence anyway.
            # we could also return bestpcid, but that's not the same thing as confidence.
            confidence = 1.0

            read_1_id = amplicon_to_read_1_id[rtax_id]
            orig_id = read_1_id_to_orig_id[read_1_id]
            if lineage:
                assignments[orig_id] = (';'.join(lineage), confidence)
            else:
                assignments[orig_id] = ('Unclassified', 1.0)

        if output_fp:
            try:
                output_file = open(output_fp, 'w')
            except OSError:
                raise OSError("Can't open output file for writing: %s" % output_fp)
            with output_file:
                for seq_id, assignment in assignments.items():
                    lineage, confidence = assignment
                    output_file.write(
                        '%s\t%s\t%1.3f\n' % (seq_id, lineage, confidence))
            return None
        else:
            return assignments
    finally:
        # Best-effort removal of the scratch directory.
        try:
            rmtree(my_tmp_dir)
        except OSError:
            pass
Example #14
0
 def raise_tax2tree_not_found_error(*args, **kwargs):
     """Raise ApplicationNotFoundError explaining that Tax2Tree is missing.

     Any positional or keyword arguments are accepted and ignored.
     """
     message = (
         "Tax2Tree cannot be found.\nIs Tax2Tree installed? Is it in your $PYTHONPATH?"
         "\nYou can obtain Tax2Tree from http://sourceforge.net/projects/tax2tree/."
     )
     raise ApplicationNotFoundError(message)
Example #15
0
def check_sfffile():
    """Raise ApplicationNotFoundError unless app_path locates sfffile."""
    if app_path('sfffile'):
        return
    raise ApplicationNotFoundError(_MISSING_APP_MESSAGE % 'sfffile')
Example #16
0
def check_sffinfo():
    """Raise ApplicationNotFoundError unless `which` locates sffinfo."""
    if which('sffinfo'):
        return
    raise ApplicationNotFoundError(_MISSING_APP_MESSAGE % 'sffinfo')
Example #17
0
 def raise_pynast_not_found_error(*args, **kwargs):
     """Raise ApplicationNotFoundError explaining that PyNAST is missing.

     Any positional or keyword arguments are accepted and ignored.
     """
     message = (
         "PyNAST cannot be found.\nIs PyNAST installed? Is it in your $PYTHONPATH?"
         "\nYou can obtain PyNAST from http://qiime.org/pynast/.")
     raise ApplicationNotFoundError(message)
Example #18
0
__credits__ = ["Kyle Patnode", "Jai Ram Rideout", "Antonio Gonzalez Pena"]
__license__ = "GPL"
__version__ = "1.5.3-dev"
__maintainer__ = "Kyle Patnode"
__email__ = "*****@*****.**"
"""Test suite for the generate_taxa_compare_table.py module.

Tests each function in the tax2tree controller module. It
should be noted that these tests are fairly sparse, since
tax2tree implements quite a few of its own tests."""

from cogent.app.util import ApplicationNotFoundError
# Turn a missing t2t (tax2tree) package into an ApplicationNotFoundError
# raised at import time, instead of a bare ImportError.
try:
    from t2t.nlevel import load_tree, load_consensus_map, determine_rank_order
except ImportError:
    raise ApplicationNotFoundError(
        "Cannot find tax2tree. Is it installed? Is it in your path?")
from os import makedirs, getcwd, chdir
from os.path import exists
from shutil import rmtree
from tempfile import mkdtemp
from cogent.util.unit_test import TestCase, main
from cogent.util.misc import remove_files
from qiime.test import initiate_timeout, disable_timeout
from qiime.util import get_qiime_temp_dir
from qiime.pycogent_backports.tax2tree import *


class GenerateTaxaCompareTableTests(TestCase):
    """Tests for the tax2tree_controller.py module."""
    def setUp(self):
        """Set up files/environment that will be used by the tests."""
Example #19
0
 def _error_on_missing_application(self, params):
     """Check that blastall can be located.

     Raises ApplicationNotFoundError when app_path does not find it.
     """
     if app_path('blastall'):
         return
     raise ApplicationNotFoundError(
         "Cannot find blastall. Is it installed? Is it in your path?")