Example #1
class TaxaClassifier(object):
    """Can assign taxonomy to a FASTA file of random shotgun sequences."""

    all_paths = """
    /tree.txt
    /annot.txt
    /graph.xml
    /graph.png
    """

    def __repr__(self): return '<%s object of %s>' % (self.__class__.__name__, self.parent)

    def __init__(self):
        # Files #
        self.taxa_file = FilePath(self.base_dir + self.sample.name + '.txt')
        self.class_out_file = FilePath(self.base_dir + self.sample.type + '.txt')
        self.class_out_file.link_from(self.taxa_file, safe=True)
        # NMDS #
        self.coord = (0,0)

    def make_graph(self):
        # Convert #
        graphconv = sh.Command(home + "share/metaphlan/plotting_scripts/metaphlan2graphlan.py")
        graphconv(self.taxa_file, '--tree_file', self.p.tree, '--annot_file', self.p.annot)
        # Annotate #
        annotate = sh.Command(home + "share/graphlan/graphlan_annotate.py")
        annotate('--annot', self.p.annot, self.p.tree, self.p.xml)
        # Graph #
        graphlan = sh.Command(home + "share/graphlan/graphlan.py")
        graphlan('--dpi', 200, self.p.xml, self.p.png)
Example #2
 def overwrite_cache(self, value):
     # Where should we look in the file system ? #
     if 'cache_dir' in self.__dict__:
         path = FilePath(self.__dict__['cache_dir'] + f.__name__ + '.pickle')
     else:
         path = getattr(self.p, f.__name__)
     if value is None: path.remove()
     else: raise Exception("You can't set a pickled property, you can only delete it")
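
This setter is one half of a property-style caching decorator: `f` is the method being wrapped, and the matching getter appears below in Example #14 as `retrieve_from_cache`. A minimal sketch of how the two halves might plug together, assuming a decorator name of `pickled_property` and substituting plain `os`/`pickle` calls for the `FilePath` helpers:

    import os, pickle

    def pickled_property(f):
        """Cache the result of method `f` in memory and as a pickle on disk
        (a hypothetical reconstruction from Examples #2 and #14)."""
        def retrieve_from_cache(self):
            if '__cache__' not in self.__dict__: self.__cache__ = {}
            if f.__name__ in self.__cache__: return self.__cache__[f.__name__]
            # (the getattr(self.p, ...) fallback from the original is omitted)
            path = self.cache_dir + f.__name__ + '.pickle'
            if os.path.exists(path):
                with open(path, 'rb') as handle: result = pickle.load(handle)
            else:
                result = f(self)
                with open(path, 'wb') as handle: pickle.dump(result, handle)
            self.__cache__[f.__name__] = result
            return result
        def overwrite_cache(self, value):
            if value is None: os.remove(self.cache_dir + f.__name__ + '.pickle')
            else: raise Exception("You can't set a pickled property, you can only delete it")
        return property(retrieve_from_cache, overwrite_cache)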
Example #3
class Project(Aggregate):
    """A project containing several samples. You can describe your
    projects in a JSON file placed in the repository."""

    all_paths = Aggregate.all_paths + """
    /info.json
    /samples/
    """

    def __repr__(self): return '<%s object "%s" with %i samples>' % \
                               (self.__class__.__name__, self.name, len(self))

    def __init__(self, json_path, project_dir):
        # Parse the json file describing the project #
        self.json_path = FilePath(json_path)
        with open(json_path) as handle: self.info = json.load(handle)
        # Required parameters #
        self.num       = self.info['project_num']
        self.name      = self.info['project_name']
        self.long_name = self.info['project_long_name']
        # Optional parameters #
        self.abstract  = self.info.get('abstract')
        self.run_name  = self.info.get('illumina_run_id')
        self.account   = self.info.get('uppmax_project_id')
        # Base directory #
        self.base_dir = project_dir + self.name + '/'
        # Delayed init #
        self.loaded = False

    def load(self):
        """A delayed kind of __init__ that is not called right away to avoid
        crowding the RAM of the python interpreter when you just import gefes"""
        # Load #
        self.loaded = True
        # Automatic paths #
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Make an alias to the json #
        self.json_path.link_to(self.p.info_json, safe=True)
        # Make all the samples object that this project possesses #
        if self.info.get('auto_parse_samples'):
            search_dir = self.info['samples_base_dir']
            search_dir = os.path.expanduser(search_dir)
            paths = glob.glob(search_dir + '*.fastq*')
            if not paths: paths = glob.glob(search_dir + '*.fasta*')
            if not paths: raise Exception("Found no FASTA or FASTQ path in %s" % search_dir)
            if all([re.search("_R[12]_", p) for p in paths]): pairs = join_paired_filepaths(paths)
            else:                                             pairs = sort_string_by_pairs(paths)
            self.samples = [Sample(self, p[0], p[1], num=i) for i,p in enumerate(pairs)]
            self.samples.sort(key=lambda x: natural_sort(x.name))
        else:
            self.samples = [Sample(self, info=info) for info in self.info['samples']]
        # The samples of a project are its children, in a way #
        self.children = self.samples
        # Call the mother function #
        return Aggregate.load(self)
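
For reference, a hypothetical `info.json` that would satisfy this constructor and drive the `auto_parse_samples` branch of `load()` (key names are taken from the code above; all values are invented):

    {
        "project_num":        1,
        "project_name":       "test_project",
        "project_long_name":  "An example project",
        "abstract":           "Optional free-text description.",
        "illumina_run_id":    "run001",
        "uppmax_project_id":  "b2011035",
        "auto_parse_samples": true,
        "samples_base_dir":   "~/data/test_project/"
    }

When `auto_parse_samples` is absent or false, a `samples` list of per-sample dictionaries is expected instead, one per `Sample` object.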
Example #4
 def __init__(self):
     # Files #
     self.taxa_file = FilePath(self.base_dir + self.sample.name + '.txt')
     self.class_out_file = FilePath(self.base_dir + self.sample.type + '.txt')
     self.class_out_file.link_from(self.taxa_file, safe=True)
     # NMDS #
     self.coord = (0,0)
Example #5
 def set_paths(self, base_dir, script_path):
     """Set the directory, the script path and the outfile path"""
     # Make absolute paths #
     if 'change_dir' in self.kwargs:
         self.kwargs['change_dir'] = DirectoryPath(os.path.abspath(self.kwargs['change_dir']))
     if 'out_file' in self.kwargs:
         self.kwargs['out_file']   = FilePath(os.path.abspath(self.kwargs['out_file']))
     # In case there is a base directory #
     if base_dir is not None:
         self.base_dir             = DirectoryPath(os.path.abspath(base_dir))
         self.script_path          = FilePath(base_dir + "run." + self.extensions[self.language])
         self.kwargs['change_dir'] = base_dir
         self.kwargs['out_file']   = FilePath(base_dir + "run.out")
     # Other cases #
     if base_dir is None and script_path is None: self.script_path = FilePath(new_temp_path())
     if script_path is not None: self.script_path = FilePath(os.path.abspath(script_path))
Example #6
 def build_tree_raxml(self,
                      new_path=None,
                      seq_type='nucl' or 'prot',  # evaluates to 'nucl'; documents the two accepted values
                      num_threads=None,
                      free_cores=2,
                      keep_dir=False):
     """Make a tree with RAxML."""
     # Check output #
     if new_path is None: new_path = self.prefix_path + '.tree'
     # What model to choose #
      if seq_type == 'nucl': model = "GTRGAMMA"
      elif seq_type == 'prot': model = "PROTGAMMAJTTF"
      else: raise ValueError("seq_type must be 'nucl' or 'prot'")
     # Threads #
     if num_threads is None:
         num_threads = multiprocessing.cpu_count() - free_cores
     else:
         num_threads = int(num_threads) - free_cores
     num_threads = max(1, num_threads)
      # Run it: '-f a' does a rapid bootstrap analysis plus a search for the
      # best-scoring ML tree; '-N autoMR' stops bootstrapping once the
      # majority-rule criterion is met; '-p' and '-x' are random seeds #
      temp_dir = new_temp_dir()
      sh.raxml811('-m', model, "-T", num_threads, '-p', 1, '-s', self.path,
                  '-n', 'tree', '-w', temp_dir, '-f', 'a', '-x', 1, '-N',
                  'autoMR')
     # Move into place #
     if keep_dir:
         shutil.rmtree(new_path)
         shutil.move(temp_dir, new_path)
     if not keep_dir:
         shutil.move(temp_dir + 'RAxML_bestTree.tree', new_path)
     # Return #
     return FilePath(new_path)
Example #7
 def __init__(self, base_dir=None):
     # Base directory #
     if base_dir is None: base_dir = home
     self.base_dir = base_dir + 'databases/' + self.short_name + '/'
     self.p        = AutoPaths(self.base_dir, self.all_paths)
     # The results #
     self.alignment = FASTA(self.p.mothur_fasta)
     self.taxonomy  = FilePath(self.p.mothur_tax)
     # The part that mothur will use for naming files #
     self.nickname = "foram_mothur"
Example #8
 def build_tree_fast(self, new_path=None, seq_type='nucl' or 'prot'):  # default evaluates to 'nucl'
     """Make a tree with FastTree. Names will be truncated however."""
     # Check output #
     if new_path is None: new_path = self.prefix_path + '.tree'
     # Command #
     command_args = []
     if seq_type == 'nucl': command_args += ['-nt']
     command_args += ['-gamma']
     command_args += ['-out', new_path]
     command_args += [self.path]
     # Run it #
     sh.FastTree(*command_args)
     # Return #
     return FilePath(new_path)
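
A hedged call site, mirroring how Example #10 invokes the RAxML variant (the `alignment` object and its `prefix_path` attribute are assumptions):

    # Writes '<prefix_path>.tree' next to the alignment and returns a FilePath
    tree = alignment.build_tree_fast(seq_type='nucl')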
Example #9
 def __init__(self, json_path, project_dir):
     # Parse the json file describing the project #
     self.json_path = FilePath(json_path)
     with open(json_path) as handle: self.info = json.load(handle)
     # Required parameters #
     self.num       = self.info['project_num']
     self.name      = self.info['project_name']
     self.long_name = self.info['project_long_name']
     # Optional parameters #
     self.abstract  = self.info.get('abstract')
     self.run_name  = self.info.get('illumina_run_id')
     self.account   = self.info.get('uppmax_project_id')
     # Base directory #
     self.base_dir = project_dir + self.name + '/'
     # Delayed init #
     self.loaded = False
Example #10
 def tree(self):
     """The path to the tree built with raxml"""
     tree = FilePath(self.p.tree_dir + 'RAxML_bestTree.tree')
     if not tree.exists:
         # Check we can do it #
         if self.gaps_in_alignment:
             message = "Can't build a tree for cluster %i because of gaps. Skipping."
             warnings.warn(message % self.num)
             return None
         # Do it #
         self.alignment.build_tree(new_path=self.p.tree_dir,
                                   seq_type=self.analysis.seq_type,
                                   num_threads=self.analysis.num_threads,
                                   free_cores=0,
                                   keep_dir=True)
     return tree
Example #11
File: job.py Project: DC23/plumbing
 def set_paths(self, base_dir, script_path):
     """Set the directory, the script path and the outfile path"""
     # Make absolute paths #
     if 'change_dir' in self.kwargs:
         self.kwargs['change_dir'] = DirectoryPath(
             os.path.abspath(self.kwargs['change_dir']))
     if 'out_file' in self.kwargs:
         self.kwargs['out_file'] = FilePath(
             os.path.abspath(self.kwargs['out_file']))
     # In case there is a base directory #
     if base_dir is not None:
         self.base_dir = DirectoryPath(os.path.abspath(base_dir))
         self.script_path = FilePath(base_dir + "run." +
                                     self.extensions[self.language])
         self.kwargs['change_dir'] = base_dir
         self.kwargs['out_file'] = FilePath(base_dir + "run.out")
     # Other cases #
     if base_dir is None and script_path is None:
         self.script_path = FilePath(new_temp_path())
     if script_path is not None:
         self.script_path = FilePath(os.path.abspath(script_path))
Example #12
 def __init__(self, path, parent):
     # Save parent #
     self.parent, self.pool = parent, parent
     self.samples = parent.samples
     # Auto paths #
     self.base_dir = parent.p.quality_dir + '/'
     self.p = AutoPaths(self.base_dir, self.all_paths)
     # Files #
     self.untrimmed = BarcodedFASTQ(path, samples=self.samples)
     self.only_used = BarcodedFASTA(self.p.only_used, samples=self.samples)
     self.trimmed = FASTA(self.p.trimmed)
     # Qiime output #
     self.qiime_fasta = FASTA(self.p.qiime_fasta)
     # Mothur #
     self.mothur_fasta = FASTA(self.p.mothur_fasta)
     self.mothur_qual = QualFile(self.p.mothur_qual)
     self.mothur_groups = FilePath(self.p.mothur_groups)
     # Primer size #
     self.trim_fwd = self.pool.samples.trim_fwd
     self.trim_rev = self.pool.samples.trim_rev
Example #13
 def __init__(self, cluster):
     # Save parent #
     self.cluster, self.parent = cluster, cluster
     # Inherited #
     self.samples = self.parent.samples
     # Paths #
     self.base_dir = self.parent.p.otus_dir + self.short_name + '/'
     self.p = AutoPaths(self.base_dir, self.all_paths)
     # Main FASTA file #
     self.reads = self.parent.reads
     # Files #
     self.all_otus = FilePath(self.p.all_otus)
     self.all_centers = FASTA(self.p.all_centers)
     self.otus = FilePath(self.base_dir + "otus.txt")
     self.centers = FASTA(self.base_dir + "centers.fasta")
     # Taxonomy #
     self.taxonomy_silva = CrestTaxonomy(self.centers, self, 'silvamod', self.p.silva)
     self.taxonomy_fw = CrestTaxonomy(self.centers, self, 'freshwater', self.p.fw_dir)
     # Preferred one #
     self.taxonomy = self.taxonomy_silva
Example #14
 def retrieve_from_cache(self):
     # Is it in the cache ? #
     if '__cache__' not in self.__dict__: self.__cache__ = {}
     if f.__name__ in self.__cache__: return self.__cache__[f.__name__]
     # Where should we look in the file system ? #
      if 'cache_dir' in self.__dict__:
          path = FilePath(self.__dict__['cache_dir'] + f.__name__ + '.pickle')
      else:
          path = getattr(self.p, f.__name__)
     # Is it on disk ? #
     if path.exists:
         with open(path) as handle:
             result = pickle.load(handle)
         self.__cache__[f.__name__] = result
         return result
     # Otherwise let's compute it #
     result = f(self)
     with open(path, 'w') as handle:
         pickle.dump(result, handle)
     self.__cache__[f.__name__] = result
     return result
Example #15
class Foraminifera(Database):
    """This is a custom database containing exlcusively Foraminifera sequences.

    https://genev.unige.ch/research/laboratory/Jan-Pawlowski

    You should place the file "foram_db_cor.fasta" in:  ~/databases/foraminifera/
    Then you can run this:
    
            from seqsearch.databases.foraminifera import foraminifera
            foraminifera.process()
            print foraminifera.tax_depth_freq

    """

    short_name = "foraminifera"
    long_name  = 'The custom made Foraminifera database as received by email on 7th April 2017'

    all_paths = """
    /foram_db_cor.fasta
    /foram_mothur.fasta
    /foram_mothur.tax
    """

    @property
    def rank_names(self):
        """The names of the ranks. Total 9 ranks."""
        return ['Domain',   # 0
                'Kingdom',  # 1
                'Phylum',   # 2
                'Class',    # 3
                'Order',    # 4
                'Family',   # 5
                'Tribe',    # 6
                'Genus',    # 7
                'Species']  # 8

    def __init__(self, base_dir=None):
        # Base directory #
        if base_dir is None: base_dir = home
        self.base_dir = base_dir + 'databases/' + self.short_name + '/'
        self.p        = AutoPaths(self.base_dir, self.all_paths)
        # The results #
        self.alignment = FASTA(self.p.mothur_fasta)
        self.taxonomy  = FilePath(self.p.mothur_tax)
        # The part that mothur will use for naming files #
        self.nickname = "foram_mothur"

    def process(self):
        # The file that was received by email without documentation T_T #
        raw = FASTA(self.p.cor)
        # Open files #
        self.alignment.create()
        self.taxonomy.create()
        # Loop #
        for seq in raw:
            # Parse #
            name = seq.id[11:].split('|')
            num  = name.pop(0)
            # Check #
            for x in name: assert ';' not in x
            for x in name: assert '\t' not in x
            # Make ranks #
            ranks = ['Eukaryota'                       , # 0 Domain
                     'Rhizaria'                        , # 1 Kingdom
                     'Foraminifera'                    , # 2 Phylum
                     name[0]                           , # 3 Class
                     name[1]                           , # 4 Order
                     name[2]                           , # 5 Family
                     name[3]                           , # 6 Tribe
                     name[4]                           , # 7 Genus
                     name[5]]                            # 8 Species
            # The taxonomy string #
            tax_line = ';'.join(ranks)
            # Add sequence to the new fasta file #
            self.alignment.add_str(str(seq.seq), name="foram" + num)
            # Add the taxonomy to the tax file #
            self.taxonomy.add_str("foram" + num + '\t' + tax_line + '\n')
        # Close files #
        self.alignment.close()
        self.taxonomy.close()
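
To make the parsing concrete: a record whose id carries six '|'-separated fields after an 11-character prefix would, under the scheme above, produce one entry in each output file (the input id below is invented):

    # Hypothetical input header (the first 11 characters of the id are skipped):
    #   >XXXXXXXXXXX123|Globothalamea|Rotaliida|Ammoniidae|Ammoniinae|Ammonia|tepida
    # foram_mothur.fasta gains:
    #   >foram123
    # foram_mothur.tax gains (tab-separated):
    #   foram123    Eukaryota;Rhizaria;Foraminifera;Globothalamea;Rotaliida;Ammoniidae;Ammoniinae;Ammonia;tepida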
Example #16
File: sample.py Project: mtop/gefes
 def web_export(self):
     """Copy the report to the webexport directory where it can be viewed by anyone"""
     dest = FilePath(("/proj/%s/webexport/" + self.location) % self.sample.account)
     dest.make_directory()
     shutil.copy(self.output_path, dest)
Example #17
class QualityReads(object):
    """A set of sequences determined to be quality controlled"""

    all_paths = """
    /mothur_reads.fasta
    /mothur_reads.qual
    /mothur_groups.tsv
    /qiime_reads.fasta
    /only_used_samples.fasta
    /trimmed.fasta
    """

    def __repr__(self): return '<%s object of %s>' % (self.__class__.__name__, self.parent)
    def __len__(self): return len(self.trimmed)

    def __init__(self, path, parent):
        # Save parent #
        self.parent, self.pool = parent, parent
        self.samples = parent.samples
        # Auto paths #
        self.base_dir = parent.p.quality_dir + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Files #
        self.untrimmed = BarcodedFASTQ(path, samples=self.samples)
        self.only_used = BarcodedFASTA(self.p.only_used, samples=self.samples)
        self.trimmed = FASTA(self.p.trimmed)
        # Qiime output #
        self.qiime_fasta = FASTA(self.p.qiime_fasta)
        # Mothur #
        self.mothur_fasta = FASTA(self.p.mothur_fasta)
        self.mothur_qual = QualFile(self.p.mothur_qual)
        self.mothur_groups = FilePath(self.p.mothur_groups)
        # Primer size #
        self.trim_fwd = self.pool.samples.trim_fwd
        self.trim_rev = self.pool.samples.trim_rev

    def filter_unused(self):
        def no_unused_iterator(reads):
            for r in reads.parse_barcodes():
                if r.first.sample.used: yield r.read
        self.only_used.write(no_unused_iterator(self.untrimmed))

    def trim_primers(self):
        def no_primers_iterator(reads):
            for read in reads:
                yield read[self.trim_fwd:-self.trim_rev]
        self.trimmed.write(no_primers_iterator(self.only_used))

    def make_mothur_output(self):
        # Trimmed fasta #
        self.mothur_fasta.link_from(self.trimmed.path)
        # The groups file #
        self.mothur_groups.create()
        for r in self.only_used.parse_barcodes():
            sample_name = r.first.sample.short_name
            read_name = '%s\t%s\n' % (r.read.id, sample_name)
            self.mothur_groups.handle.write(read_name)
        self.mothur_groups.close()

    def make_qiime_output(self):
        # Prepare fasta writer #
        handle = open(self.qiime_fasta.path, 'w')
        writer = FastaWriter(handle, wrap=0)
        writer.write_header()
        # Counter #
        counter = defaultdict(int)
        # Do it #
        for r in self.only_used.parse_barcodes():
            sample_name = r.first.sample.short_name
            counter[sample_name] += 1
            r.read.id = '%s_%i %s' % (sample_name, counter[sample_name], r.read.id)
            bar_seq = r.read.seq[0:self.pool.bar_len]
            r.read.description = "orig_bc=%s new_bc=%s bc_diffs=0" % (bar_seq, bar_seq)
            writer.write_record(r.read[self.trim_fwd:-self.trim_rev])
        # Close #
        writer.write_footer()
        handle.close()
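
The methods above form a small pipeline, each step consuming the previous step's output; a plausible calling order (the `path` and `pool` arguments come from the surrounding project):

    quality = QualityReads(path, pool)
    quality.filter_unused()        # untrimmed -> only_used_samples.fasta
    quality.trim_primers()         # only_used -> trimmed.fasta
    quality.make_mothur_output()   # trimmed + barcodes -> mothur files
    quality.make_qiime_output()    # only_used -> qiime_reads.fasta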
Example #18
 def __init__(self, parent):
     self.parent = parent
     self.path = FilePath(self.parent.prefix_path + '_len_hist.pdf')
Example #19
class UclustOTUs(OTUs):
    """Will use uclust via the qimme wraper to create OTU clusters from a given FASTA file
    http://qiime.org/scripts/pick_otus.html"""

    short_name = 'uclust'
    title = 'UCLUST-QIIME denovo picking'

    all_paths = """
    /clusters/clusters.uc
    /clusters/qiime.log
    /clusters/all_otus.txt
    /clusters/all_centers.fasta
    /centers.fasta
    /otus.txt
    /taxonomy_silva/
    /taxonomy_fw/
    /graphs/
    """

    def __repr__(self): return '<%s object of %s>' % (self.__class__.__name__, self.parent)

    def __init__(self, cluster):
        # Save parent #
        self.cluster, self.parent = cluster, cluster
        # Inherited #
        self.samples = self.parent.samples
        # Paths #
        self.base_dir = self.parent.p.otus_dir + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Main FASTA file #
        self.reads = self.parent.reads
        # Files #
        self.all_otus = FilePath(self.p.all_otus)
        self.all_centers = FASTA(self.p.all_centers)
        self.otus = FilePath(self.base_dir + "otus.txt")
        self.centers = FASTA(self.base_dir + "centers.fasta")
        # Taxonomy #
        self.taxonomy_silva = CrestTaxonomy(self.centers, self, 'silvamod', self.p.silva)
        self.taxonomy_fw = CrestTaxonomy(self.centers, self, 'freshwater', self.p.fw_dir)
        # Preferred one #
        self.taxonomy = self.taxonomy_silva

    def run(self):
        # Clean #
        shutil.rmtree(self.p.clusters_dir)
        # Run command #
        pick_otus = sh.Command('pick_otus.py')
        pick_otus('-m', 'uclust', '-i', self.reads, '-o', self.p.clusters_dir)
        # Move into place #
        base_name = self.p.clusters_dir + self.reads.prefix
        shutil.move(base_name + '_otus.txt', self.all_otus)
        shutil.move(base_name + '_otus.log', self.p.qiime_log)
        shutil.move(base_name + '_clusters.uc', self.p.clusters_uc)
        # Remove OTUs that are only one read #
        def filter_singletons(f):
            for line in f:
                line = line.split()
                if len(line) > 2: yield '\t'.join(line) + '\n'
        self.otus.writelines(filter_singletons(self.all_otus))
        # Create the centers file that is missing #
        pick_rep = sh.Command('pick_rep_set.py')
        pick_rep('-i', self.all_otus, '-f', self.reads, '-o', self.all_centers)
        # Remake the centers file without the filtered OTUs #
        self.otus_to_keep = [line.split()[0] for line in self.otus]
        def filter_otus(f):
            for seq in f:
                if seq.id in self.otus_to_keep: yield seq
        self.centers.write(filter_otus(self.all_centers))

    @property_cached
    def cluster_counts_table(self):
        """Create the unfiltered OTU table"""
        # Put results in a dict of dicts #
        result = defaultdict(lambda: defaultdict(int))
        # Loop #
        for line in self.otus:
            # Parse the line #
            contents = line.split()
            otu, reads = contents[0], contents[1:]
            # Parse the hits #
            for r in reads:
                nums = re.findall("run([0-9]+)_pool([0-9]+)_sample([0-9]+)_read([0-9]+)", r)
                if nums:
                    run_num, pool_num, sample_num, read_num = map(int, nums[0])
                    sample = illumitag.runs[run_num][pool_num-1][sample_num-1]
                    name = sample.short_name
                else:
                    nums = re.findall("run([0-9]+)_sample([0-9]+)_read([0-9]+)", r)
                    run_num, sample_num, read_num = map(int, nums[0])
                    sample = [s for s in illumitag.presamples+illumitag.pyrosamples if s.run_num==run_num and s.num==sample_num][0]
                    name = sample.short_name
                # Count #
                result[otu][name] += 1
        # Return #
        result = pandas.DataFrame(result)
        result = result.fillna(0)
        result = result.astype(int)
        result = result.reindex_axis(sorted(result.columns, key=natural_sort), axis=1)
        return result
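
A hedged usage sketch: after `run()`, the per-sample counts can be read straight off the cached table (rows are sample short names, columns are OTU ids, as built above):

    otus = UclustOTUs(cluster)     # `cluster` comes from the surrounding pipeline
    otus.run()                     # pick OTUs, drop singletons, pick centers
    table = otus.cluster_counts_table
    print table.sum()              # total reads assigned to each OTU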
Example #20
 def __new__(cls, path=None, content=None, **kwargs):
     handle = open(path, 'w') if path else tempfile.NamedTemporaryFile(delete=False, **kwargs)
     if content: handle.write(content)
     handle.close()
     return FilePath.__new__(cls, handle.name)
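
A hedged usage sketch, assuming the enclosing class is a `FilePath` subclass (called `TmpFile` here purely for illustration). With no `path` given, the content lands in a `NamedTemporaryFile` that is kept on disk (`delete=False`):

    tmp = TmpFile(content="hello\n")   # hypothetical class name
    print tmp                          # prints the temporary file's path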
Example #21
class JobSLURM(object):
    """Makes launching SLURM jobs easy to write and easy to use. Here are some
    examples on how to use this class:

        for command in ['print "hi"', 'print "hello"']:
            job = JobSLURM(command, time='00:01:00', qos='short')
            job.run()

        for path in ['~/data/scaffolds1.txt', '~/data/scaffolds2.txt', '~/data/scaffolds3.txt']:
            command =  ['import sh\n']
            command += ['script = sh.Command("analyze.py")\n']
            command += ['script(%s)' % path]
            job = JobSLURM(command, time='00:01:00', qos='short', job_name=path[-25:])
            job.run()
            print "Job %i is running !" % job.id

    Then you can easily check the status of your job and its associated standard output:

        print job.status
        print job.log_tail
        print job.info['time_left']

    etc.
    """

    extensions = {
        'bash':   "sh",
        'python': "py"
    }

    shebang_headers = {
        'bash':   ["#!/bin/bash -le"],      # As a login shell and stop on error
        'python': ["#!/usr/bin/env python"]
    }

    slurm_headers = OrderedDict((
        ('job_name'   , {'tag': '#SBATCH -J %s',              'needed': True}),
        ('change_dir' , {'tag': '#SBATCH -D %s',              'needed': True,  'default': os.path.abspath(os.getcwd())}),
        ('out_file'   , {'tag': '#SBATCH -o %s',              'needed': True,  'default': '/dev/null'}),
        ('project'    , {'tag': '#SBATCH -A %s',              'needed': False, 'default': 'b2011035'}),
        ('time'       , {'tag': '#SBATCH -t %s',              'needed': True,  'default': '7-00:00:00'}),
        ('machines'   , {'tag': '#SBATCH -N %s',              'needed': True,  'default': '1'}),
        ('cores'      , {'tag': '#SBATCH -n %s',              'needed': True,  'default': num_processors}),
        ('partition'  , {'tag': '#SBATCH -p %s',              'needed': True,  'default': 'node'}),
        ('email'      , {'tag': '#SBATCH --mail-user %s',     'needed': False, 'default': os.environ.get('EMAIL')}),
        ('email-when' , {'tag': '#SBATCH --mail-type=%s',     'needed': True,  'default': 'END'}),
        ('qos'        , {'tag': '#SBATCH --qos=%s',           'needed': False, 'default': 'short'}),
        ('dependency' , {'tag': '#SBATCH -d %s',              'needed': False, 'default': 'afterok:1'}),
        ('constraint' , {'tag': '#SBATCH -C %s',              'needed': False, 'default': 'fat'}),
        ('cluster'    , {'tag': '#SBATCH -M %s',              'needed': False, 'default': 'milou'}),
        ('alloc'      , {'tag': '#SBATCH --reservation=%s',   'needed': False, 'default': 'workstation'}),
        ('jobid'      , {'tag': '#SBATCH --jobid=%i',         'needed': False, 'default': 2173455}),
        ('memory'     , {'tag': '#SBATCH --mem=%i',           'needed': False, 'default': 120000}),
        ('mem_per_cpu', {'tag': '#SBATCH --mem-per-cpu=%i',   'needed': False, 'default': 512}),
        ('threads'    , {'tag': '#SBATCH --cpus-per-task=%i', 'needed': False, 'default': num_processors}),
    ))

    script_headers = {
        'bash':   ['echo "SLURM: start at $(date) on $(hostname)"'],
        'python': ['import dateutil.tz, datetime, platform',
                   'now = datetime.datetime.now(dateutil.tz.tzlocal())',
                  r'now = now.strftime("%Y-%m-%d %Hh%Mm%Ss %Z%z")',
                   'node = platform.node()',
                   'print "SLURM: start at {0} on {1}".format(now, node)']}

    script_footers = {
        'bash':   ['echo "SLURM: end at $(date)"'],
        'python': ['now = datetime.datetime.now(dateutil.tz.tzlocal())',
                  r'now = now.strftime("%Y-%m-%d %Hh%Mm%Ss %Z%z")',
                   'print "SLURM: end at {0}".format(now)']}

    def __repr__(self): return '<%s object "%s">' % (self.__class__.__name__, self.name)

    @property
    def name(self): return self.kwargs['job_name']

    def __init__(self,
                 command     = ["print 'Hello world'"],
                 language    = 'python',
                 base_dir    = None,
                 script_path = None,
                 **kwargs):
        # Required attributes #
        self.command  = command
        self.language = language
        self.kwargs   = kwargs
        # Set the file paths #
        self.set_paths(base_dir, script_path)
        # Check command type #
        if not isinstance(self.command, list): self.command = [self.command]
        # Get the name #
        if 'job_name' not in self.kwargs:
            hashed  = hashlib.md5(''.join(self.command)).digest()
            encoded = base64.urlsafe_b64encode(hashed)
            self.kwargs['job_name'] = encoded
        # Check we have a project otherwise choose the one with less hours #
        if hostname.startswith('milou'):
            if 'project' not in self.kwargs and 'SBATCH_ACCOUNT' not in os.environ:
                if projects: self.kwargs['project'] = projects[0]['name']

    def set_paths(self, base_dir, script_path):
        """Set the directory, the script path and the outfile path"""
        # Make absolute paths #
        if 'change_dir' in self.kwargs:
            self.kwargs['change_dir'] = DirectoryPath(os.path.abspath(self.kwargs['change_dir']))
        if 'out_file' in self.kwargs:
            self.kwargs['out_file']   = FilePath(os.path.abspath(self.kwargs['out_file']))
        # In case there is a base directory #
        if base_dir is not None:
            self.base_dir             = DirectoryPath(os.path.abspath(base_dir))
            self.script_path          = FilePath(base_dir + "run." + self.extensions[self.language])
            self.kwargs['change_dir'] = base_dir
            self.kwargs['out_file']   = FilePath(base_dir + "run.out")
        # Other cases #
        if base_dir is None and script_path is None: self.script_path = FilePath(new_temp_path())
        if script_path is not None: self.script_path = FilePath(os.path.abspath(script_path))

    @property_cached
    def slurm_params(self):
        """The list of parameters to give to the `sbatch` command."""
        # Main loop #
        result = OrderedDict()
        for param, info in self.slurm_headers.items():
            if not info['needed'] and param not in self.kwargs: continue
            if param in self.kwargs: result[param] = self.kwargs.get(param)
            else:                    result[param] = info['default']
        # Special cases #
        if result.get('cluster') == 'halvan': result['partition'] = 'halvan'
        # Return #
        return result

    @property
    def script(self):
        """The script to be submitted to the SLURM queue."""
        self.shebang_header = self.shebang_headers[self.language]
        self.slurm_header   = [self.slurm_headers[k]['tag'] % v for k,v in self.slurm_params.items()]
        self.script_header  = self.script_headers[self.language]
        self.script_footer  = self.script_footers[self.language]
        return '\n'.join(flatter([self.shebang_header,
                                  self.slurm_header,
                                  self.script_header,
                                  self.command,
                                  self.script_footer]))

    def make_script(self):
        """Make the script and return a FilePath object pointing to the script above."""
        self.script_path.write(self.script)
        self.script_path.permissions.make_executable()
        return self.script_path

    @property
    def log(self):
        """The log as a FilePath object"""
        return self.slurm_params['out_file']

    @property
    def log_tail(self):
        """If we have a log file, what is its tail"""
        if not self.kwargs['out_file'].exists: return False
        else: return tail(self.slurm_params['out_file'])

    @property
    def status(self):
        """What is the status of the job ?"""
        # If there is no script it is either ready or a lost duplicate #
        if not self.script_path.exists:
            if self.name in     jobs.names: return "DUPLICATE"
            if self.name not in jobs.names: return "READY"
        # It is submitted already #
        if self.name in jobs.names:
            if jobs[self.name]['type'] == 'queued':  return "QUEUED"
            if jobs[self.name]['type'] == 'running': return "RUNNING"
        # So the script exists for sure but it is not in the queue #
        if not self.kwargs['out_file'].exists: return "ABORTED"
        # Let's look in the log file (slurmstepd writes "CANCELLED" on scancel) #
        if 'CANCELLED'         in self.log_tail: return "CANCELLED"
        if 'slurmstepd: error' in self.log_tail: return "CANCELLED"
        # It all looks good #
        if 'SLURM: end at'     in self.log_tail: return "FINISHED"
        # At this point we have no idea #
        return "INTERRUPTED"

    @property
    def info(self):
        """Get the existing job information dictionary"""
        if self.name not in jobs: return {'status': self.status}
        else:                     return jobs[self.name]

    #-------------------------------------------------------------------------#
    def run(self):
        """Will call self.launch() after performing some checks"""
        # Check already exists #
        if self.status == "READY": return self.launch()
        # Check name conflict #
        if self.status == "DUPLICATE":  message = "Job with same name '%s' already in queue, but we lost the script."
        if self.status == "QUEUED":     message = "Job '%s' already in queue."
        if self.status == "RUNNING":    message = "Job '%s' already running."
        if self.status == "FINISHED":   message = "Job '%s' already ended successfully."
        if self.status == "ABORTED":    message = "Job '%s' was killed without any output file (?)."
        if self.status == "CANCELED":   message = "Job '%s' was canceled or killed while running."
        if self.status == "INTERUPTED": message = "Job '%s' is not running. We don't know why. Look at the log file."
        print Color.i_red + message % (self.name,) + Color.end
        print "Job might have run already (?). Not starting."

    def launch(self):
        """Make the script file and return the newly created job id"""
        # Make script file #
        self.make_script()
        # Do it #
        sbatch_out = sh.sbatch(self.script_path)
        jobs.expire()
        # Message #
        print Color.i_blu + "SLURM:" + Color.end + " " + str(sbatch_out),
        # Return id #
        self.id = int(re.findall("Submitted batch job ([0-9]+)", str(sbatch_out))[0])
        return self.id

    def cancel(self):
        if self.status != "QUEUED" and self.status != "RUNNING":
            raise Exception("Can't cancel job '%s'" % self.name)
        sh.scancel(self.info['jobid'])

    def wait(self):
        """Wait until the job is finished"""
        pass

    #-------------------------------------------------------------------------#
    def run_locally(self):
        """A convenience method to run the same result as a SLURM job
        but locally in a non-blocking way. Useful for testing."""
        self.thread = threading.Thread(target=self.execute_locally)
        self.thread.daemon = True # So that they die when we die
        self.thread.start()

    def execute_locally(self):
        """Runs the equivalent command locally in a blocking way."""
        # Make script file #
        self.make_script()
        # Do it #
        with open(self.kwargs['out_file'], 'w') as handle:
            sh.python(self.script_path, _out=handle, _err=handle)

    def wait_locally(self):
        """If you have run the query in a non-blocking way, call this method to pause
        until the query is finished."""
        try: self.thread.join(sys.maxint) # maxint timeout so that we can Ctrl-C them
        except KeyboardInterrupt: print "Stopped waiting on job '%s'" % self.kwargs['job_name']
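
Putting the pieces together: with only the defaults above, the `script` property renders to something along these lines (placeholders in angle brackets vary per machine, command and user):

    #!/usr/bin/env python
    #SBATCH -J <base64-encoded md5 of the command>
    #SBATCH -D <current working directory>
    #SBATCH -o /dev/null
    #SBATCH -t 7-00:00:00
    #SBATCH -N 1
    #SBATCH -n <num_processors>
    #SBATCH -p node
    #SBATCH --mail-type=END
    import dateutil.tz, datetime, platform
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    now = now.strftime("%Y-%m-%d %Hh%Mm%Ss %Z%z")
    node = platform.node()
    print "SLURM: start at {0} on {1}".format(now, node)
    print 'Hello world'
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    now = now.strftime("%Y-%m-%d %Hh%Mm%Ss %Z%z")
    print "SLURM: end at {0}".format(now)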
Example #22
 def to_qual(self, path):
     with open(path, 'w') as handle:
         for r in self:
             SeqIO.write(r, handle, 'qual')
     return FilePath(path)
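
Biopython's 'qual' writer pulls Phred scores from each record's `letter_annotations`, so this only works on records that carry quality information. A hedged call site (attribute names assumed):

    qual = reads.to_qual(reads.path + '.qual')
    print qual   # the FilePath returned above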