class TaxaClassifier(object):
    """Can assign taxonomy to a FASTA file of random shotgun sequences."""

    all_paths = """
    /tree.txt
    /annot.txt
    /graph.xml
    /graph.png
    """

    def __repr__(self):
        return '<%s object of %s>' % (self.__class__.__name__, self.parent)

    def __init__(self):
        # Files #
        self.taxa_file      = FilePath(self.base_dir + self.sample.name + '.txt')
        self.class_out_file = FilePath(self.base_dir + self.sample.type + '.txt')
        self.class_out_file.link_from(self.taxa_file, safe=True)
        # NMDS #
        self.coord = (0, 0)

    def make_graph(self):
        # Convert #
        graphconv = sh.Command(home + "share/metaphlan/plotting_scripts/metaphlan2graphlan.py")
        graphconv(self.taxa_file, '--tree_file', self.p.tree, '--annot_file', self.p.annot)
        # Annotate #
        annotate = sh.Command(home + "share/graphlan/graphlan_annotate.py")
        annotate('--annot', self.p.annot, self.p.tree, self.p.xml)
        # Graph #
        graphlan = sh.Command(home + "share/graphlan/graphlan.py")
        graphlan('--dpi', 200, self.p.xml, self.p.png)
def overwrite_cache(self, value):
    # Where should we look in the file system ? #
    if 'cache_dir' in self.__dict__:
        path = FilePath(self.__dict__['cache_dir'] + f.func_name + '.pickle')
    else:
        path = getattr(self.p, f.func_name)
    # Deleting is allowed, setting is not #
    if value is None:
        path.remove()
    else:
        raise Exception("You can't set a pickled property, you can only delete it")
class Project(Aggregate):
    """A project containing several samples. You can describe your projects
    in a JSON file placed in the repository."""

    all_paths = Aggregate.all_paths + """
    /info.json
    /samples/
    """

    def __repr__(self):
        return '<%s object "%s" with %i samples>' % \
               (self.__class__.__name__, self.name, len(self))

    def __init__(self, json_path, project_dir):
        # Parse the JSON file describing the project #
        self.json_path = FilePath(json_path)
        with open(json_path) as handle: self.info = json.load(handle)
        # Required parameters #
        self.num       = self.info['project_num']
        self.name      = self.info['project_name']
        self.long_name = self.info['project_long_name']
        # Optional parameters #
        self.abstract = self.info.get('abstract')
        self.run_name = self.info.get('illumina_run_id')
        self.account  = self.info.get('uppmax_project_id')
        # Base directory #
        self.base_dir = project_dir + self.name + '/'
        # Delayed init #
        self.loaded = False

    def load(self):
        """A delayed kind of __init__ that is not called right away, to avoid
        crowding the RAM of the python interpreter when you just import gefes."""
        # Load #
        self.loaded = True
        # Automatic paths #
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Make an alias to the json #
        self.json_path.link_to(self.p.info_json, safe=True)
        # Make all the sample objects that this project possesses #
        if self.info.get('auto_parse_samples'):
            search_dir = self.info['samples_base_dir']
            search_dir = os.path.expanduser(search_dir)
            paths = glob.glob(search_dir + '*.fastq*')
            if not paths: paths = glob.glob(search_dir + '*.fasta*')
            if not paths: raise Exception("Found no FASTA or FASTQ path in %s" % search_dir)
            if all([re.search("_R[12]_", p) for p in paths]): pairs = join_paired_filepaths(paths)
            else:                                             pairs = sort_string_by_pairs(paths)
            self.samples = [Sample(self, p[0], p[1], num=i) for i, p in enumerate(pairs)]
            self.samples.sort(key=lambda x: natural_sort(x.name))
        else:
            self.samples = [Sample(self, info=info) for info in self.info['samples']]
        # The samples of a project are its children, in a way #
        self.children = self.samples
        # Call the mother function #
        return Aggregate.load(self)
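# Usage sketch (assumption: this example is not part of the original source;
# the JSON path and project directory below are invented placeholders). It
# shows the intended two-step life cycle: a cheap __init__ at import time,
# then a heavy load() only when the project is actually needed.
project = Project("/home/user/json/lake_study.json", "/home/user/projects/")
project.load()                    # builds AutoPaths and the Sample objects
print project                     # e.g. <Project object "lake_study" with 12 samples>
for sample in project.samples:
    print sample.name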
def build_tree_raxml(self, new_path=None, seq_type='nucl', num_threads=None,
                     free_cores=2, keep_dir=False):
    """Make a tree with RAxML. The `seq_type` should be either 'nucl' or 'prot'."""
    # Check output #
    if new_path is None: new_path = self.prefix_path + '.tree'
    # What model to choose #
    if seq_type == 'nucl': model = "GTRGAMMA"
    if seq_type == 'prot': model = "PROTGAMMAJTTF"
    # Threads #
    if num_threads is None: num_threads = multiprocessing.cpu_count() - free_cores
    else:                   num_threads = int(num_threads) - free_cores
    num_threads = max(1, num_threads)
    # Run it #
    temp_dir = new_temp_dir()
    sh.raxml811('-m', model, "-T", num_threads, '-p', 1, '-s', self.path,
                '-n', 'tree', '-w', temp_dir, '-f', 'a', '-x', 1, '-N', 'autoMR')
    # Move into place #
    if keep_dir:
        # Don't crash on the first run, when the destination doesn't exist yet #
        if os.path.exists(new_path): shutil.rmtree(new_path)
        shutil.move(temp_dir, new_path)
    if not keep_dir:
        shutil.move(temp_dir + 'RAxML_bestTree.tree', new_path)
    # Return #
    return FilePath(new_path)
def build_tree_fast(self, new_path=None, seq_type='nucl'):
    """Make a tree with FastTree. The `seq_type` should be either 'nucl' or
    'prot'. Names will be truncated however."""
    # Check output #
    if new_path is None: new_path = self.prefix_path + '.tree'
    # Command #
    command_args = []
    if seq_type == 'nucl': command_args += ['-nt']
    command_args += ['-gamma']
    command_args += ['-out', new_path]
    command_args += [self.path]
    # Run it #
    sh.FastTree(*command_args)
    # Return #
    return FilePath(new_path)
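# Usage sketch for the two tree-building methods above (assumption: not part
# of the original source; `AlignedFASTA` is a hypothetical name for the class
# these methods belong to, and the file path is an invented placeholder).
seqs  = AlignedFASTA('/data/proteins.aln')
quick = seqs.build_tree_fast(seq_type='prot')         # FastTree: fast, truncates names
best  = seqs.build_tree_raxml(seq_type='prot',        # RAxML: slower, bootstrapped
                              num_threads=8)
print quick, best                                     # both are FilePath objects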
def tree(self):
    """The path to the tree built with RAxML."""
    tree = FilePath(self.p.tree_dir + 'RAxML_bestTree.tree')
    if not tree.exists:
        # Check we can do it #
        if self.gaps_in_alignment:
            message = "Can't build a tree for cluster %i because of gaps. Skipping."
            warnings.warn(message % self.num)
            return None
        # Do it #
        self.alignment.build_tree(new_path    = self.p.tree_dir,
                                  seq_type    = self.analysis.seq_type,
                                  num_threads = self.analysis.num_threads,
                                  free_cores  = 0,
                                  keep_dir    = True)
    return tree
def retrieve_from_cache(self):
    # Is it in the cache ? #
    if '__cache__' not in self.__dict__: self.__cache__ = {}
    if f.__name__ in self.__cache__: return self.__cache__[f.__name__]
    # Where should we look in the file system ? #
    if 'cache_dir' in self.__dict__:
        path = FilePath(self.__dict__['cache_dir'] + f.func_name + '.pickle')
    else:
        path = getattr(self.p, f.func_name)
    # Is it on disk ? #
    if path.exists:
        with open(path) as handle: result = pickle.load(handle)
        self.__cache__[f.__name__] = result
        return result
    # Otherwise let's compute it #
    result = f(self)
    with open(path, 'w') as handle: pickle.dump(result, handle)
    self.__cache__[f.__name__] = result
    return result
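# Both `overwrite_cache` (earlier) and `retrieve_from_cache` reference a free
# variable `f`, so they only make sense as closures inside a decorator
# factory. Below is a minimal, self-contained sketch of how such a decorator
# could wire the two together. The name `pickled_property` and the simplified
# path logic are assumptions for illustration, not the original code.
import os, pickle

def pickled_property(f):
    """Like @property, but the first computed result is pickled to disk and
    reused on later accesses; assigning None to the attribute clears the
    on-disk cache."""
    def retrieve_from_cache(self):
        if '__cache__' not in self.__dict__: self.__cache__ = {}
        if f.__name__ in self.__cache__: return self.__cache__[f.__name__]
        path = self.cache_dir + f.__name__ + '.pickle'
        if os.path.exists(path):
            with open(path) as handle: result = pickle.load(handle)
        else:
            result = f(self)
            with open(path, 'w') as handle: pickle.dump(result, handle)
        self.__cache__[f.__name__] = result
        return result
    def overwrite_cache(self, value):
        if value is not None:
            raise Exception("You can't set a pickled property, you can only delete it")
        path = self.cache_dir + f.__name__ + '.pickle'
        if os.path.exists(path): os.remove(path)
        if '__cache__' in self.__dict__: self.__cache__.pop(f.__name__, None)
    return property(retrieve_from_cache, overwrite_cache)

# Hypothetical usage: the first access computes and pickles, later accesses
# read back from memory or disk; `compute_expensive_matrix` is a placeholder.
class Analysis(object):
    cache_dir = '/tmp/analysis_cache/'   # assumed to exist
    @pickled_property
    def distance_matrix(self):
        return compute_expensive_matrix()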
class Foraminifera(Database):
    """This is a custom database containing exclusively Foraminifera sequences.

    https://genev.unige.ch/research/laboratory/Jan-Pawlowski

    You should place the file "foram_db_cor.fasta" in: ~/databases/foraminifera/
    Then you can run this:

        from seqsearch.databases.foraminifera import foraminifera
        foraminifera.process()
        print foraminifera.tax_depth_freq
    """

    short_name = "foraminifera"
    long_name  = 'The custom made Foraminifera database as received by email on 7th April 2017'

    all_paths = """
    /foram_db_cor.fasta
    /foram_mothur.fasta
    /foram_mothur.tax
    """

    @property
    def rank_names(self):
        """The names of the ranks. Total 9 ranks."""
        return ['Domain',    # 0
                'Kingdom',   # 1
                'Phylum',    # 2
                'Class',     # 3
                'Order',     # 4
                'Family',    # 5
                'Tribe',     # 6
                'Genus',     # 7
                'Species']   # 8

    def __init__(self, base_dir=None):
        # Base directory #
        if base_dir is None: base_dir = home
        self.base_dir = base_dir + 'databases/' + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # The results #
        self.alignment = FASTA(self.p.mothur_fasta)
        self.taxonomy  = FilePath(self.p.mothur_tax)
        # The part that mothur will use for naming files #
        self.nickname = "foram_mothur"

    def process(self):
        # The file that was received by email without documentation T_T #
        raw = FASTA(self.p.cor)
        # Open files #
        self.alignment.create()
        self.taxonomy.create()
        # Loop #
        for seq in raw:
            # Parse #
            name = seq.id[11:].split('|')
            num = name.pop(0)
            # Check #
            for x in name: assert ';' not in x
            for x in name: assert '\t' not in x
            # Make ranks #
            ranks = ['Eukaryota',      # 0 Domain
                     'Rhizaria',       # 1 Kingdom
                     'Foraminifera',   # 2 Phylum
                     name[0],          # 3 Class
                     name[1],          # 4 Order
                     name[2],          # 5 Family
                     name[3],          # 6 Tribe
                     name[4],          # 7 Genus
                     name[5]]          # 8 Species
            # The taxonomy string #
            tax_line = ';'.join(ranks)
            # Add sequence to the new fasta file #
            self.alignment.add_str(str(seq.seq), name="foram" + num)
            # Add the taxonomy to the tax file #
            self.taxonomy.add_str("foram" + num + '\t' + tax_line + '\n')
        # Close files #
        self.alignment.close()
        self.taxonomy.close()
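# For reference: every record process() writes to the .tax file pairs the
# renamed sequence with a semicolon-joined nine-rank string. A hypothetical
# output line (all rank values after Phylum are invented placeholders, the
# tab is shown as \t) would look like:
#
#   foram0042\tEukaryota;Rhizaria;Foraminifera;SomeClass;SomeOrder;SomeFamily;SomeTribe;SomeGenus;Some_species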
def web_export(self):
    """Copy the report to the webexport directory where it can be
    viewed by anyone."""
    dest = FilePath(("/proj/%s/webexport/" + self.location) % self.sample.account)
    dest.make_directory()
    shutil.copy(self.output_path, dest)
class JobSLURM(object):
    """Makes launching SLURM jobs easy to write and easy to use. Here are
    some examples on how to use this class:

        for command in ['print "hi"', 'print "hello"']:
            job = JobSLURM(command, time='00:01:00', qos='short')
            job.run()

        for path in ['~/data/scaffolds1.txt', '~/data/scaffolds2.txt', '~/data/scaffolds3.txt']:
            command  = ['import sh\n']
            command += ['script = sh.Command("analyze.py")\n']
            command += ['script(%s)' % path]
            job = JobSLURM(command, time='00:01:00', qos='short', job_name=path[-25:])
            job.run()
            print "Job %i is running!" % job.id

    Then you can easily check the status of your job and the standard out
    that is associated:

        print job.status
        print job.log_tail
        print job.info['time_left']

    etc.
    """

    extensions = {'bash':   "sh",
                  'python': "py"}

    shebang_headers = {
        'bash':   ["#!/bin/bash -le"],  # As a login shell and stop on error
        'python': ["#!/usr/bin/env python"],
    }

    slurm_headers = OrderedDict((
        ('job_name'   , {'tag': '#SBATCH -J %s',              'needed': True}),
        ('change_dir' , {'tag': '#SBATCH -D %s',              'needed': True,  'default': os.path.abspath(os.getcwd())}),
        ('out_file'   , {'tag': '#SBATCH -o %s',              'needed': True,  'default': '/dev/null'}),
        ('project'    , {'tag': '#SBATCH -A %s',              'needed': False, 'default': 'b2011035'}),
        ('time'       , {'tag': '#SBATCH -t %s',              'needed': True,  'default': '7-00:00:00'}),
        ('machines'   , {'tag': '#SBATCH -N %s',              'needed': True,  'default': '1'}),
        ('cores'      , {'tag': '#SBATCH -n %s',              'needed': True,  'default': num_processors}),
        ('partition'  , {'tag': '#SBATCH -p %s',              'needed': True,  'default': 'node'}),
        ('email'      , {'tag': '#SBATCH --mail-user %s',     'needed': False, 'default': os.environ.get('EMAIL')}),
        ('email-when' , {'tag': '#SBATCH --mail-type=%s',     'needed': True,  'default': 'END'}),
        ('qos'        , {'tag': '#SBATCH --qos=%s',           'needed': False, 'default': 'short'}),
        ('dependency' , {'tag': '#SBATCH -d %s',              'needed': False, 'default': 'afterok:1'}),
        ('constraint' , {'tag': '#SBATCH -C %s',              'needed': False, 'default': 'fat'}),
        ('cluster'    , {'tag': '#SBATCH -M %s',              'needed': False, 'default': 'milou'}),
        ('alloc'      , {'tag': '#SBATCH --reservation=%s',   'needed': False, 'default': 'workstation'}),
        ('jobid'      , {'tag': '#SBATCH --jobid=%i',         'needed': False, 'default': 2173455}),
        ('memory'     , {'tag': '#SBATCH --mem=%i',           'needed': False, 'default': 120000}),
        ('mem_per_cpu', {'tag': '#SBATCH --mem-per-cpu=%i',   'needed': False, 'default': 512}),
        ('threads'    , {'tag': '#SBATCH --cpus-per-task=%i', 'needed': False, 'default': num_processors}),
    ))

    script_headers = {
        'bash':   ['echo "SLURM: start at $(date) on $(hostname)"'],
        'python': ['import dateutil.tz, datetime, platform',
                   'now = datetime.datetime.now(dateutil.tz.tzlocal())',
                   r'now = now.strftime("%Y-%m-%d %Hh%Mm%Ss %Z%z")',
                   'node = platform.node()',
                   'print "SLURM: start at {0} on {1}".format(now, node)'],
    }

    script_footers = {
        'bash':   ['echo "SLURM: end at $(date)"'],
        'python': ['now = datetime.datetime.now(dateutil.tz.tzlocal())',
                   r'now = now.strftime("%Y-%m-%d %Hh%Mm%Ss %Z%z")',
                   'print "SLURM: end at {0}".format(now)'],
    }

    def __repr__(self):
        return '<%s object "%s">' % (self.__class__.__name__, self.name)

    @property
    def name(self): return self.kwargs['job_name']

    def __init__(self,
                 command     = ["print 'Hello world'"],
                 language    = 'python',
                 base_dir    = None,
                 script_path = None,
                 **kwargs):
        # Required attributes #
        self.command  = command
        self.language = language
        self.kwargs   = kwargs
        # Set the file paths #
        self.set_paths(base_dir, script_path)
        # Check command type #
        if not isinstance(self.command, list): self.command = [self.command]
        # Get the name #
        if 'job_name' not in self.kwargs:
            hashed  = hashlib.md5(''.join(self.command)).digest()
            encoded = base64.urlsafe_b64encode(hashed)
            self.kwargs['job_name'] = encoded
        # Check we have a project, otherwise choose the one with the fewest hours #
        if hostname.startswith('milou'):
            if 'project' not in self.kwargs and 'SBATCH_ACCOUNT' not in os.environ:
                if projects: self.kwargs['project'] = projects[0]['name']

    def set_paths(self, base_dir, script_path):
        """Set the directory, the script path and the outfile path."""
        # Make absolute paths #
        if 'change_dir' in self.kwargs:
            self.kwargs['change_dir'] = DirectoryPath(os.path.abspath(self.kwargs['change_dir']))
        if 'out_file' in self.kwargs:
            self.kwargs['out_file'] = FilePath(os.path.abspath(self.kwargs['out_file']))
        # In case there is a base directory #
        if base_dir is not None:
            self.base_dir             = DirectoryPath(os.path.abspath(base_dir))
            self.script_path          = FilePath(base_dir + "run." + self.extensions[self.language])
            self.kwargs['change_dir'] = base_dir
            self.kwargs['out_file']   = FilePath(base_dir + "run.out")
        # Other cases #
        if base_dir is None and script_path is None:
            self.script_path = FilePath(new_temp_path())
        if script_path is not None:
            self.script_path = FilePath(os.path.abspath(script_path))

    @property_cached
    def slurm_params(self):
        """The list of parameters to give to the `sbatch` command."""
        # Main loop #
        result = OrderedDict()
        for param, info in self.slurm_headers.items():
            if not info['needed'] and not param in self.kwargs: continue
            if param in self.kwargs: result[param] = self.kwargs.get(param)
            else:                    result[param] = info['default']
        # Special cases #
        if result.get('cluster') == 'halvan': result['partition'] = 'halvan'
        # Return #
        return result

    @property
    def script(self):
        """The script to be submitted to the SLURM queue."""
        self.shebang_header = self.shebang_headers[self.language]
        self.slurm_header   = [self.slurm_headers[k]['tag'] % v for k, v in self.slurm_params.items()]
        self.script_header  = self.script_headers[self.language]
        self.script_footer  = self.script_footers[self.language]
        return '\n'.join(flatter([self.shebang_header,
                                  self.slurm_header,
                                  self.script_header,
                                  self.command,
                                  self.script_footer]))

    def make_script(self):
        """Make the script and return a FilePath object pointing to the script above."""
        self.script_path.write(self.script)
        self.script_path.permissions.make_executable()
        return self.script_path

    @property
    def log(self):
        """The log as a FilePath object."""
        return self.slurm_params['out_file']

    @property
    def log_tail(self):
        """If we have a log file, what is its tail."""
        if not self.kwargs['out_file'].exists: return False
        else: return tail(self.slurm_params['out_file'])

    @property
    def status(self):
        """What is the status of the job?"""
        # If there is no script it is either ready or a lost duplicate #
        if not self.script_path.exists:
            if self.name in jobs.names:     return "DUPLICATE"
            if self.name not in jobs.names: return "READY"
        # It is submitted already #
        if self.name in jobs.names:
            if jobs[self.name]['type'] == 'queued':  return "QUEUED"
            if jobs[self.name]['type'] == 'running': return "RUNNING"
        # So the script exists for sure, but it is not in the queue #
        if not self.kwargs['out_file'].exists: return "ABORTED"
        # Let's look in the log file #
        if 'CANCELED' in self.log_tail:          return "CANCELLED"
        if 'slurmstepd: error' in self.log_tail: return "CANCELLED"
        # It all looks good #
        if 'SLURM: end at' in self.log_tail: return "FINISHED"
        # At this point we have no idea #
        return "INTERRUPTED"

    @property
    def info(self):
        """Get the existing job information dictionary."""
        if self.name not in jobs: return {'status': self.status}
        else:                     return jobs[self.name]

    #-------------------------------------------------------------------------#
    def run(self):
        """Will call self.launch() after performing some checks."""
        # Check if it is ready #
        if self.status == "READY": return self.launch()
        # Check name conflict #
        if self.status == "DUPLICATE":   message = "Job with same name '%s' already in queue, but we lost the script."
        if self.status == "QUEUED":      message = "Job '%s' already in queue."
        if self.status == "RUNNING":     message = "Job '%s' already running."
        if self.status == "FINISHED":    message = "Job '%s' already ended successfully."
        if self.status == "ABORTED":     message = "Job '%s' was killed without any output file (?)."
        if self.status == "CANCELLED":   message = "Job '%s' was cancelled or killed while running."
        if self.status == "INTERRUPTED": message = "Job '%s' is not running. We don't know why. Look at the log file."
        print Color.i_red + message % (self.name,) + Color.end
        print "Job might have run already (?). Not starting."

    def launch(self):
        """Make the script file and return the newly created job id."""
        # Make script file #
        self.make_script()
        # Do it #
        sbatch_out = sh.sbatch(self.script_path)
        jobs.expire()
        # Message #
        print Color.i_blu + "SLURM:" + Color.end + " " + str(sbatch_out),
        # Return id #
        self.id = int(re.findall("Submitted batch job ([0-9]+)", str(sbatch_out))[0])
        return self.id

    def cancel(self):
        if self.status != "QUEUED" and self.status != "RUNNING":
            raise Exception("Can't cancel job '%s'" % self.name)
        sh.scancel(self.info['jobid'])

    def wait(self):
        """Wait until the job is finished."""
        pass

    #-------------------------------------------------------------------------#
    def run_locally(self):
        """A convenience method to run the same result as a SLURM job, but
        locally, in a non-blocking way. Useful for testing."""
        self.thread = threading.Thread(target=self.execute_locally)
        self.thread.daemon = True  # So that they die when we die
        self.thread.start()

    def execute_locally(self):
        """Runs the equivalent command locally, in a blocking way."""
        # Make script file #
        self.make_script()
        # Do it #
        with open(self.kwargs['out_file'], 'w') as handle:
            sh.python(self.script_path, _out=handle, _err=handle)

    def wait_locally(self):
        """If you have run the query in a non-blocking way, call this method
        to pause until the query is finished."""
        try: self.thread.join(sys.maxint)  # maxint timeout so that we can Ctrl-C them
        except KeyboardInterrupt: print "Stopped waiting on job '%s'" % self.kwargs['job_name']
class QualityReads(object):
    """A set of sequences determined to be quality controlled."""

    all_paths = """
    /mothur_reads.fasta
    /mothur_reads.qual
    /mothur_groups.tsv
    /qiime_reads.fasta
    /only_used_samples.fasta
    /trimmed.fasta
    """

    def __repr__(self):
        return '<%s object of %s>' % (self.__class__.__name__, self.parent)

    def __len__(self): return len(self.trimmed)

    def __init__(self, path, parent):
        # Save parent #
        self.parent, self.pool = parent, parent
        self.samples = parent.samples
        # Auto paths #
        self.base_dir = parent.p.quality_dir + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Files #
        self.untrimmed = BarcodedFASTQ(path, samples=self.samples)
        self.only_used = BarcodedFASTA(self.p.only_used, samples=self.samples)
        self.trimmed   = FASTA(self.p.trimmed)
        # Qiime output #
        self.qiime_fasta = FASTA(self.p.qiime_fasta)
        # Mothur #
        self.mothur_fasta  = FASTA(self.p.mothur_fasta)
        self.mothur_qual   = QualFile(self.p.mothur_qual)
        self.mothur_groups = FilePath(self.p.mothur_groups)
        # Primer size #
        self.trim_fwd = self.pool.samples.trim_fwd
        self.trim_rev = self.pool.samples.trim_rev

    def filter_unused(self):
        def no_unused_iterator(reads):
            for r in reads.parse_barcodes():
                if r.first.sample.used: yield r.read
        self.only_used.write(no_unused_iterator(self.untrimmed))

    def trim_primers(self):
        def no_primers_iterator(reads):
            for read in reads:
                yield read[self.trim_fwd:-self.trim_rev]
        self.trimmed.write(no_primers_iterator(self.only_used))

    def make_mothur_output(self):
        # Trimmed fasta #
        self.mothur_fasta.link_from(self.trimmed.path)
        # The groups file #
        self.mothur_groups.create()
        for r in self.only_used.parse_barcodes():
            sample_name = r.first.sample.short_name
            read_name   = '%s\t%s\n' % (r.read.id, sample_name)
            self.mothur_groups.handle.write(read_name)
        self.mothur_groups.close()

    def make_qiime_output(self):
        # Prepare fasta writer #
        handle = open(self.qiime_fasta.path, 'w')
        writer = FastaWriter(handle, wrap=0)
        writer.write_header()
        # Counter #
        counter = defaultdict(int)
        # Do it #
        for r in self.only_used.parse_barcodes():
            sample_name = r.first.sample.short_name
            counter[sample_name] += 1
            r.read.id = '%s_%i %s' % (sample_name, counter[sample_name], r.read.id)
            bar_seq = r.read.seq[0:self.pool.bar_len]
            r.read.description = "orig_bc=%s new_bc=%s bc_diffs=0" % (bar_seq, bar_seq)
            writer.write_record(r.read[self.trim_fwd:-self.trim_rev])
        # Close #
        writer.write_footer()
        handle.close()
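# Usage sketch (assumption: this example is not part of the original source;
# `pool` is assumed to expose `p.quality_dir` and a `samples` collection with
# trim lengths, as the constructor expects, and the fastq path is invented).
reads = QualityReads('/data/run7/pool3_barcoded.fastq', pool)
reads.filter_unused()          # keep only reads whose sample is marked as used
reads.trim_primers()           # cut trim_fwd/trim_rev bases off each read
reads.make_mothur_output()     # linked fasta + groups file for mothur
reads.make_qiime_output()      # relabeled fasta with barcode annotations for QIIME
print len(reads), "quality-controlled reads"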
def __init__(self, parent):
    self.parent = parent
    self.path = FilePath(self.parent.prefix_path + '_len_hist.pdf')
class UclustOTUs(OTUs):
    """Will use UCLUST, via the QIIME wrapper, to create OTU clusters from
    a given FASTA file:
    http://qiime.org/scripts/pick_otus.html"""

    short_name = 'uclust'
    title = 'UCLUST-QIIME denovo picking'

    all_paths = """
    /clusters/clusters.uc
    /clusters/qiime.log
    /clusters/all_otus.txt
    /clusters/all_centers.fasta
    /centers.fasta
    /otus.txt
    /taxonomy_silva/
    /taxonomy_fw/
    /graphs/
    """

    def __repr__(self):
        return '<%s object of %s>' % (self.__class__.__name__, self.parent)

    def __init__(self, cluster):
        # Save parent #
        self.cluster, self.parent = cluster, cluster
        # Inherited #
        self.samples = self.parent.samples
        # Paths #
        self.base_dir = self.parent.p.otus_dir + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Main FASTA file #
        self.reads = self.parent.reads
        # Files #
        self.all_otus    = FilePath(self.p.all_otus)
        self.all_centers = FASTA(self.p.all_centers)
        self.otus        = FilePath(self.base_dir + "otus.txt")
        self.centers     = FASTA(self.base_dir + "centers.fasta")
        # Taxonomy #
        self.taxonomy_silva = CrestTaxonomy(self.centers, self, 'silvamod', self.p.silva)
        self.taxonomy_fw    = CrestTaxonomy(self.centers, self, 'freshwater', self.p.fw_dir)
        # Preferred one #
        self.taxonomy = self.taxonomy_silva

    def run(self):
        # Clean #
        shutil.rmtree(self.p.clusters_dir)
        # Run command #
        pick_otus = sh.Command('pick_otus.py')
        pick_otus('-m', 'uclust', '-i', self.reads, '-o', self.p.clusters_dir)
        # Move into place #
        base_name = self.p.clusters_dir + self.reads.prefix
        shutil.move(base_name + '_otus.txt',    self.all_otus)
        shutil.move(base_name + '_otus.log',    self.p.qiime_log)
        shutil.move(base_name + '_clusters.uc', self.p.clusters_uc)
        # Remove OTUs that contain only one read #
        def filter_singletons(f):
            for line in f:
                line = line.split()
                if len(line) > 2: yield '\t'.join(line) + '\n'
        self.otus.writelines(filter_singletons(self.all_otus))
        # Create the centers file that is missing #
        pick_rep = sh.Command('pick_rep_set.py')
        pick_rep('-i', self.all_otus, '-f', self.reads, '-o', self.all_centers)
        # Remake the centers file without the filtered OTUs #
        self.otus_to_keep = [line.split()[0] for line in self.otus]
        def filter_otus(f):
            for seq in f:
                if seq.id in self.otus_to_keep: yield seq
        self.centers.write(filter_otus(self.all_centers))

    @property_cached
    def cluster_counts_table(self):
        """Create the unfiltered OTU table."""
        # Put results in a dict of dicts #
        result = defaultdict(lambda: defaultdict(int))
        # Loop #
        for line in self.otus:
            # Parse the line #
            contents = line.split()
            otu, reads = contents[0], contents[1:]
            # Parse the hits #
            for r in reads:
                nums = re.findall("run([0-9]+)_pool([0-9]+)_sample([0-9]+)_read([0-9]+)", r)
                if nums:
                    run_num, pool_num, sample_num, read_num = map(int, nums[0])
                    sample = illumitag.runs[run_num][pool_num-1][sample_num-1]
                    name = sample.short_name
                else:
                    nums = re.findall("run([0-9]+)_sample([0-9]+)_read([0-9]+)", r)
                    run_num, sample_num, read_num = map(int, nums[0])
                    sample = [s for s in illumitag.presamples + illumitag.pyrosamples
                              if s.run_num == run_num and s.num == sample_num][0]
                    name = sample.short_name
                # Count #
                result[otu][name] += 1
        # Return #
        result = pandas.DataFrame(result)
        result = result.fillna(0)
        result = result.astype(int)
        result = result.reindex_axis(sorted(result.columns, key=natural_sort), axis=1)
        return result
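# Usage sketch (assumption: this example is not part of the original source;
# `cluster` is assumed to provide `p.otus_dir`, `samples` and a `reads`
# FASTA, as the constructor expects).
otus = UclustOTUs(cluster)
otus.run()                           # pick OTUs, drop singletons, rebuild centers
table = otus.cluster_counts_table    # pandas DataFrame: OTUs as columns, samples as rows
print table.sum()                    # total read count per OTU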
def __new__(cls, path=None, content=None, **kwargs):
    handle = open(path, 'w') if path else tempfile.NamedTemporaryFile(delete=False, **kwargs)
    if content: handle.write(content)
    handle.close()
    return FilePath.__new__(cls, handle.name)
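# Usage sketch (assumption: this __new__ belongs to a FilePath subclass for
# temporary files; `TmpFile` is a hypothetical name for that class).
tmp = TmpFile(content="ACGT\n")   # no path given: a NamedTemporaryFile is created
print tmp                          # the FilePath points at the written file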
def to_qual(self, path):
    with open(path, 'w') as handle:
        for r in self: SeqIO.write(r, handle, 'qual')
    return FilePath(path)
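# Usage sketch (assumption: this example is not part of the original source;
# `reads` is assumed to iterate over Biopython SeqRecord objects carrying the
# per-base quality annotations required by the 'qual' format, and the output
# path is an invented placeholder).
qual = reads.to_qual('/tmp/reads.qual')
print qual   # the FilePath of the newly written .qual file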