class ETE3PAML(object):
    """Use ETE3's M1 model to run PAML's codeml for orthology inference."""

    def __init__(self, alignmentfile, speciestree, workdir=''):
        """Initialize main variables/files to be used.

        :param alignmentfile: Input alignment file in fasta format.
        :param speciestree: A newick formatted species tree.
        :param workdir: Directory of alignment file and species tree.
                        (Default value = '')
        """
        self.ete3paml_log = LogIt().default(logname="ete3paml", logfile=None)
        self.alignmentfile = alignmentfile
        self.speciestree = speciestree
        self.workdir = workdir

        # Import your species tree.
        self._speciestree = Tree(self.speciestree, format=1)

        # TODO import organisms list

        # Read the alignment file into a string.  A context manager
        # guarantees the handle is closed even if .read() raises (the
        # original left the file open on error).
        with open(self.alignmentfile, 'r') as alignment_file:
            self.aln_str = alignment_file.read()

    def prune_tree(self, organisms):
        """Prune branches for species not in the alignment file.

        Keep branches in the species tree for species in the alignment file.
        Some species may not be present in the alignment file due to lack of
        matching with blast or simply the gene not being in the genome.

        :param organisms: A list of organisms in the alignment file, or a
                          path to a csv file containing them.
        """
        # Accept either a csv file path or an in-memory list.
        if os.path.isfile(organisms):
            organismslist = csvtolist(organisms)
        else:
            organismslist = organisms

        # Keep only species that actually appear in the alignment string.
        branches2keep = []
        for organism in organismslist:
            if organism in self.aln_str:
                branches2keep.append(organism)
            else:
                self.ete3paml_log.warning('No sequence for %s.' % organism)

        self._speciestree.prune(branches2keep, preserve_branch_length=True)

        # Write the pruned tree to the working directory.
        self._speciestree.write(
            outfile=os.path.join(self.workdir, 'temptree.nw'))
        self.ete3paml_log.info('temptree.nw was created.')

    def run(self, pamlsrc, output_folder, model='M1'):
        """Run PAML using ETE.

        The default model is M1 as it is best for orthology inference in
        our case.  You can use models `M2`, `M0`, `M3`.

        Ensure that you have the correct path to your codeml binary.  It
        should be in the paml `/bin`.

        :param pamlsrc: Path to the codeml binary.
        :param output_folder: The name of the output folder.
        :param model: The model to be used. (Default value = 'M1')
        """
        # Import the newick tree from the working directory.  prune_tree()
        # writes temptree.nw into self.workdir, so read it from there
        # (the original read it from the cwd, which could differ).
        tree = EvolTree(os.path.join(self.workdir, 'temptree.nw'))

        # Import the alignment.
        tree.link_to_alignment(self.alignmentfile)

        tree.workdir = self.workdir

        # Set the binpath of the codeml binary.
        tree.execpath = pamlsrc

        # Run the model M1, M2, M3, or M0.
        model_path = model + '.' + output_folder
        tree.run_model(model_path)
        self.ete3paml_log.info('Codeml is generating data in %s.' % model_path)
def archive(database_path, archive_path, option, delete_flag=False):
    """Archive a database directory from a Cookie templated directory structure.

    This utility creates a YAML config dictionary that contains path-like
    objects for archiving.  The original data can be moved to the archive
    path or deleted all together.

    :param database_path: A path to a folder that consists of the desired data.
    :param archive_path: A path to an output folder for archived data.
    :param option: An option for the archiving strategy.  Will be one of the
                   keys in the archive_options.
    :param delete_flag: A flag for deleting the original data.  USE WITH
                        CAUTION. (Default value = False)
    :return: Returns a list of paths to the *.tar.xz archive of the data
             and/or a path to the original data.
    """
    archive_dict = {}
    archive_list = []
    archive_log = LogIt().default(logname="Archive", logfile=None)

    # Normalize once so plain strings work too (the original mixed
    # `str / Path` in the "Full" branch, which raises TypeError).
    database_path = Path(database_path)
    archive_path = Path(archive_path)

    if option == "Full":
        full_path = database_path / archive_options["Full"]
        for folder in os.listdir(str(full_path)):
            # Test the entry under full_path; a bare name would be checked
            # relative to the cwd and almost always be False.
            if os.path.isdir(str(full_path / folder)):
                archive_dict[folder] = database_path / folder
    elif isinstance(option, list):
        for opt in option:
            archive_dict[opt] = database_path / archive_options[opt]
    else:
        archive_dict[option] = database_path / archive_options[option]

    for arch_name, data_path in archive_dict.items():
        root_dir = str(data_path.parent)
        base_dir = str(data_path.stem)
        # strftime's format is positional-only; passing it as the keyword
        # `fmt=` (as the original did) raises TypeError.
        d = datetime.datetime.now().strftime("%Y-%m-%d_%H%M")
        output_pathname = archive_path / (arch_name + "." + d)

        # Archive the desired data.
        data_size = get_size(start_path=str(data_path))
        archive_log.info("Archiving %s of data." % data_size)
        archive_filename = shutil.make_archive(base_name=str(output_pathname),
                                               format="xztar",
                                               root_dir=root_dir,
                                               base_dir=base_dir)
        archive_size = get_size(archive_filename)
        # Size first, then filename (the original had the arguments swapped).
        archive_log.warning("A %s archive file was created at %s."
                            % (archive_size, archive_filename))

        # TODO-ROB: Logging. And log to a README.md file.
        # Delete the files if desired.
        if delete_flag:
            archive_log.critical(
                "The original data will be deleted recursively at %s."
                % data_path)
            from OrthoEvol import OrthoEvolWarning
            OrthoEvolWarning(
                "You're about to delete your database (%s). Are you sure??"
                % data_path)
            shutil.rmtree(path=data_path)
            archive_list.append(str(archive_filename))
        else:
            archive_log.critical(
                "The original data will be moved recursively from %s to %s."
                % (data_path, output_pathname))
            output_pathname.mkdir()
            shutil.move(src=str(data_path), dst=str(output_pathname))
            shutil.move(src=str(archive_filename), dst=str(output_pathname))
            archive_list.append(str(output_pathname))
            # Recreate the (now moved) source directory so the templated
            # layout stays intact for future runs.
            Path(data_path).mkdir(parents=True, exist_ok=True)
    return archive_list
class BaseSGEJob(object):
    """Base class for simple SGE jobs."""

    def __init__(self, base_jobname, config=None):
        """Initialize job attributes.

        :param base_jobname: The base name used to derive job names/ids.
        :param config: Optional mapping of job attributes; falls back to
                       __DEFAULT__ when not given.
        """
        self.base_jobname = base_jobname
        if not config:
            self.default_job_attributes = __DEFAULT__
        else:
            self.default_job_attributes = config
        self.file2str = file2str
        self.sgejob_log = LogIt().default(logname="SGE JOB", logfile=None)
        self.pbsworkdir = os.getcwd()

        # Import the temp.pbs file using pkg_resources
        self.temp_pbs = resource_filename(templates.__name__, "temp.pbs")

    @classmethod
    def _configure(cls, length, base_jobname):
        """Configure job attributes or set it up.

        :param length: Length passed through to basejobids.
        :param base_jobname: The base job name to configure.
        :return: A (baseid, base) tuple from basejobids.
        """
        baseid, base = basejobids(length, base_jobname)
        return baseid, base

    def debug(self, code):
        """Debug the SGEJob.

        :param code: Debug code (unused; subclasses must implement).
        """
        raise NotImplementedError

    def _cleanup(self, jobname):
        """Clean up job scripts.

        :param jobname: The name of the job being run or to be run.
        """
        self.sgejob_log.warning('Your job will now be cleaned up.')
        os.remove(jobname + '.pbs')
        self.sgejob_log.warning('%s.pbs has been deleted.', jobname)
        os.remove(jobname + '.py')
        self.sgejob_log.warning('%s.py has been deleted.' % jobname)

    def wait_on_job_completion(self, job_id):
        """Use Qstat to monitor your job.

        Polls every 30 seconds (recursively) until the job id disappears
        from the queue.

        :param job_id: The job id to be monitored.
        """
        # TODO Allow either slack notifications or email or text.
        qwatch = Qstat().watch(job_id)
        if qwatch == 'Job id not found.':
            self.sgejob_log.info('%s has finished.' % job_id)
            sleep(30)
        elif qwatch == 'Waiting for %s to start running.' % job_id:
            self.sgejob_log.info('%s is queued to run.' % job_id)
            self.sgejob_log.info('Waiting for %s to start.' % job_id)
            sleep(30)
            self.wait_on_job_completion(job_id)
        elif qwatch == 'Waiting for %s to finish running.' % job_id:
            self.sgejob_log.info('%s is running.' % job_id)
            self.sgejob_log.info('Waiting for %s to finish.' % job_id)
            sleep(30)
            self.wait_on_job_completion(job_id)
        else:
            self.wait_on_job_completion(job_id)

    def submitjob(self, cleanup, wait=True):
        """Submit a job using qsub.

        :param cleanup: Whether to delete the job scripts after a failed
                        submission (and, with wait, after completion).
        :param wait: Block until the job finishes, then clean up.
                     (Default value = True)
        """
        try:
            # Argument-list form without a shell: avoids shell injection via
            # the job name and is portable.  (The original passed a single
            # pre-joined string inside a list with shell=True.)
            cmd = ['qsub', self.jobname + '.pbs']
            cmd_status = run(cmd, stdout=PIPE, stderr=PIPE, check=True)
        except CalledProcessError as err:
            self.sgejob_log.error(err.stderr.decode('utf-8'))
            if cleanup:
                self._cleanup(self.jobname)
        else:
            # check=True guarantees returncode == 0 here, so no re-check is
            # needed (the original's "not submitted" branch was unreachable).
            # When a qsub job is submitted, the stdout is the job id; strip
            # the trailing newline so Qstat message comparisons match.
            submitted_jobid = cmd_status.stdout.decode('utf-8').strip()
            self.sgejob_log.info(self.jobname + ' was submitted.')
            self.sgejob_log.info('Your job id is: %s' % submitted_jobid)
            if wait is True:
                self.wait_on_job_completion(submitted_jobid)
                self._cleanup(self.jobname)