Example #1
0
class ETE3PAML(object):
    """Use ETE3's M1 model to run PAML's codeml for orthology inference."""

    def __init__(self, alignmentfile, speciestree, workdir=''):
        """Initialize main variables/files to be used.

        :param alignmentfile:  Input alignment file in fasta format
        :param speciestree:  A newick formatted species tree.
        :param workdir:  Directory of alignment file and species tree.
                         (Default value = '')
        """
        self.ete3paml_log = LogIt().default(logname="ete3paml", logfile=None)
        self.alignmentfile = alignmentfile
        self.speciestree = speciestree
        self.workdir = workdir

        # Import your species tree
        self._speciestree = Tree(self.speciestree, format=1)
        # TODO import organisms list

        # Import alignment file as string.  A context manager guarantees the
        # file handle is closed even if reading raises.
        with open(self.alignmentfile, 'r') as alignment_file:
            self.aln_str = alignment_file.read()

    def prune_tree(self, organisms):
        """Prune branches for species not in the alignment file.

        Keep branches in the species tree for species in the alignment file
        Some species may not be present in the alignment file due to lack of
        matching with blast or simply the gene not being in the genome.

        :param organisms: A list of organisms in the alignment file, or a
                          path to a csv file containing that list.
        """

        if os.path.isfile(organisms):
            organismslist = csvtolist(organisms)
        else:
            organismslist = organisms

        branches2keep = []
        for organism in organismslist:
            if organism in self.aln_str:
                branches2keep.append(organism)
            else:
                self.ete3paml_log.warning('No sequence for %s.' % organism)

        # Prune and write the tree exactly once, after ALL organisms have
        # been examined.  (Previously these calls sat inside the loop, so
        # the tree was pruned with a partial branch list and the file was
        # rewritten on every iteration.)
        self._speciestree.prune(branches2keep, preserve_branch_length=True)

        # Write the tree to a file
        self._speciestree.write(
            outfile=os.path.join(self.workdir, 'temptree.nw'))
        self.ete3paml_log.info('temptree.nw was created.')

    def run(self, pamlsrc, output_folder, model='M1'):
        """Run PAML using ETE.

        The default model is M1 as it is best for orthology inference in
        our case. You can use models `M2`, `M0`, `M3`.

        Ensure that you have the correct path to your codeml binary. It should
        be in the paml `/bin`.

        :param pamlsrc: Path to the codeml binary.
        :param output_folder: The name of the output folder.
        :param model: The model to be used. (Default value = 'M1')
        """

        # Import the newick tree.  prune_tree() writes it under workdir, so
        # read it from the same place.  (With the default workdir='' the
        # joined path is just 'temptree.nw', as before.)
        tree = EvolTree(os.path.join(self.workdir, 'temptree.nw'))

        # Import the alignment
        tree.link_to_alignment(self.alignmentfile)

        tree.workdir = self.workdir

        # Set the binpath of the codeml binary
        tree.execpath = pamlsrc
        # Run the model M1, M2, M3, or M0
        model_path = model + '.' + output_folder
        tree.run_model(model_path)
        self.ete3paml_log.info('Codeml is generating data in %s.' % model_path)
Example #2
0
def archive(database_path, archive_path, option, delete_flag=False):
    """Archive a database directory from a Cookie templated directory structure.

    This utility creates a YAML config dictionary that contains path-like
    objects for archiving.  The original data
    can be moved to the archive path or deleted all together.

    :param database_path:  A path to a folder that consists of the desired data.
    :param archive_path:  A path to an output folder for archived data.
    :param option:  An option for the archiving strategy.  Will be one of the
                    keys in the archive_options.
    :param delete_flag:  A flag for deleting the original data.  USE WITH CAUTION.
    :return:  Returns a list of paths to the *.tar.xz archive of the data
              and/or a path to the original data.

    """

    archive_dict = {}
    archive_list = []
    archive_log = LogIt().default(logname="Archive", logfile=None)

    if option == "Full":
        full_path = Path(database_path) / archive_options["Full"]
        for folder in os.listdir(str(full_path)):
            # Test the entry relative to full_path; a bare name would be
            # checked against the current working directory instead.
            if os.path.isdir(str(full_path / folder)):
                archive_dict[folder] = Path(database_path) / folder
    elif isinstance(option, list):
        for opt in option:
            other_path = Path(database_path) / archive_options[opt]
            archive_dict[opt] = other_path
    else:
        other_path = Path(database_path) / archive_options[option]
        archive_dict[option] = other_path

    for arch_name, data_path in archive_dict.items():
        root_dir = str(data_path.parent)
        base_dir = str(data_path.stem)
        # strftime takes its format string positionally (there is no `fmt`
        # keyword; passing one raises TypeError).
        d = datetime.datetime.now().strftime("%Y-%m-%d_%H%M")
        output_pathname = archive_path / Path(arch_name + "." + d)
        # Archive the desired data.
        data_size = get_size(start_path=str(data_path))
        archive_log.info("Archiving %s of data." % data_size)
        archive_filename = shutil.make_archive(base_name=str(output_pathname),
                                               format="xztar",
                                               root_dir=root_dir,
                                               base_dir=base_dir)
        archive_size = get_size(archive_filename)
        # Size first, filename second, to match the message's placeholders.
        archive_log.warning("A %s archive file was created at %s." %
                            (archive_size, archive_filename))
        # TODO-ROB:  Logging.  And log to a README.md file.
        # Delete the files if desired.
        if delete_flag:
            archive_log.critical(
                "The original data will be deleted recursively at %s." %
                data_path)
            from OrthoEvol import OrthoEvolWarning
            OrthoEvolWarning(
                "You're about to delete your database (%s).  Are you sure??" %
                data_path)
            shutil.rmtree(path=data_path)
            archive_list.append(str(archive_filename))
        else:
            archive_log.critical(
                "The original data will be moved recursively from %s to %s." %
                (data_path, output_pathname))
            output_pathname.mkdir()
            shutil.move(src=str(data_path), dst=str(output_pathname))
            shutil.move(src=str(archive_filename), dst=str(output_pathname))
            archive_list.append(str(output_pathname))

        # Recreate the (now removed/moved) data directory so the templated
        # structure stays intact for future runs.
        Path(data_path).mkdir(parents=True, exist_ok=True)
    return archive_list
Example #3
0
class BaseSGEJob(object):
    """Base class for simple jobs."""

    def __init__(self, base_jobname, config=None):
        """Initialize job attributes.

        :param base_jobname: The base name used for job identification.
        :param config: Optional mapping of job attributes; falls back to the
                       module defaults when not given. (Default value = None)
        """
        self.base_jobname = base_jobname
        if not config:
            self.default_job_attributes = __DEFAULT__
        else:
            self.default_job_attributes = config
        self.file2str = file2str
        self.sgejob_log = LogIt().default(logname="SGE JOB", logfile=None)
        self.pbsworkdir = os.getcwd()

        # Import the temp.pbs file using pkg_resources
        self.temp_pbs = resource_filename(templates.__name__, "temp.pbs")

    @classmethod
    def _configure(cls, length, base_jobname):
        """Configure job attributes or set it up.

        :param length: Length passed through to basejobids.
        :param base_jobname: The base job name to configure.
        :return: A (baseid, base) tuple from basejobids.
        """

        baseid, base = basejobids(length, base_jobname)
        return baseid, base

    def debug(self, code):
        """Debug the SGEJob.

        :param code: The code to debug.
        :raises NotImplementedError: Always; debugging is not implemented.
        """

        raise NotImplementedError

    def _cleanup(self, jobname):
        """Clean up job scripts.

        :param jobname: The name of the job being run or to be run.
        """

        self.sgejob_log.warning('Your job will now be cleaned up.')
        os.remove(jobname + '.pbs')
        self.sgejob_log.warning('%s.pbs has been deleted.', jobname)
        os.remove(jobname + '.py')
        # Lazy %-style args, consistent with the .pbs message above.
        self.sgejob_log.warning('%s.py has been deleted.', jobname)

    def wait_on_job_completion(self, job_id):
        """Use Qstat to monitor your job until it completes.

        :param job_id: The job id to be monitored.
        """

        # TODO Allow either slack notifications or email or text.
        # Poll in a loop rather than recursing: a long-running job would
        # otherwise eventually exhaust the recursion limit.
        while True:
            qwatch = Qstat().watch(job_id)
            if qwatch == 'Job id not found.':
                self.sgejob_log.info('%s has finished.' % job_id)
                sleep(30)
                return
            elif qwatch == 'Waiting for %s to start running.' % job_id:
                self.sgejob_log.info('%s is queued to run.' % job_id)
                self.sgejob_log.info('Waiting for %s to start.' % job_id)
                sleep(30)
            elif qwatch == 'Waiting for %s to finish running.' % job_id:
                self.sgejob_log.info('%s is running.' % job_id)
                self.sgejob_log.info('Waiting for %s to finish.' % job_id)
                sleep(30)
            # Any other status: poll again immediately (matches the
            # original immediate re-check in the unknown-status case).

    def submitjob(self, cleanup, wait=True):
        """Submit a job using qsub.

        :param cleanup: If truthy, delete the job scripts after a failed
                        submission or once the awaited job completes.
        :param wait: Block until the submitted job finishes.
                     (Default value = True)
        """
        try:
            # NOTE(review): assumes a subclass sets self.jobname before
            # calling submitjob — confirm against subclasses.
            cmd = 'qsub ' + self.jobname + '.pbs'  # this is the command
            # Shell MUST be True
            cmd_status = run(cmd, stdout=PIPE, stderr=PIPE, shell=True, check=True)
        except CalledProcessError as err:
            self.sgejob_log.error(err.stderr.decode('utf-8'))
            if cleanup:
                self._cleanup(self.jobname)
        else:
            # check=True guarantees returncode == 0 here: a nonzero exit
            # raises CalledProcessError and is handled above, so the old
            # "unsuccessful" else-branch was unreachable.
            # When a qsub job is submitted, the stdout is the job id.
            submitted_jobid = cmd_status.stdout.decode('utf-8')
            self.sgejob_log.info(self.jobname + ' was submitted.')
            self.sgejob_log.info('Your job id is: %s' % submitted_jobid)
            if wait is True:
                self.wait_on_job_completion(submitted_jobid)
                self._cleanup(self.jobname)