Beispiel #1
0
def finish_pasta_execution(pasta_team,
                          user_config,
                          temporaries_dir,
                          pasta_products,
                          multilocus_dataset):
    """Drive a PASTA run from an already-loaded dataset to the written results.

    The steps, in order:

    1. Save the configuration to ``last_used.cfg`` inside *temporaries_dir*
       and, when ``options.timesfile`` is set, register the timing log.
    2. Start the worker threads with ``num_cpus`` from the "sate" section.
    3. When ``options.treefile`` is set, read the starting tree(s) from it
       (only the first tree is used). This happens before the taxa are
       relabelled.
    4. Relabel the taxa, convert RNA data to DNA, and write the
       safe-name/original-name translation file (best effort).
    5. Install ``killed_handler`` for SIGTERM, SIGABRT and SIGINT.
    6. Obtain a starting tree: either the user-supplied one, or one built by
       an initial alignment (if needed) followed by an initial tree search.
    7. Build a ``PastaJob`` and, unless ``options.two_phase`` is set, run it,
       optionally followed by a RAxML search.
    8. Restore the original taxon names (and RNA characters) and write the
       alignments, tree and score to the streams of *pasta_products*.

    The workers are stopped and the previous signal handlers are restored in
    a ``finally`` clause once the handlers have been installed.

    :param pasta_team: supplies ``aligner``, ``hmmeralign``,
        ``tree_estimator``, ``raxml_tree_estimator`` and ``temp_fs``.
    :param user_config: configuration object; its ``commandline`` section is
        read (and its ``datatype`` temporarily rewritten) and its "sate"
        section supplies the PASTA settings.
    :param temporaries_dir: directory in which ``last_used.cfg`` and the
        ``init_aln`` / ``init_tree`` / ``post_tree`` sub-directories are made.
    :param pasta_products: supplies output paths and the alignment, tree and
        score output streams.
    :param multilocus_dataset: the input sequences, iterated as one alignment
        per locus; it is modified in place.
    :raises Exception: if ``options.treefile`` is set but the file does not
        exist. Errors raised while reading the tree file are reported through
        ``MESSENGER`` and re-raised.
    """
    global _RunningJobs

    options = user_config.commandline

    user_config.save_to_filepath(os.path.join(temporaries_dir, 'last_used.cfg'))
    if options.timesfile:
        # Open in append mode and close immediately so that the file exists
        # before it is registered as the timing log.
        f = open_with_intermediates(options.timesfile, 'a')
        f.close()
        set_timing_log_filepath(options.timesfile)

    ############################################################################
    # Launch threads to do work
    #####
    pasta_config = user_config.get("sate")
    start_worker(pasta_config.num_cpus)

    ############################################################################
    # We must read the incoming tree in before we call the get_sequences_for_pasta
    #   function that relabels that taxa in the dataset
    ######
    alignment_as_tmp_filename_to_report = None
    tree_as_tmp_filename_to_report = None
    starting_tree = None

    tree_file = options.treefile
    if tree_file:
        if not os.path.exists(tree_file):
            raise Exception('The tree file "%s" does not exist' % tree_file)
        # The context manager closes the file even when parsing fails.
        # NOTE: 'rU' is the Python 2 universal-newline mode; the 'U' flag was
        # removed in Python 3.11, so this line must change when porting.
        with open(tree_file, 'rU') as tree_f:
            MESSENGER.send_info('Reading starting trees from "%s"...' % tree_file)
            try:
                tree_list = read_and_encode_splits(multilocus_dataset.dataset, tree_f,
                        starting_tree=True)
            except KeyError:
                MESSENGER.send_error("Error in reading the treefile, probably due to a name in the tree that does not match the names in the input sequence files.\n")
                raise
            except:
                # Report and re-raise: nothing is swallowed here.
                MESSENGER.send_error("Error in reading the treefile.\n")
                raise
        if len(tree_list) > 1:
            MESSENGER.send_warning('%d starting trees found in "%s". The first tree will be used.' % (len(tree_list), tree_file))
        starting_tree = tree_list[0]
        score = None
        tree_as_tmp_filename_to_report = tree_file

    ############################################################################
    # This will relabel the taxa if they have problematic names
    #####
    multilocus_dataset.relabel_for_pasta()

    ############################################################################
    # This ensures all nucleotide data is DNA internally
    #####
    restore_to_rna = False
    if user_config.commandline.datatype.upper() == 'RNA':
        multilocus_dataset.convert_rna_to_dna()
        user_config.commandline.datatype = 'DNA'
        restore_to_rna = True

    export_names = True
    if export_names:
        # Fallback value so that the failure message below can always be
        # formatted, even when resolving the output path is what failed.
        name_filename = 'name_translation.txt'
        try:
            name_filename = pasta_products.get_abs_path_for_tag('name_translation.txt')
            with open(name_filename, 'w') as name_output:
                safe2real = multilocus_dataset.safe_to_real_names
                for safe in sorted(safe2real.keys()):
                    orig = safe2real[safe][0]
                    name_output.write("%s\n%s\n\n" % (safe, orig))
            MESSENGER.send_info("Name translation information saved to %s as safe name, original name, blank line format." % name_filename)
        except Exception:
            # Best effort: a failure to export the names must not abort the run.
            MESSENGER.send_info("Error exporting saving name translation to %s" % name_filename)

    if options.aligned:
        # Only keep the "aligned" flag when every locus really is aligned.
        options.aligned = all( [i.is_aligned() for i in multilocus_dataset] )

    ############################################################################
    # Be prepared to kill any long running jobs
    #####
    prev_signals = []
    for sig in [signal.SIGTERM, signal.SIGABRT, signal.SIGINT]:
        prev_handler = signal.signal(sig, killed_handler)
        prev_signals.append((sig, prev_handler))

    try:
        pasta_config_dict = pasta_config.dict()

        if (not options.two_phase) and tree_file:
            # getting the newick string here will allow us to get a string that is in terms of the correct taxon labels
            starting_tree_str = starting_tree.compose_newick()
        else:
            if not options.two_phase:
                MESSENGER.send_info("Creating a starting tree for the PASTA algorithm...")
            if (options.two_phase) or (not options.aligned):
                MESSENGER.send_info("Performing initial alignment of the entire data matrix...")
                init_aln_dir = os.path.join(temporaries_dir, 'init_aln')
                init_aln_dir = pasta_team.temp_fs.create_subdir(init_aln_dir)
                delete_aln_temps = not (options.keeptemp and options.keepalignmenttemps)
                aln_job_list = []
                query_fns = []
                for unaligned_seqs in multilocus_dataset:
                    # A random sample of at most 100 sequences forms the
                    # backbone that is aligned first.
                    backbone = sample(unaligned_seqs.keys(), min(100,len(unaligned_seqs)))
                    backbone_seqs = unaligned_seqs.sub_alignment(backbone)

                    query_seq=list(set(unaligned_seqs.keys()) - set(backbone))
                    qn = len(query_seq)
                    # One chunk per 50 remaining sequences, capped at four
                    # chunks per CPU; sequences are dealt out with a stride.
                    chunks = min(int(4*pasta_config.num_cpus),int(ceil(qn/50.0)))
                    _LOG.debug("Will align the remaining %d sequences in %d chunks" %(qn,chunks))
                    for ch in xrange(0,chunks):
                        # NOTE(review): the file name depends only on the
                        # chunk index, so it repeats for every locus.
                        query_fn = os.path.join(init_aln_dir, "query-%d.fasta"%ch)
                        qa = unaligned_seqs.sub_alignment(query_seq[ch:qn:chunks])
                        _LOG.debug("Chunk with %d sequences built" %len(qa))
                        qa.write_filepath(query_fn)
                        query_fns.append(query_fn)

                    job = pasta_team.aligner.create_job(backbone_seqs,
                                                       tmp_dir_par=init_aln_dir,
                                                       context_str="initalign",
                                                       delete_temps=delete_aln_temps,
                                                       num_cpus=pasta_config.num_cpus)
                    aln_job_list.append(job)
                _RunningJobs = aln_job_list
                for job in aln_job_list:
                    jobq.put(job)

                # NOTE(review): `job` is the last backbone job created above,
                # so only that job's result is used, and the merged alignment
                # is stored at index 0 below.
                new_alignment = compact(job.get_results())

                add_job_list = []
                for query_fn in query_fns:
                    job = pasta_team.hmmeralign.create_job(new_alignment, query_fn,
                                                        tmp_dir_par=init_aln_dir,
                                                        context_str="initalign",
                                                        delete_temps=delete_aln_temps)
                    add_job_list.append(job)
                # NOTE(review): the add jobs are not registered in
                # _RunningJobs while they run - confirm that this is intended.
                _RunningJobs = None
                for job in add_job_list:
                    jobq.put(job)
                for job in add_job_list:
                    new_alignment.merge_in(compact(job.get_results()))
                multilocus_dataset[0] = new_alignment

                if delete_aln_temps:
                    pasta_team.temp_fs.remove_dir(init_aln_dir)
            else:
                MESSENGER.send_info("Input sequences assumed to be aligned (based on sequence lengths).")

            MESSENGER.send_info("Performing initial tree search to get starting tree...")
            init_tree_dir = os.path.join(temporaries_dir, 'init_tree')
            init_tree_dir = pasta_team.temp_fs.create_subdir(init_tree_dir)
            delete_tree_temps = not options.keeptemp
            job = pasta_team.tree_estimator.create_job(multilocus_dataset,
                                                    tmp_dir_par=init_tree_dir,
                                                    num_cpus=pasta_config.num_cpus,
                                                    context_str="inittree",
                                                    delete_temps=delete_tree_temps,
                                                    pasta_products=pasta_products,
                                                    step_num='initialsearch',
                                                    mask_gappy_sites = pasta_config_dict['mask_gappy_sites'])
            _RunningJobs = job
            jobq.put(job)
            score, starting_tree_str = job.get_results()
            _RunningJobs = None
            alignment_as_tmp_filename_to_report = pasta_products.get_abs_path_for_iter_output("initialsearch", TEMP_SEQ_ALIGNMENT_TAG, allow_existing=True)
            tree_as_tmp_filename_to_report = pasta_products.get_abs_path_for_iter_output("initialsearch", TEMP_TREE_TAG, allow_existing=True)
            if delete_tree_temps:
                pasta_team.temp_fs.remove_dir(init_tree_dir)
        _LOG.debug('We have the tree and whole_alignment, partitions...')

        if options.keeptemp:
            pasta_config_dict['keep_iteration_temporaries'] = True
            if options.keepalignmenttemps:
                pasta_config_dict['keep_realignment_temporaries'] = True

        job = PastaJob(multilocus_dataset=multilocus_dataset,
                        pasta_team=pasta_team,
                        name=options.job,
                        status_messages=MESSENGER.send_info,
                        score=score,
                        **pasta_config_dict)
        if starting_tree is not None:
            job.tree = generate_tree_with_splits_from_tree(starting_tree, force_fully_resolved = True)
        else:
            job.tree_str = starting_tree_str
        job.curr_iter_align_tmp_filename = alignment_as_tmp_filename_to_report
        job.curr_iter_tree_tmp_filename = tree_as_tmp_filename_to_report
        if score is not None:
            # A score only exists when the initial tree search was run.
            job.store_optimum_results(new_multilocus_dataset=multilocus_dataset,
                    new_tree_str=starting_tree_str,
                    new_score=score,
                    curr_timestamp=time.time())

        if options.two_phase:
            MESSENGER.send_info("Exiting with the initial tree because the PASTA algorithm is avoided when the --two-phase option is used.")
        else:
            _RunningJobs = job
            MESSENGER.send_info("Starting PASTA algorithm on initial tree...")
            job.run(tmp_dir_par=temporaries_dir, pasta_products=pasta_products)
            _RunningJobs = None

            if job.return_final_tree_and_alignment:
                alignment_as_tmp_filename_to_report = job.curr_iter_align_tmp_filename
            else:
                alignment_as_tmp_filename_to_report = job.best_alignment_tmp_filename

            if user_config.commandline.raxml_search_after:
                raxml_model = user_config.raxml.model.strip()
                if not raxml_model:
                    # No RAxML model configured: derive one from the datatype
                    # and the model of the main tree estimator.
                    dt = user_config.commandline.datatype
                    mf = pasta_team.tree_estimator.model
                    ms =  fasttree_to_raxml_model_str(dt, mf)
                    pasta_team.raxml_tree_estimator.model = ms
                rte = pasta_team.raxml_tree_estimator
                MESSENGER.send_info("Performing post-processing tree search in RAxML...")
                post_tree_dir = os.path.join(temporaries_dir, 'post_tree')
                post_tree_dir = pasta_team.temp_fs.create_subdir(post_tree_dir)
                delete_tree_temps = not options.keeptemp
                starting_tree = None
                if user_config.sate.start_tree_search_from_current:
                    starting_tree = job.tree
                post_job = rte.create_job(job.multilocus_dataset,
                                    starting_tree=starting_tree,
                                    num_cpus=pasta_config.num_cpus,
                                    context_str="postraxtree",
                                    tmp_dir_par=post_tree_dir,
                                    delete_temps=delete_tree_temps,
                                    pasta_products=pasta_products,
                                    step_num="postraxtree",
                                    mask_gappy_sites = pasta_config_dict['mask_gappy_sites'])
                _RunningJobs = post_job
                jobq.put(post_job)
                post_score, post_tree = post_job.get_results()
                _RunningJobs = None
                tree_as_tmp_filename_to_report = pasta_products.get_abs_path_for_iter_output("postraxtree", TEMP_TREE_TAG, allow_existing=True)
                if delete_tree_temps:
                    pasta_team.temp_fs.remove_dir(post_tree_dir)
                job.tree_str = post_tree
                job.score = post_score
                if post_score > job.best_score:
                    job.best_tree_str = post_tree
                    job.best_score = post_score
            else:
                if job.return_final_tree_and_alignment:
                    tree_as_tmp_filename_to_report = job.curr_iter_tree_tmp_filename
                else:
                    tree_as_tmp_filename_to_report = job.best_tree_tmp_filename

        #######################################################################
        # Restore original taxon names and RNA characters
        #####
        job.multilocus_dataset.restore_taxon_names()
        if restore_to_rna:
            job.multilocus_dataset.convert_dna_to_rna()
            user_config.commandline.datatype = 'RNA'

        assert len(pasta_products.alignment_streams) == len(job.multilocus_dataset)
        for i, alignment in enumerate(job.multilocus_dataset):
            alignment_stream = pasta_products.alignment_streams[i]
            MESSENGER.send_info("Writing resulting alignment to %s" % alignment_stream.name)
            alignment.write(alignment_stream, file_format="FASTA")
            alignment_stream.close()

        MESSENGER.send_info("Writing resulting tree to %s" % pasta_products.tree_stream.name)
        tree_str = job.tree.compose_newick()
        pasta_products.tree_stream.write("%s;\n" % tree_str)

        MESSENGER.send_info("Writing resulting likelihood score to %s" % pasta_products.score_stream.name)
        pasta_products.score_stream.write("%s\n" % job.score)

        if alignment_as_tmp_filename_to_report is not None:
            MESSENGER.send_info('The resulting alignment (with the names in a "safe" form) was first written as the file "%s"' % alignment_as_tmp_filename_to_report)
        if tree_as_tmp_filename_to_report is not None:
            MESSENGER.send_info('The resulting tree (with the names in a "safe" form) was first written as the file "%s"' % tree_as_tmp_filename_to_report)

    finally:
        stop_worker()
        for el in prev_signals:
            sig, prev_handler = el
            # signal.signal() returns None when the previous handler was not
            # installed from Python; fall back to the default action then.
            if prev_handler is None:
                signal.signal(sig, signal.SIG_DFL)
            else:
                signal.signal(sig, prev_handler)
Beispiel #2
0
def finish_pasta_execution(pasta_team, user_config, temporaries_dir,
                           pasta_products, multilocus_dataset):
    """Drive a PASTA run from an already-loaded dataset to the written results.

    The steps, in order:

    1. Save the configuration to ``last_used.cfg`` inside *temporaries_dir*
       and, when ``options.timesfile`` is set, register the timing log.
    2. Start the worker threads with ``num_cpus`` from the "sate" section.
    3. When ``options.treefile`` is set, read the starting tree(s) from it
       (only the first tree is used). This happens before the taxa are
       relabelled.
    4. Relabel the taxa, convert RNA data to DNA, and write the
       safe-name/original-name translation file (best effort).
    5. Install ``killed_handler`` for SIGTERM, SIGABRT and SIGINT.
    6. Obtain a starting tree: either the user-supplied one, or one built by
       an initial alignment (if needed) followed by an initial tree search.
    7. Build a ``PastaJob`` and, unless ``options.two_phase`` is set, run it,
       optionally followed by a RAxML search.
    8. Restore the original taxon names (and RNA characters), then write and
       close the alignment, tree and score streams of *pasta_products*.

    The workers are stopped and the previous signal handlers are restored in
    a ``finally`` clause once the handlers have been installed.

    :param pasta_team: supplies ``aligner``, ``hmmeralign``,
        ``tree_estimator``, ``raxml_tree_estimator`` and ``temp_fs``.
    :param user_config: configuration object; its ``commandline`` section is
        read (and its ``datatype`` temporarily rewritten) and its "sate"
        section supplies the PASTA settings.
    :param temporaries_dir: directory in which ``last_used.cfg`` and the
        ``init_aln`` / ``init_tree`` / ``post_tree`` sub-directories are made.
    :param pasta_products: supplies output paths and the alignment, tree and
        score output streams.
    :param multilocus_dataset: the input sequences, iterated as one alignment
        per locus; it is modified in place.
    :raises Exception: if ``options.treefile`` is set but the file does not
        exist. Errors raised while reading the tree file are reported through
        ``MESSENGER`` and re-raised.
    """
    global _RunningJobs

    options = user_config.commandline

    user_config.save_to_filepath(os.path.join(temporaries_dir,
                                              'last_used.cfg'))
    if options.timesfile:
        # Open in append mode and close immediately so that the file exists
        # before it is registered as the timing log.
        f = open_with_intermediates(options.timesfile, 'a')
        f.close()
        set_timing_log_filepath(options.timesfile)

    ############################################################################
    # Launch threads to do work
    #####
    pasta_config = user_config.get("sate")
    start_worker(pasta_config.num_cpus)

    ############################################################################
    # We must read the incoming tree in before we call the get_sequences_for_pasta
    #   function that relabels that taxa in the dataset
    ######
    alignment_as_tmp_filename_to_report = None
    tree_as_tmp_filename_to_report = None
    starting_tree = None

    tree_file = options.treefile
    if tree_file:
        if not os.path.exists(tree_file):
            raise Exception('The tree file "%s" does not exist' % tree_file)
        # Plain 'r' is used because the 'U' mode flag was removed in
        # Python 3.11; text mode already translates all newline styles.
        # The context manager closes the file even when parsing fails.
        with open(tree_file, 'r') as tree_f:
            MESSENGER.send_info('Reading starting trees from "%s"...' %
                                tree_file)
            try:
                tree_list = read_and_encode_splits(multilocus_dataset.dataset,
                                                   tree_f,
                                                   starting_tree=True)
            except KeyError:
                MESSENGER.send_error(
                    "Error in reading the treefile, probably due to a name in the tree that does not match the names in the input sequence files.\n"
                )
                raise
            except:
                # Report and re-raise: nothing is swallowed here.
                MESSENGER.send_error("Error in reading the treefile.\n")
                raise
        if len(tree_list) > 1:
            MESSENGER.send_warning(
                '%d starting trees found in "%s". The first tree will be used.'
                % (len(tree_list), tree_file))
        starting_tree = tree_list[0]
        score = None
        tree_as_tmp_filename_to_report = tree_file

    ############################################################################
    # This will relabel the taxa if they have problematic names
    #####
    multilocus_dataset.relabel_for_pasta()

    ############################################################################
    # This ensures all nucleotide data is DNA internally
    #####
    restore_to_rna = False
    if user_config.commandline.datatype.upper() == 'RNA':
        multilocus_dataset.convert_rna_to_dna()
        user_config.commandline.datatype = 'DNA'
        restore_to_rna = True

    export_names = True
    if export_names:
        # Fallback value so that the failure message below can always be
        # formatted, even when resolving the output path is what failed.
        name_filename = 'name_translation.txt'
        try:
            name_filename = pasta_products.get_abs_path_for_tag(
                'name_translation.txt')
            with open(name_filename, 'w') as name_output:
                safe2real = multilocus_dataset.safe_to_real_names
                for safe in sorted(safe2real.keys()):
                    orig = safe2real[safe][0]
                    name_output.write("%s\n%s\n\n" % (safe, orig))
            MESSENGER.send_info(
                "Name translation information saved to %s as safe name, original name, blank line format."
                % name_filename)
        except Exception:
            # Best effort: a failure to export the names must not abort the run.
            MESSENGER.send_info(
                "Error exporting saving name translation to %s" %
                name_filename)

    if options.aligned:
        # Only keep the "aligned" flag when every locus really is aligned.
        options.aligned = all([i.is_aligned() for i in multilocus_dataset])

    ############################################################################
    # Be prepared to kill any long running jobs
    #####
    prev_signals = []
    for sig in [signal.SIGTERM, signal.SIGABRT, signal.SIGINT]:
        prev_handler = signal.signal(sig, killed_handler)
        prev_signals.append((sig, prev_handler))

    try:
        pasta_config_dict = pasta_config.dict()

        if (not options.two_phase) and tree_file:
            # getting the newick string here will allow us to get a string that is in terms of the correct taxon labels
            starting_tree_str = str(starting_tree)
        else:
            if not options.two_phase:
                MESSENGER.send_info(
                    "Creating a starting tree for the PASTA algorithm...")
            if (options.two_phase) or (not options.aligned):
                MESSENGER.send_info(
                    "Performing initial alignment of the entire data matrix..."
                )
                init_aln_dir = os.path.join(temporaries_dir, 'init_aln')
                init_aln_dir = pasta_team.temp_fs.create_subdir(init_aln_dir)
                delete_aln_temps = not (options.keeptemp
                                        and options.keepalignmenttemps)
                aln_job_list = []
                query_fns = []
                for unaligned_seqs in multilocus_dataset:
                    # A random sample of at most 100 sequences forms the
                    # backbone that is aligned first.
                    backbone = sample(list(unaligned_seqs.keys()),
                                      min(100, len(unaligned_seqs)))
                    backbone_seqs = unaligned_seqs.sub_alignment(backbone)

                    query_seq = list(
                        set(unaligned_seqs.keys()) - set(backbone))
                    qn = len(query_seq)
                    # One chunk per 50 remaining sequences, capped at four
                    # chunks per CPU; sequences are dealt out with a stride.
                    chunks = min(int(4 * pasta_config.num_cpus),
                                 int(ceil(qn / 50.0)))
                    _LOG.debug(
                        "Will align the remaining %d sequences in %d chunks" %
                        (qn, chunks))
                    for ch in range(0, chunks):
                        # NOTE(review): the file name depends only on the
                        # chunk index, so it repeats for every locus.
                        query_fn = os.path.join(init_aln_dir,
                                                "query-%d.fasta" % ch)
                        qa = unaligned_seqs.sub_alignment(
                            query_seq[ch:qn:chunks])
                        _LOG.debug("Chunk with %d sequences built" % len(qa))
                        qa.write_filepath(query_fn)
                        query_fns.append(query_fn)

                    job = pasta_team.aligner.create_job(
                        backbone_seqs,
                        tmp_dir_par=init_aln_dir,
                        context_str="initalign",
                        delete_temps=delete_aln_temps,
                        num_cpus=pasta_config.num_cpus)
                    aln_job_list.append(job)
                _RunningJobs = aln_job_list
                for job in aln_job_list:
                    jobq.put(job)

                # NOTE(review): `job` is the last backbone job created above,
                # so only that job's result is used, and the merged alignment
                # is stored at index 0 below.
                new_alignment = compact(job.get_results())

                add_job_list = []
                for query_fn in query_fns:
                    job = pasta_team.hmmeralign.create_job(
                        new_alignment,
                        query_fn,
                        tmp_dir_par=init_aln_dir,
                        context_str="initalign",
                        delete_temps=delete_aln_temps)
                    add_job_list.append(job)
                # NOTE(review): the add jobs are not registered in
                # _RunningJobs while they run - confirm that this is intended.
                _RunningJobs = None
                for job in add_job_list:
                    jobq.put(job)
                for job in add_job_list:
                    new_alignment.merge_in(compact(job.get_results()))
                multilocus_dataset[0] = new_alignment

                if delete_aln_temps:
                    pasta_team.temp_fs.remove_dir(init_aln_dir)
            else:
                MESSENGER.send_info(
                    "Input sequences assumed to be aligned (based on sequence lengths)."
                )

            MESSENGER.send_info(
                "Performing initial tree search to get starting tree...")
            init_tree_dir = os.path.join(temporaries_dir, 'init_tree')
            init_tree_dir = pasta_team.temp_fs.create_subdir(init_tree_dir)
            delete_tree_temps = not options.keeptemp
            job = pasta_team.tree_estimator.create_job(
                multilocus_dataset,
                tmp_dir_par=init_tree_dir,
                num_cpus=pasta_config.num_cpus,
                context_str="inittree",
                delete_temps=delete_tree_temps,
                pasta_products=pasta_products,
                step_num='initialsearch',
                mask_gappy_sites=pasta_config_dict['mask_gappy_sites'])
            _RunningJobs = job
            jobq.put(job)
            score, starting_tree_str = job.get_results()
            _RunningJobs = None
            alignment_as_tmp_filename_to_report = pasta_products.get_abs_path_for_iter_output(
                "initialsearch", TEMP_SEQ_ALIGNMENT_TAG, allow_existing=True)
            tree_as_tmp_filename_to_report = pasta_products.get_abs_path_for_iter_output(
                "initialsearch", TEMP_TREE_TAG, allow_existing=True)
            if delete_tree_temps:
                pasta_team.temp_fs.remove_dir(init_tree_dir)
        _LOG.debug('We have the tree and whole_alignment, partitions...')

        if options.keeptemp:
            pasta_config_dict['keep_iteration_temporaries'] = True
            if options.keepalignmenttemps:
                pasta_config_dict['keep_realignment_temporaries'] = True

        job = PastaJob(multilocus_dataset=multilocus_dataset,
                       pasta_team=pasta_team,
                       name=options.job,
                       status_messages=MESSENGER.send_info,
                       score=score,
                       **pasta_config_dict)
        if starting_tree is not None:
            job.tree = generate_tree_with_splits_from_tree(
                starting_tree, force_fully_resolved=True)
        else:
            job.tree_str = starting_tree_str
        job.curr_iter_align_tmp_filename = alignment_as_tmp_filename_to_report
        job.curr_iter_tree_tmp_filename = tree_as_tmp_filename_to_report
        if score is not None:
            # A score only exists when the initial tree search was run.
            job.store_optimum_results(
                new_multilocus_dataset=multilocus_dataset,
                new_tree_str=starting_tree_str,
                new_score=score,
                curr_timestamp=time.time())

        if options.two_phase:
            MESSENGER.send_info(
                "Exiting with the initial tree because the PASTA algorithm is avoided when the --two-phase option is used."
            )
        else:
            _RunningJobs = job
            MESSENGER.send_info("Starting PASTA algorithm on initial tree...")
            job.run(tmp_dir_par=temporaries_dir, pasta_products=pasta_products)
            _RunningJobs = None

            if job.return_final_tree_and_alignment:
                alignment_as_tmp_filename_to_report = job.curr_iter_align_tmp_filename
            else:
                alignment_as_tmp_filename_to_report = job.best_alignment_tmp_filename

            if user_config.commandline.raxml_search_after:
                raxml_model = user_config.raxml.model.strip()
                if not raxml_model:
                    # No RAxML model configured: derive one from the datatype
                    # and the model of the main tree estimator.
                    dt = user_config.commandline.datatype
                    mf = pasta_team.tree_estimator.model
                    ms = fasttree_to_raxml_model_str(dt, mf)
                    pasta_team.raxml_tree_estimator.model = ms
                rte = pasta_team.raxml_tree_estimator
                MESSENGER.send_info(
                    "Performing post-processing tree search in RAxML...")
                post_tree_dir = os.path.join(temporaries_dir, 'post_tree')
                post_tree_dir = pasta_team.temp_fs.create_subdir(post_tree_dir)
                delete_tree_temps = not options.keeptemp
                starting_tree = None
                if user_config.sate.start_tree_search_from_current:
                    starting_tree = job.tree
                post_job = rte.create_job(
                    job.multilocus_dataset,
                    starting_tree=starting_tree,
                    num_cpus=pasta_config.num_cpus,
                    context_str="postraxtree",
                    tmp_dir_par=post_tree_dir,
                    delete_temps=delete_tree_temps,
                    pasta_products=pasta_products,
                    step_num="postraxtree",
                    mask_gappy_sites=pasta_config_dict['mask_gappy_sites'])
                _RunningJobs = post_job
                jobq.put(post_job)
                post_score, post_tree = post_job.get_results()
                _RunningJobs = None
                tree_as_tmp_filename_to_report = pasta_products.get_abs_path_for_iter_output(
                    "postraxtree", TEMP_TREE_TAG, allow_existing=True)
                if delete_tree_temps:
                    pasta_team.temp_fs.remove_dir(post_tree_dir)
                job.tree_str = post_tree
                job.score = post_score
                if post_score > job.best_score:
                    job.best_tree_str = post_tree
                    job.best_score = post_score
            else:
                if job.return_final_tree_and_alignment:
                    tree_as_tmp_filename_to_report = job.curr_iter_tree_tmp_filename
                else:
                    tree_as_tmp_filename_to_report = job.best_tree_tmp_filename

        #######################################################################
        # Restore original taxon names and RNA characters
        #####
        job.multilocus_dataset.restore_taxon_names()
        if restore_to_rna:
            job.multilocus_dataset.convert_dna_to_rna()
            user_config.commandline.datatype = 'RNA'

        assert len(pasta_products.alignment_streams) == len(
            job.multilocus_dataset)
        for i, alignment in enumerate(job.multilocus_dataset):
            alignment_stream = pasta_products.alignment_streams[i]
            MESSENGER.send_info("Writing resulting alignment to %s" %
                                alignment_stream.name)
            alignment.write(alignment_stream, file_format="FASTA")
            alignment_stream.close()

        MESSENGER.send_info("Writing resulting tree to %s" %
                            pasta_products.tree_stream.name)
        tree_str = job.tree.compose_newick()
        pasta_products.tree_stream.write("%s;\n" % tree_str)
        pasta_products.tree_stream.close()

        MESSENGER.send_info("Writing resulting likelihood score to %s" %
                            pasta_products.score_stream.name)
        pasta_products.score_stream.write("%s\n" % job.score)
        pasta_products.score_stream.close()

        if alignment_as_tmp_filename_to_report is not None:
            MESSENGER.send_info(
                'The resulting alignment (with the names in a "safe" form) was first written as the file "%s"'
                % alignment_as_tmp_filename_to_report)
        if tree_as_tmp_filename_to_report is not None:
            MESSENGER.send_info(
                'The resulting tree (with the names in a "safe" form) was first written as the file "%s"'
                % tree_as_tmp_filename_to_report)

    finally:
        stop_worker()
        for el in prev_signals:
            sig, prev_handler = el
            # signal.signal() returns None when the previous handler was not
            # installed from Python; fall back to the default action then.
            if prev_handler is None:
                signal.signal(sig, signal.SIG_DFL)
            else:
                signal.signal(sig, prev_handler)
Beispiel #3
0
def finish_pasta_execution(pasta_team,
                           user_config,
                           temporaries_dir,
                           pasta_products,
                           multilocus_dataset):
    """Create (or resume) a PASTA job, run it, and post-process the result.

    The work is delegated to the ``pasta_prelim_step*`` / ``pasta_postproc_step*``
    helpers; this function wires them together and owns the job object:

    * If ``options.resume_state_path`` is set, the job is unpickled from that
      file and its non-pickled state is restored.
    * Otherwise a new ``PastaInterruptibleJob`` (when ``options.interruptible``
      is truthy) or ``PastaJob`` is built and seeded with the starting tree,
      the temporary file names and, when a score is available, the current
      optimum.
    * Unless ``options.two_phase`` is set, ``job.run`` is invoked.  When it
      reports the job as resumable, the job is pickled and the process exits
      through ``sys.exit(0)``; otherwise ``pasta_postproc_step1`` runs.
    * ``pasta_postproc_step2`` always runs last (except after ``sys.exit``).

    Whatever happens inside the ``try`` block, the workers are stopped and
    the signal handlers listed in ``prev_signals`` are reinstated.

    Parameters are passed straight through to the helper steps:

    :param pasta_team: forwarded to the job constructor and helper steps.
    :param user_config: configuration object handed to the helper steps.
    :param temporaries_dir: passed to ``job.run`` as ``tmp_dir_par``.
    :param pasta_products: output handle object passed to ``job.run``.
    :param multilocus_dataset: dataset the job is built on.
    :returns: ``None``.
    :raises SystemExit: with status 0 after pickling a resumable job.
    """
    global _RunningJobs

    (alignment_as_tmp_filename_to_report,
     options,
     pasta_config,
     score,
     starting_tree,
     tree_as_tmp_filename_to_report,
     tree_file) = pasta_prelim_step1(multilocus_dataset, temporaries_dir, user_config)

    prev_signals, restore_to_rna = pasta_prelim_step2(
        multilocus_dataset, options, pasta_products, user_config)

    try:
        (alignment_as_tmp_filename_to_report,
         pasta_config_dict,
         score,
         starting_tree_str,
         tree_as_tmp_filename_to_report) = pasta_prelim_step3(
            multilocus_dataset, options, pasta_config, pasta_products, pasta_team, score,
            alignment_as_tmp_filename_to_report, starting_tree, temporaries_dir,
            tree_as_tmp_filename_to_report, tree_file)

        if options.keeptemp:
            pasta_config_dict['keep_iteration_temporaries'] = True
            if options.keepalignmenttemps:
                pasta_config_dict['keep_realignment_temporaries'] = True

        # A state file always means "resume"; the log message below is keyed
        # on the same flag so that it matches what was actually done.
        resuming = options.resume_state_path is not None

        if resuming:
            import pickle
            # NOTE(security): pickle.load executes arbitrary code embedded in
            # the file -- only resume from state files this program wrote.
            with open(options.resume_state_path, 'rb') as pickled_job_file:
                job = pickle.load(pickled_job_file)
            job.restore_non_pickled(status_messages=MESSENGER.send_info)
        else:
            # Both job flavours are constructed and seeded identically; only
            # the class differs.
            job_class = PastaInterruptibleJob if options.interruptible else PastaJob
            job = job_class(multilocus_dataset=multilocus_dataset,
                            pasta_team=pasta_team,
                            name=options.job,
                            status_messages=MESSENGER.send_info,
                            score=score,
                            **pasta_config_dict)
            if starting_tree is not None:
                job.tree = generate_tree_with_splits_from_tree(starting_tree, force_fully_resolved=True)
            else:
                job.tree_str = starting_tree_str
            job.curr_iter_align_tmp_filename = alignment_as_tmp_filename_to_report
            job.curr_iter_tree_tmp_filename = tree_as_tmp_filename_to_report
            if score is not None:
                job.store_optimum_results(new_multilocus_dataset=multilocus_dataset,
                                          new_tree_str=starting_tree_str,
                                          new_score=score,
                                          curr_timestamp=time.time())

        if options.two_phase:
            MESSENGER.send_info("Exiting with the initial tree because the PASTA algorithm is avoided when the --two-phase option is used.")
        else:
            # Published in the module global while the job runs.
            _RunningJobs = job
            if resuming:
                MESSENGER.send_info("Resuming PASTA from state file %s" % options.resume_state_path)
            else:
                MESSENGER.send_info("Starting PASTA algorithm on initial tree...")

            # The heavy-lifting line:
            aln_list, resumable = job.run(tmp_dir_par=temporaries_dir, pasta_products=pasta_products)
            # Kept as an equality test so only True/1 triggers the pickling
            # path, exactly as before.
            if resumable == True:
                picklepath = job.pasta_products.get_abs_path_for_iter_output(
                    job.current_iteration, 'picklefile', allow_existing=False)
                MESSENGER.send_info("Pickling PastaJob to file:\n%s\nPlease run the alignment jobs in the following comma-delimited file and resume:%s\n" % (picklepath, aln_list))
                job.clean_for_pickling()
                import pickle
                with open(picklepath, 'wb') as pickle_file:
                    pickle.dump(job, pickle_file)
                # The finally clause below still runs on SystemExit.
                sys.exit(0)
            _RunningJobs = None

            alignment_as_tmp_filename_to_report, tree_as_tmp_filename_to_report = pasta_postproc_step1(
                alignment_as_tmp_filename_to_report, job, options, pasta_config, pasta_config_dict, pasta_products,
                pasta_team, temporaries_dir, tree_as_tmp_filename_to_report, user_config)

        pasta_postproc_step2(alignment_as_tmp_filename_to_report, job, pasta_products, restore_to_rna,
                             tree_as_tmp_filename_to_report, user_config)

    finally:
        stop_worker()
        # Put back every handler recorded in prev_signals; a recorded None
        # is mapped to the default disposition.
        for sig, prev_handler in prev_signals:
            if prev_handler is None:
                signal.signal(sig, signal.SIG_DFL)
            else:
                signal.signal(sig, prev_handler)