def single_cluster_analysis(cluster_id, cluster_folder, output_folder, temp_folder, outfile_notfound):
    """
    Run the PAML site-model analysis (M1a, M2a, M7 and M8) on one cluster.

    The working dir (temp_folder) is important and must differ from the
    output dir, because different PAML runs at the same time may override
    each other. This is particularly important if running this script in
    more than one processor.

    Parameters:
        cluster_id: name of the cluster; the alignment is read from
            "<cluster_folder>/<cluster_id>.fna".
        cluster_folder: folder containing the cluster .fna files.
        output_folder: folder handed to paml_run.paml_sites for its output.
        temp_folder: working folder handed to the tree, alignment and PAML steps.
        outfile_notfound: object with a write() method; the cluster id
            (plus a newline) is written to it when the cluster file is missing.

    Returns:
        None when the cluster file does not exist, otherwise the list
        [cluster_id, number_sequences, alignment_length,
         rounded M1-vs-M2 p-value, rounded M7-vs-M8 p-value,
         proportion_sites, omega_value].
    """
    cluster_file = cluster_folder + "/" + cluster_id + ".fna"  # Add fna extension

    # Check that the cluster file exists. If it does not, record the id and
    # stop here: nothing below can work without the alignment. The check is
    # done before the (lazy) PAML imports so a missing cluster is cheap.
    if not os.path.exists(cluster_file):
        outfile_notfound.write(cluster_id + "\n")
        return None

    from SelectionAnalysis import paml_stats
    from SelectionAnalysis import paml_prepare
    from SelectionAnalysis import paml_run

    # Make a new tree, no confidence values in the branches
    new_tree = paml_prepare.run_fasttree(cluster_file, temp_folder)

    # Make the new alignment, and get information about the alignment
    new_alignment_file, number_sequences, alignment_length = \
        paml_prepare.adjust_alignment(cluster_file, temp_folder)

    # Run the PAML site models; the result is indexed by model number (1, 2, 7, 8)
    paml_sites_results = paml_run.paml_sites(new_alignment_file, new_tree, output_folder, temp_folder)

    # Calculate the p-values of the two nested comparisons from the lnL values
    # (the last argument, 2, is presumably the degrees of freedom -- confirm in paml_stats.lrt)
    pvalue_m1_m2 = paml_stats.lrt(paml_sites_results[1].get("lnL"),
                                  paml_sites_results[2].get("lnL"), 2)
    pvalue_m7_m8 = paml_stats.lrt(paml_sites_results[7].get("lnL"),
                                  paml_sites_results[8].get("lnL"), 2)

    # Store the omega and proportion of sites, based on the M8 model (site class 10).
    # A TypeError (e.g. a None where a mapping was expected) means the values
    # are not available, so both fall back to 0.
    try:
        proportion_sites = float(paml_sites_results[8]["site_classes"][10]["proportion"])
        omega_value = float(paml_sites_results[8]["site_classes"][10]["omega"])
    except TypeError:
        proportion_sites = 0
        omega_value = 0

    # Store final results
    summary_results = [
        cluster_id,
        number_sequences,
        alignment_length,
        round(pvalue_m1_m2, 3),
        round(pvalue_m7_m8, 3),
        proportion_sites,
        omega_value,
    ]

    # Single-argument print: same output under the Python 2 print statement
    print(summary_results)

    return summary_results
def single_cluster_analysis(cluster_id, cluster_folder, output_folder, temp_folder, outfile_notfound):
    """
    Take the cluster, alignment, tree and folder information and run a PAML
    analysis using the M1a, M2a, M7 and M8 models.

    The working dir is important (different from the output dir), because
    different PAML runs at the same time may override each other. This is
    particularly important if running this script in more than one processor.

    Parameters:
        cluster_id: cluster name, used to build "<cluster_folder>/<cluster_id>.fna".
        cluster_folder: folder where the cluster .fna files live.
        output_folder: output folder passed on to paml_run.paml_sites.
        temp_folder: working folder passed to FastTree, the alignment
            adjustment and PAML.
        outfile_notfound: writable handle; receives "<cluster_id>\\n" when
            the cluster file cannot be found.

    Returns:
        None if the cluster file is missing (nothing is analysed), else
        [cluster_id, number_sequences, alignment_length, p-value M1/M2,
         p-value M7/M8, proportion_sites, omega_value] with both p-values
        rounded to 3 decimals.
    """
    cluster_file = cluster_folder + "/" + cluster_id + ".fna"  # Add fna extension

    # Without the cluster file there is nothing to analyse: log the id and
    # return right away instead of running FastTree/PAML on a missing path.
    if not os.path.exists(cluster_file):
        outfile_notfound.write(cluster_id + "\n")
        return None

    # Imported lazily, only once we know there is work to do
    from SelectionAnalysis import paml_stats
    from SelectionAnalysis import paml_prepare
    from SelectionAnalysis import paml_run

    # Make a new tree, no confidence values in the branches
    new_tree = paml_prepare.run_fasttree(cluster_file, temp_folder)

    # Make the new alignment, and get information about the alignment
    new_alignment_file, number_sequences, alignment_length = \
        paml_prepare.adjust_alignment(cluster_file, temp_folder)

    # Run PAML with the site models
    paml_sites_results = paml_run.paml_sites(new_alignment_file, new_tree, output_folder, temp_folder)

    # Calculate the p-values: M1 vs M2 and M7 vs M8
    # NOTE(review): the trailing 2 looks like the degrees of freedom of the test -- confirm
    pvalue_m1_m2 = paml_stats.lrt(paml_sites_results[1].get("lnL"), paml_sites_results[2].get("lnL"), 2)
    pvalue_m7_m8 = paml_stats.lrt(paml_sites_results[7].get("lnL"), paml_sites_results[8].get("lnL"), 2)

    # Store the omega and proportion of sites, based on the M8 model.
    # Everything stays inside the try so that a TypeError raised at any
    # step of the lookup resets both values to 0.
    try:
        selected_class = paml_sites_results[8]["site_classes"][10]
        proportion_sites = float(selected_class["proportion"])
        omega_value = float(selected_class["omega"])
    except TypeError:
        proportion_sites = 0
        omega_value = 0

    # Store final results
    summary_results = [cluster_id, number_sequences, alignment_length,
                       round(pvalue_m1_m2, 3), round(pvalue_m7_m8, 3),
                       proportion_sites, omega_value]

    # One argument only, so this prints the same thing on Python 2 and 3
    print(summary_results)

    return summary_results
def cluster_analysis(cluster_list, cluster_folder, group_branches, output_folder, temporal_folder, results, no_data,
                     not_found):
    """
    Run the analysis for every cluster in cluster_list: PAML is run for each
    group and the statistics are then computed from its results.

    Nothing is returned; the three list arguments are filled in place:
        results   -- one row per analysed (cluster, group) pair:
                     [cluster, group, number of sequences, alignment length,
                      rounded p-value, proportion of sites, average omega]
        no_data   -- [cluster, group] pairs whose PAML result was None
        not_found -- clusters whose "<cluster_folder>/<cluster>.fna" is missing
    """
    from SelectionAnalysis import paml_stats
    from SelectionAnalysis import paml_prepare

    for cluster in cluster_list:
        fasta_path = cluster_folder + "/" + cluster + ".fna"  # Add fna extension

        # Skip (and remember) clusters without a file on disk
        if not os.path.exists(fasta_path):
            not_found.append(cluster)
            continue

        # Fresh tree without confidence values in the branches
        tree = paml_prepare.run_fasttree(fasta_path, temporal_folder)

        # Adjusted alignment plus its number of sequences and length
        alignment_path, n_sequences, n_columns = paml_prepare.adjust_alignment(fasta_path, temporal_folder)

        # PAML results, one entry per group
        per_group = run_paml_per_group(group_branches, alignment_path, tree, output_folder, temporal_folder)

        for group in per_group:
            group_models = per_group[group]

            # Groups that were not analyzed are only recorded
            if group_models is None:
                no_data.append([cluster, group])
                continue

            model_a = group_models["Ma"]
            pvalue = paml_stats.lrt(model_a.get("lnL"), group_models["M1a"].get("lnL"), 1)

            # Site classes 2 and 3 of model Ma: summed proportions ...
            site_classes = model_a["site_classes"]
            proportion_sites = float(site_classes[2]["proportion"]) + float(site_classes[3]["proportion"])

            # ... and the mean of their foreground-branch values
            foreground_total = float(site_classes[2]["branch types"]["foreground"]) + \
                float(site_classes[3]["branch types"]["foreground"])
            average_omega = foreground_total / 2

            # Cluster, group, Nseqs, length, p-value, proportion, omega
            row = [cluster, group, n_sequences, n_columns, round(pvalue, 3), proportion_sites, average_omega]
            results.append(row)
def cluster_analysis(cluster_list, cluster_folder, group_branches, output_folder, temporal_folder, results, no_data,
                     not_found):
    """
    Analyse each cluster of cluster_list with PAML, group by group, and
    collect the derived statistics.

    The function works through its list arguments and returns nothing:
      * a cluster whose .fna file is absent from cluster_folder is appended
        to not_found;
      * a [cluster, group] pair is appended to no_data when the PAML result
        for that group is None;
      * otherwise a row [cluster, group, number_sequences, alignment_length,
        p-value rounded to 3 decimals, proportion_sites, average_omega] is
        appended to results.
    """
    from SelectionAnalysis import paml_stats
    from SelectionAnalysis import paml_prepare

    for cluster in cluster_list:
        # Alignment file of this cluster (fna extension added)
        source_file = cluster_folder + "/" + cluster + ".fna"

        if os.path.exists(source_file):
            # New tree, no confidence values in the branches
            guide_tree = paml_prepare.run_fasttree(source_file, temporal_folder)

            # New alignment and the information describing it
            adjusted = paml_prepare.adjust_alignment(source_file, temporal_folder)
            aligned_file, seq_count, aln_length = adjusted

            # Run PAML for each branch group with both models
            branch_results = run_paml_per_group(group_branches, aligned_file, guide_tree, output_folder,
                                                temporal_folder)

            for group in branch_results:
                outcome = branch_results[group]

                if outcome is not None:
                    pvalue = paml_stats.lrt(outcome["Ma"].get("lnL"), outcome["M1a"].get("lnL"), 1)

                    # Combined proportion of site classes 2 and 3 in Ma
                    classes = outcome["Ma"]["site_classes"]
                    proportion_sites = float(classes[2]["proportion"]) + float(classes[3]["proportion"])

                    # Mean foreground value over the same two classes
                    average_omega = (float(classes[2]["branch types"]["foreground"]) +
                                     float(classes[3]["branch types"]["foreground"])) / 2

                    # Final record for this cluster/group pair
                    results.append([cluster, group, seq_count, aln_length,
                                    round(pvalue, 3), proportion_sites, average_omega])
                else:
                    # Keep track of what could not be analyzed
                    no_data.append([cluster, group])
        else:
            not_found.append(cluster)