def read(self, input_file, filetype="fasta", verbose=False): list_of_files = FileRoutines.make_list_of_path_to_files(input_file) for filename in list_of_files: if verbose: print("Parsing %s ..." % filename) directory, basename, extension = FileRoutines.split_filename(filename) try: self.records[basename] = MultipleAlignmentStatRecord(basename, alignment=AlignIO.read(filename, filetype)) self.record_id_list.append(basename) except: raise ValueError("ERROR: Issues while parsing or calculating stats for %s!!!" % filename) # collectiontype-dependent function pass
def parallel_positive_selection_test(self, in_dir, tree_file, out_dir, results_file, seq_type="codons", codon_frequency="F3X4", noisy=3, verbose="concise", runmode=0, clock=0, aminoacid_distance=None, genetic_code=0, fix_kappa=False, kappa=5, getSE=0, RateAncestor=0, small_difference=0.000001, clean_data=True, method=0): """ This function implements positive selection test (branch-site model) for branch labeled in tree file using model_A vs model_A_null(omega fixed to 1) comparison """ FileRoutines.safe_mkdir(out_dir) alignment_files_list = FileRoutines.make_list_of_path_to_files(in_dir) tree_file_abs_path = os.path.abspath(tree_file) options_list = [] dir_list = [] basename_dir_list = [] model_list = ["Model_A", "Model_A_null"] fix_omega_dict = {"Model_A": False, "Model_A_null": True} for filename in alignment_files_list: directory, basename, extension = FileRoutines.split_filename( filename) filename_out_dir = os.path.abspath("%s/%s/" % (out_dir, basename)) basename_dir_list.append(basename) FileRoutines.safe_mkdir(filename_out_dir) for model in model_list: model_dir = "%s/%s/" % (filename_out_dir, model) FileRoutines.safe_mkdir(model_dir) out_file = "%s/%s/%s.out" % (filename_out_dir, model, basename) ctl_file = "%s/%s/%s.ctl" % (filename_out_dir, model, basename) options_list.append("%s.ctl" % basename) dir_list.append(model_dir) self.generate_ctl_file(os.path.abspath(filename), tree_file_abs_path, out_file, ctl_file, seq_type=seq_type, codon_frequency=codon_frequency, noisy=noisy, verbose=verbose, runmode=runmode, clock=clock, aminoacid_distance=aminoacid_distance, model=2, nssites=2, genetic_code=genetic_code, fix_kappa=fix_kappa, kappa=kappa, fix_omega=fix_omega_dict[model], omega=1, getSE=getSE, RateAncestor=RateAncestor, Mgene=0, small_difference=small_difference, clean_data=clean_data, method=method) self.parallel_execute(options_list, dir_list=dir_list) results_dict = OrderedDict() double_delta_dict = OrderedDict() raw_pvalues_dict = OrderedDict() raw_pvalues_list = [] for basename in basename_dir_list: results_dict[basename] = OrderedDict() for model in model_list: output_file = "%s/%s/%s/%s.out" % (out_dir, basename, model, basename) codeml_report = CodeMLReport(output_file) results_dict[basename][model] = codeml_report.LnL skipped_genes_set = set() for basename in basename_dir_list: for model in model_list: if results_dict[basename][model] is None: print("LnL was not calculated for %s" % basename) skipped_genes_set.add(basename) break else: doubled_delta = 2 * (results_dict[basename]["Model_A"] - results_dict[basename]["Model_A_null"]) p_value = chisqprob(doubled_delta, 1) # degrees of freedom = 1 double_delta_dict[basename] = doubled_delta raw_pvalues_dict[basename] = p_value raw_pvalues_list.append(p_value) adjusted_pvalues_list = fdrcorrection0(raw_pvalues_list)[1] #print adjusted_pvalues_list i = 0 with open(results_file, "w") as out_fd: out_fd.write( "id\tmodel_a_null,LnL\tmodel_a,LnL\t2*delta\traw p-value\tadjusted p-value\n" ) for basename in basename_dir_list: for model in model_list: if results_dict[basename][model] is None: print("LnL was not calculated for %s" % basename) break else: #doubled_delta = 2 * (results_dict[basename]["Model_A"] - results_dict[basename]["Model_A_null"]) #p_value = chisqprob(doubled_delta, 1) # degrees of freedom = 1 #print basename, results_dict[basename]["Model_A_null"],results_dict[basename]["Model_A"], double_delta_dict[basename], raw_pvalues_dict[basename], adjusted_pvalues_list[i] out_fd.write( "%s\t%f\t%f\t%f\t%f\t%f\n" % (basename, results_dict[basename]["Model_A_null"], results_dict[basename]["Model_A"], double_delta_dict[basename], raw_pvalues_dict[basename], adjusted_pvalues_list[i])) i += 1
def parallel_codeml(self, in_dir, tree_file, out_dir, seq_type="codons", codon_frequency="F3X4", noisy=0, verbose="concise", runmode=0, clock=0, aminoacid_distance=None, model=1, nssites=0, genetic_code=0, fix_kappa=False, kappa=5, fix_omega=False, omega=0.2, getSE=0, RateAncestor=0, small_difference=0.000001, clean_data=True, method=0, Mgene=None): FileRoutines.safe_mkdir(out_dir) alignment_files_list = FileRoutines.make_list_of_path_to_files(in_dir) tree_file_abs_path = os.path.abspath(tree_file) options_list = [] dir_list = [] for filename in alignment_files_list: directory, basename, extension = FileRoutines.split_filename( filename) filename_out_dir = os.path.abspath("%s/%s/" % (out_dir, basename)) out_file = "%s/%s.out" % (filename_out_dir, basename) ctl_file = "%s/%s.ctl" % (filename_out_dir, basename) options_list.append(ctl_file) dir_list.append(filename_out_dir) FileRoutines.safe_mkdir(filename_out_dir) self.generate_ctl_file(os.path.abspath(filename), tree_file_abs_path, out_file, ctl_file, seq_type=seq_type, codon_frequency=codon_frequency, noisy=noisy, verbose=verbose, runmode=runmode, clock=clock, aminoacid_distance=aminoacid_distance, model=model, nssites=nssites, genetic_code=genetic_code, fix_kappa=fix_kappa, kappa=kappa, fix_omega=fix_omega, omega=omega, getSE=getSE, RateAncestor=RateAncestor, Mgene=Mgene, small_difference=small_difference, clean_data=clean_data, method=method) self.parallel_execute(options_list, dir_list=dir_list)