def read(self, input_file, filetype="fasta", verbose=False):
     list_of_files = FileRoutines.make_list_of_path_to_files(input_file)
     for filename in list_of_files:
         if verbose:
             print("Parsing %s ..." % filename)
         directory, basename, extension = FileRoutines.split_filename(filename)
         try:
             self.records[basename] = MultipleAlignmentStatRecord(basename, alignment=AlignIO.read(filename, filetype))
             self.record_id_list.append(basename)
         except:
             raise ValueError("ERROR: Issues while parsing or calculating stats for %s!!!" % filename)
     # collectiontype-dependent function
     pass
Beispiel #2
0
    def parallel_positive_selection_test(self,
                                         in_dir,
                                         tree_file,
                                         out_dir,
                                         results_file,
                                         seq_type="codons",
                                         codon_frequency="F3X4",
                                         noisy=3,
                                         verbose="concise",
                                         runmode=0,
                                         clock=0,
                                         aminoacid_distance=None,
                                         genetic_code=0,
                                         fix_kappa=False,
                                         kappa=5,
                                         getSE=0,
                                         RateAncestor=0,
                                         small_difference=0.000001,
                                         clean_data=True,
                                         method=0):
        """
        This function implements positive selection test (branch-site model)
        for branch labeled in tree file using model_A vs model_A_null(omega fixed to 1) comparison
        """

        FileRoutines.safe_mkdir(out_dir)
        alignment_files_list = FileRoutines.make_list_of_path_to_files(in_dir)
        tree_file_abs_path = os.path.abspath(tree_file)
        options_list = []
        dir_list = []
        basename_dir_list = []
        model_list = ["Model_A", "Model_A_null"]
        fix_omega_dict = {"Model_A": False, "Model_A_null": True}

        for filename in alignment_files_list:
            directory, basename, extension = FileRoutines.split_filename(
                filename)
            filename_out_dir = os.path.abspath("%s/%s/" % (out_dir, basename))
            basename_dir_list.append(basename)
            FileRoutines.safe_mkdir(filename_out_dir)

            for model in model_list:
                model_dir = "%s/%s/" % (filename_out_dir, model)
                FileRoutines.safe_mkdir(model_dir)
                out_file = "%s/%s/%s.out" % (filename_out_dir, model, basename)
                ctl_file = "%s/%s/%s.ctl" % (filename_out_dir, model, basename)

                options_list.append("%s.ctl" % basename)
                dir_list.append(model_dir)

                self.generate_ctl_file(os.path.abspath(filename),
                                       tree_file_abs_path,
                                       out_file,
                                       ctl_file,
                                       seq_type=seq_type,
                                       codon_frequency=codon_frequency,
                                       noisy=noisy,
                                       verbose=verbose,
                                       runmode=runmode,
                                       clock=clock,
                                       aminoacid_distance=aminoacid_distance,
                                       model=2,
                                       nssites=2,
                                       genetic_code=genetic_code,
                                       fix_kappa=fix_kappa,
                                       kappa=kappa,
                                       fix_omega=fix_omega_dict[model],
                                       omega=1,
                                       getSE=getSE,
                                       RateAncestor=RateAncestor,
                                       Mgene=0,
                                       small_difference=small_difference,
                                       clean_data=clean_data,
                                       method=method)

        self.parallel_execute(options_list, dir_list=dir_list)

        results_dict = OrderedDict()
        double_delta_dict = OrderedDict()
        raw_pvalues_dict = OrderedDict()
        raw_pvalues_list = []

        for basename in basename_dir_list:
            results_dict[basename] = OrderedDict()
            for model in model_list:
                output_file = "%s/%s/%s/%s.out" % (out_dir, basename, model,
                                                   basename)
                codeml_report = CodeMLReport(output_file)
                results_dict[basename][model] = codeml_report.LnL

        skipped_genes_set = set()
        for basename in basename_dir_list:
            for model in model_list:
                if results_dict[basename][model] is None:
                    print("LnL was not calculated for %s" % basename)
                    skipped_genes_set.add(basename)
                    break
            else:
                doubled_delta = 2 * (results_dict[basename]["Model_A"] -
                                     results_dict[basename]["Model_A_null"])
                p_value = chisqprob(doubled_delta, 1)  # degrees of freedom = 1

                double_delta_dict[basename] = doubled_delta
                raw_pvalues_dict[basename] = p_value
                raw_pvalues_list.append(p_value)

        adjusted_pvalues_list = fdrcorrection0(raw_pvalues_list)[1]
        #print adjusted_pvalues_list
        i = 0
        with open(results_file, "w") as out_fd:
            out_fd.write(
                "id\tmodel_a_null,LnL\tmodel_a,LnL\t2*delta\traw p-value\tadjusted p-value\n"
            )
            for basename in basename_dir_list:
                for model in model_list:
                    if results_dict[basename][model] is None:
                        print("LnL was not calculated for %s" % basename)
                        break
                else:
                    #doubled_delta = 2 * (results_dict[basename]["Model_A"] - results_dict[basename]["Model_A_null"])
                    #p_value = chisqprob(doubled_delta, 1) # degrees of freedom = 1

                    #print basename, results_dict[basename]["Model_A_null"],results_dict[basename]["Model_A"], double_delta_dict[basename], raw_pvalues_dict[basename], adjusted_pvalues_list[i]

                    out_fd.write(
                        "%s\t%f\t%f\t%f\t%f\t%f\n" %
                        (basename, results_dict[basename]["Model_A_null"],
                         results_dict[basename]["Model_A"],
                         double_delta_dict[basename],
                         raw_pvalues_dict[basename], adjusted_pvalues_list[i]))
                    i += 1
Beispiel #3
0
    def parallel_codeml(self,
                        in_dir,
                        tree_file,
                        out_dir,
                        seq_type="codons",
                        codon_frequency="F3X4",
                        noisy=0,
                        verbose="concise",
                        runmode=0,
                        clock=0,
                        aminoacid_distance=None,
                        model=1,
                        nssites=0,
                        genetic_code=0,
                        fix_kappa=False,
                        kappa=5,
                        fix_omega=False,
                        omega=0.2,
                        getSE=0,
                        RateAncestor=0,
                        small_difference=0.000001,
                        clean_data=True,
                        method=0,
                        Mgene=None):

        FileRoutines.safe_mkdir(out_dir)
        alignment_files_list = FileRoutines.make_list_of_path_to_files(in_dir)
        tree_file_abs_path = os.path.abspath(tree_file)
        options_list = []
        dir_list = []
        for filename in alignment_files_list:
            directory, basename, extension = FileRoutines.split_filename(
                filename)
            filename_out_dir = os.path.abspath("%s/%s/" % (out_dir, basename))
            out_file = "%s/%s.out" % (filename_out_dir, basename)
            ctl_file = "%s/%s.ctl" % (filename_out_dir, basename)

            options_list.append(ctl_file)
            dir_list.append(filename_out_dir)
            FileRoutines.safe_mkdir(filename_out_dir)
            self.generate_ctl_file(os.path.abspath(filename),
                                   tree_file_abs_path,
                                   out_file,
                                   ctl_file,
                                   seq_type=seq_type,
                                   codon_frequency=codon_frequency,
                                   noisy=noisy,
                                   verbose=verbose,
                                   runmode=runmode,
                                   clock=clock,
                                   aminoacid_distance=aminoacid_distance,
                                   model=model,
                                   nssites=nssites,
                                   genetic_code=genetic_code,
                                   fix_kappa=fix_kappa,
                                   kappa=kappa,
                                   fix_omega=fix_omega,
                                   omega=omega,
                                   getSE=getSE,
                                   RateAncestor=RateAncestor,
                                   Mgene=Mgene,
                                   small_difference=small_difference,
                                   clean_data=clean_data,
                                   method=method)
        self.parallel_execute(options_list, dir_list=dir_list)