Exemple #1
0
    def classify(self, index, outfiles, command_by_outputfile):
        """Build and run (or queue) the classification command for one fragment length.

        The fragment length is ``settings["fragment_len"][index]``. Models for
        up to ``settings["n_classifiers"]`` consecutive fragment lengths,
        starting at ``index``, are used; with exactly one model the plain
        classifier binary is invoked, otherwise the ensemble binary.

        :param index: position in ``settings["fragment_len"]`` to classify for.
        :param outfiles: set-like collection; the output file name is added to it.
        :param command_by_outputfile: dict mapping output file name to a list of
            ``parallel.TaskCmd``; filled only when ``settings["processors"] != 1``.

        When ``settings["processors"] == 1`` the command is executed right away
        via ``os.system`` and the process exits with status 1 on failure.
        """
        settings = self.config.settings

        # Both command prefixes are resolved up front, whichever one ends up being used.
        single_prefix = "{} -v 0 ".format(utils.path_to_binary(self.workingdir, "svm_phylo_classify"))
        ensemble_prefix = "{} -v 0 -e 1 ".format(utils.path_to_binary(self.workingdir, "svm_phylo_classify_ensemble"))

        fragment_lens = settings["fragment_len"]
        fl = fragment_lens[index]
        test_file = "{fasta}.{fl}.sl".format(fasta=self.fastafile, fl=fl)
        out_file = "{fasta}.{fl}.out".format(fasta=self.fastafile, fl=fl)
        outfiles.add(out_file)

        # Window of fragment lengths whose models take part in this prediction.
        window_end = min(index + settings["n_classifiers"], len(fragment_lens))
        classifiers_to_use = fragment_lens[index:window_end]

        sys.stdout.write("\tfragments close to length {fl} with classifiers {c}\n".format(
            fl=fl, c=",".join(str(length) for length in classifiers_to_use)))

        n_models = len(classifiers_to_use)
        if n_models == 1:
            final_command = "{classifier}{extra}".format(
                classifier=single_prefix,
                extra="{test} {model} {out}".format(test=test_file, model=self.models[fl], out=out_file))
        else:
            model_args = "".join("{model} ".format(model=self.models[length]) for length in classifiers_to_use)
            ensemble_args = "-m {n} {test} ".format(n=n_models, test=test_file) + model_args
            ensemble_args = "{cmd}{out} ".format(cmd=ensemble_args, out=out_file)
            final_command = "{cmd}{ext}".format(cmd=ensemble_prefix, ext=ensemble_args)

        if settings["processors"] != 1:
            # Parallel mode: only record the task; the caller is expected to run it.
            sys.stdout.write("running predictions in parallel\n")
            command_by_outputfile.setdefault(out_file, []).append(parallel.TaskCmd(final_command))
            return

        if os.system(final_command) != 0:
            sys.stderr.write("Error in classification: couldn't run the system command.\n")
            sys.exit(1)
Exemple #2
0
    def generate_kmers(self):
        """Generate the k-mer feature file ``<fastafile>.sl`` from the filtered FASTA.

        Two external tools are supported. ``fasta2kmers2`` is used when
        ``settings["kmer_normalization"]`` (as an int) is <= 1; otherwise
        ``fasta2kmers`` is used and its output is piped through ``sed`` to
        prefix every line with ``"1 "``.

        With a single configured k-mer length that value is passed as ``-k``
        (and also as ``-j`` for ``fasta2kmers2``); with several, the maximum is
        passed as ``-k`` and the minimum as ``-j``.

        Exits the process with status 1 if the chosen command returns non-zero.
        """
        settings = self.config.settings
        kmers = settings["kmer"]
        sys.stdout.write("\nGenerating k-mer features ({})...\n".format("-".join(str(k) for k in kmers)))

        fasta2kmers_command = "{script} -s 1 -o 1 -h 1 -l 0 -C {rm} -t {n} -r {rev}".format(
            script=utils.path_to_binary(self.workingdir, "fasta2kmers"),
            rm=settings["rm_rev_complement"],
            n=settings["kmer_normalization"],
            rev=settings["rev_complement"])
        fasta2kmers2_command = "{script} -a w -s 1 -l 2 -o 1 -b 1 " \
                               "-R {rm} -h 1 " \
                               "-n {n} " \
                               "-r {rev}".format(script=utils.path_to_binary(self.workingdir, "fasta2kmers2"),
                                                  rm=settings["rm_rev_complement"],
                                                  n=settings["kmer_normalization"],
                                                  rev=settings["rev_complement"])

        if len(kmers) == 1:
            k = kmers[0]
            fasta2kmers_command = "{cmd} -k {kmer}".format(cmd=fasta2kmers_command, kmer=k)
            fasta2kmers2_command = "{cmd} -k {kmer} -j {kmer}".format(cmd=fasta2kmers2_command, kmer=k)
        else:
            k = max(kmers)
            j = min(kmers)
            fasta2kmers_command = "{cmd} -k {k} -j {j}".format(cmd=fasta2kmers_command, k=k, j=j)
            fasta2kmers2_command = "{cmd} -k {k} -j {j}".format(cmd=fasta2kmers2_command, k=k, j=j)

        fasta2kmers_command_final = "{cmd} {input} | sed 's/^/1 /' > {output}.sl".format(
            cmd=fasta2kmers_command,
            input=self.fastafile_filtered,
            output=self.fastafile)
        fasta2kmers2_command_final = "{cmd} -i {input} -f {output}.sl".format(
            cmd=fasta2kmers2_command,
            input=self.fastafile_filtered,
            output=self.fastafile)

        # Pick the command once so the command that is run and the command
        # reported on failure are always the same string.
        if int(settings["kmer_normalization"]) <= 1:
            cmd = fasta2kmers2_command_final
        else:
            cmd = fasta2kmers_command_final

        s = os.system(cmd)
        if s != 0:
            sys.stderr.write("Generating kmers failed with command:\n{cmd}".format(cmd=cmd))
            sys.exit(1)
        sys.stdout.write("done\n")
Exemple #3
0
    def build_models(self):
        """Train one SVM model per configured fragment length.

        If ``settings["c_grid"]`` holds a single value, a model is trained with
        that C for every entry of ``settings["fragment_len"]``. Otherwise, for
        each fragment length the cross-validation binary is run once per C
        value, the reported average loss is parsed from its output, and the
        model is trained with the C that gave the smallest loss.

        Training commands are executed immediately via ``os.system`` when
        ``settings["processors"] == 1``; otherwise they are collected as
        ``parallel.TaskCmd`` objects and run at the end. Cross-validation
        commands are always run one after another.

        Exits the process with status 1 if a training command fails or if the
        cross-validation output does not yield one loss value per C, and with
        status -1 if the parallel run reports a failed command.
        """
        my_log = logging.getLogger('train:build_models')
        settings = self.config.settings

        # Command-line options shared by the learning and cross-validation binaries.
        kernel_opt = "-t {t} -g {g} -d {d} -s {s}".format(t=str(settings["kernel"]),
                                                           g=str(settings["kernel_rbf_gamma"]),
                                                           d=str(settings["kernel_polynomial_degree"]),
                                                           s=str(settings["kernel_polynomial_s"]))
        loss_opt = "-l {l} --L {L}".format(l=str(self.loss_function),
                                            L=str(settings["loss_action"]))
        other_opt = "--z {z} --v {v} --t {t}".format(z=str(self.z_standardization),
                                                      v=str(self.misc_nodes),
                                                      t=self.tree_file)
        learn_command = "{bin} {kernel} {loss} {other} " \
                        "-v 1 -o 2".format(bin=utils.path_to_binary(self.workingdir, "svm_phylo_learn"),
                                           kernel=kernel_opt,
                                           loss=loss_opt,
                                           other=other_opt)
        cv_command = "{bin} {kernel} {loss} {other} " \
                     "-x 3 -v 1 -o 2 --r 1 --S 1".format(bin=utils.path_to_binary(self.workingdir, "svm_phylo_cv"),
                                                         kernel=kernel_opt,
                                                         loss=loss_opt,
                                                         other=other_opt)
        if settings["balance_classes"]:
            learn_command = "{} --c 1".format(learn_command)
            cv_command = "{} --c 1".format(cv_command)

        my_log.debug('basic crossvalidation command:\n{} -c CVAL KMER_FILE'.format(cv_command))
        my_log.debug('basic learning command:\n{} -c CVAL KMER_FILE MODEL_FILE'.format(learn_command))

        def learn_command_for(fl, c_val):
            # Full training command: reads train_data/<fl>.sl, writes models/<fl>_c<c>.svm.
            return ("{cmd} -c {cval} "
                    "{p}{sep}train_data{sep}{fl}.sl "
                    "{p}{sep}models{sep}{fl}_c{cval}.svm").format(cmd=learn_command,
                                                                  sep=os.path.sep,
                                                                  cval=c_val,
                                                                  p=settings["project_dir"],
                                                                  fl=fl)

        tasks = []  # only needed when running in parallel

        if len(settings["c_grid"]) == 1:
            # No grid to search: train directly with the single C value.
            c_val = settings["c_grid"][0]
            for fl in settings["fragment_len"]:
                learn_command_final = learn_command_for(fl, c_val)
                if settings["processors"] == 1:
                    my_log.info("build {fl} length model with c={cval}".format(fl=fl, cval=c_val))
                    s = os.system(learn_command_final)
                    if s != 0:
                        my_log.critical("something went wrong with building the model:\n{}".format(learn_command_final))
                        sys.exit(1)
                else:
                    tasks.append(parallel.TaskCmd(learn_command_final))

        else:
            # crossvalidation
            floating_point = re.compile(r'\d+\.\d+')  # compiled once for all CV runs
            for fl in settings["fragment_len"]:
                my_log.info("Cross-validatong {} length model.".format(fl))
                cv_loss = []
                cv_zeroone = []
                for c_val in settings["c_grid"]:
                    my_log.debug("c={}".format(c_val))
                    cv_command_final = "{cmd} -c {cval} " \
                                       "{p}{sep}train_data{sep}{fl}.sl".format(cmd=cv_command,
                                                                               sep=os.path.sep,
                                                                               cval=c_val,
                                                                               p=settings["project_dir"],
                                                                               fl=fl)
                    # Capture the command's standard output so the loss lines
                    # can be parsed. Reading (and closing) sys.stdin here would
                    # never see that output and would break the next iteration.
                    # NOTE(review): assumes the CV tool prints its summary to
                    # stdout, not stderr - confirm.
                    with os.popen(cv_command_final) as cv_output:
                        lines = cv_output.readlines()

                    for line in lines:
                        # Only the first decimal number on a summary line is used;
                        # lines without one are ignored.
                        match = floating_point.search(line)
                        if match is None:
                            continue
                        value = float(match.group())
                        if "Average loss in cross-validation" in line:
                            cv_loss.append(value)
                        if "one-error in cross-validation" in line:
                            cv_zeroone.append(value)

                # Exactly one loss value is required per C, otherwise the
                # index-based lookup of the best C below would be wrong.
                if len(cv_loss) != len(settings["c_grid"]):
                    my_log.critical("Error, something went wrong with cross-validation "
                                    "of {} length fragment model. Quitting".format(fl))   # exit or continue?
                    sys.exit(1)
                my_log.debug("C grid: " + utils.any_list_to_string(settings["c_grid"]))
                my_log.debug("CV loss: " + utils.any_list_to_string(cv_loss))
                my_log.debug("CV 0-1: " + utils.any_list_to_string(cv_zeroone))

                # build model for minimum loss (first C wins on ties)
                loss_min = min(cv_loss)
                c_val = settings["c_grid"][cv_loss.index(loss_min)]
                learn_command_final = learn_command_for(fl, c_val)
                my_log.info("build {fl} length model with c={cval} and CV-loss={loss}".format(fl=fl,
                                                                                                cval=c_val,
                                                                                                loss=loss_min))
                if settings["processors"] == 1:
                    s = os.system("{}".format(learn_command_final))
                    if s != 0:
                        my_log.critical("something went wrong with building the model:\n{}".format(learn_command_final))
                        sys.exit(1)
                else:
                    tasks.append(parallel.TaskCmd(learn_command_final))

        # NOTE(review): tasks are queued whenever processors != 1 but only run
        # when processors > 1; a value below 1 would silently build nothing.
        if settings["processors"] > 1:
            my_log.info("building models in parallel...")
            # reportFailedCmd returning anything other than None is treated as a failure.
            if parallel.reportFailedCmd(parallel.runCmdParallel(tasks, settings["processors"])) is not None:
                sys.exit(-1)