Example #1
    def generate_kmer_features(self):
        my_log = logging.getLogger('train:generate_kmers')
        fasta2kmers_command = utils.fasta2kmers_base(self.config, self.workingdir)
        fasta2kmers2_command = utils.fasta2kmers2_base(self.config, self.workingdir)

        use_fasta2kmers2 = int(self.config.settings["kmer_normalization"]) <= 1

        if use_fasta2kmers2:
            my_log.debug('basic command for generating kmers:\n{} -i INPUT -f OUTPUT'.format(fasta2kmers2_command))
        else:
            my_log.debug('basic command for generating kmers:\n{} INPUT >> OUTPUT'.format(fasta2kmers_command))

        tasks_by_fl = {}
        max_entries = 0
        for fl in self.config.settings["fragment_len"]:
            fl_tasks = []
            p = "{d}{sep}sampled_fasta{sep}{fl}".format(d=self.config.settings["project_dir"], fl=str(fl), sep=os.path.sep)
            outfile = "{d}{sep}train_data{sep}{fl}.sl".format(d=self.config.settings["project_dir"], fl=str(fl), sep=os.path.sep)

            files = os.listdir(p)
            for f in files:
                fastafile = os.path.join(p, f)
                if use_fasta2kmers2:
                    file_cmd = "{cmd} -i {fasta} -f {out}".format(cmd=fasta2kmers2_command, out=outfile, fasta=fastafile)
                else:
                    file_cmd = "{cmd} {fasta} >> {out}".format(cmd=fasta2kmers_command, out=outfile, fasta=fastafile)

                if self.config.settings["processors"] == 1:
                    s = os.system(file_cmd)
                    if s != 0:
                        my_log.critical("problem generating kmers:\n{}".format(file_cmd))
                        sys.exit(1)
                else:
                    fl_tasks.append(parallel.TaskCmd(file_cmd))  # note: all tasks for this fragment length write to the same output file

            if self.config.settings["processors"] > 1:
                tasks_by_fl[fl] = fl_tasks
                if len(fl_tasks) > max_entries:
                    max_entries = len(fl_tasks)

        if self.config.settings["processors"] > 1:
            # run commands in parallel, but only if they do not write to the same file;
            # this limits us to at most len(fragment_len) commands running in parallel.
            # For more parallelism, concatenate the files and use generate_kmer_features_concat() instead.
            for i in range(max_entries):
                batch = []  # at most one task per fragment length, so no two tasks in a batch share an output file
                for task_list in tasks_by_fl.values():
                    if i < len(task_list):
                        batch.append(task_list[i])
                if parallel.reportFailedCmd(parallel.runCmdParallel(batch, self.config.settings["processors"])) is not None:  # Ivan change
                    sys.exit(-1)  # Ivan change
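
The parallel branch above can schedule at most one task per fragment length in each round, because every task for a given fragment length appends to the same output file. Below is a minimal, self-contained sketch of that round-robin batching; the helper name is hypothetical and not part of the original module.

# Sketch of the round-robin batching used in generate_kmer_features(): the i-th
# task of every fragment-length list forms one batch, so two tasks that append
# to the same file never run concurrently. Hypothetical helper for illustration.
def round_robin_batches(tasks_by_key):
    max_entries = max((len(tasks) for tasks in tasks_by_key.values()), default=0)
    for i in range(max_entries):
        yield [tasks[i] for tasks in tasks_by_key.values() if i < len(tasks)]

# e.g. {1000: ["a1", "a2"], 3000: ["b1"]} yields ["a1", "b1"], then ["a2"]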
Example #2
    def generate_kmer_features_concat(self):
        my_log = logging.getLogger('train:generate_kmers')
        fasta2kmers_command = utils.fasta2kmers_base(self.config, self.workingdir)
        fasta2kmers2_command = utils.fasta2kmers2_base(self.config, self.workingdir)

        use_fasta2kmers2 = int(self.config.settings["kmer_normalization"]) <= 1

        if use_fasta2kmers2:
            my_log.debug('basic command for generating kmers:\n{} -i INPUT -f OUTPUT'.format(fasta2kmers2_command))
        else:
            my_log.debug('basic command for generating kmers:\n{} INPUT >> OUTPUT'.format(fasta2kmers_command))

        tasks = []
        for fl in self.config.settings["fragment_len"]:
            p = "{d}{sep}sampled_fasta{sep}{fl}".format(d=self.config.settings["project_dir"], fl=str(fl), sep=os.path.sep)
            combined_fasta = os.path.join(p, "{}.all.fna".format(fl))
            outfile = "{d}{sep}train_data{sep}{fl}.sl".format(d=self.config.settings["project_dir"], fl=str(fl), sep=os.path.sep)
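            # concatenate all sampled fragments into one combined FASTA; write to a .tmp file first
            # so the *.fna glob does not pick up the combined file itself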
            os.system("cat {dir}{sep}*.fna > {dir}{sep}{fl}.all.tmp".format(dir=p, fl=fl, sep=os.path.sep))
            os.system("rm {dir}{sep}*.fna".format(dir=p, sep=os.path.sep))
            os.system("mv {p}{sep}{fl}.all.tmp {combined}".format(p=p, fl=fl, combined=combined_fasta, sep=os.path.sep))

            # inefficient alternative, kept for reference:
            # files = os.listdir(p)
            # s = os.system("mv {file0} {combined}".format(file0=os.path.join(p, files[0]), combined=combined_fasta))
            # if s != 0:
            #     sys.stderr.write("problem with moving file {}\n".format(os.path.join(p,files[0])))
            #     sys.exit(1)
            #
            # for f in files[1:]:
            #     s = os.system("cat {combined} {f} >> {combined}.tmp".format(dir=p, combined=combined_fasta, f=os.path.join(p, f)))
            #     if s != 0:
            #         sys.stderr.write("Problem with concatenating files in sampled_fasta/{fl}\n".format(fl=fl))
            #     os.system("mv {c}.tmp {c}".format(c=combined_fasta))
            #     os.remove(os.path.join(p, f))

            if use_fasta2kmers2:
                command = "{cmd} -i {combined} -f {out}".format(cmd=fasta2kmers2_command, combined=combined_fasta, out=outfile)
            else:
                command = "{cmd} {combined} >> {out}".format(cmd=fasta2kmers_command, combined=combined_fasta, out=outfile)

            if self.config.settings["processors"] > 1:
                tasks.append(parallel.TaskCmd(command))
            else:
                s = os.system(command)
                if s != 0:
                    my_log.critical("problem generating kmers:\n{}".format(command))
                    sys.exit(1)

        if self.config.settings["processors"] > 1:
            if parallel.reportFailedCmd(parallel.runCmdParallel(tasks, self.config.settings["processors"])) is not None:  # Ivan change
                sys.exit(-1)  # Ivan change
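
The concatenation above shells out to cat, rm and mv. A hedged sketch of doing the same merge in pure Python is shown below; the helper is hypothetical, mirrors the write-to-.tmp-then-rename pattern of the method, and is not part of the original module.

# Sketch: merge every *.fna in a directory into one combined FASTA without the
# shell, streaming through a .tmp file and renaming at the end, as above.
import glob
import os
import shutil

def concat_fasta_dir(directory, combined_path):
    tmp_path = combined_path + ".tmp"
    with open(tmp_path, "wb") as out:
        for fname in sorted(glob.glob(os.path.join(directory, "*.fna"))):
            with open(fname, "rb") as src:
                shutil.copyfileobj(src, out)  # stream the fragment file into the combined file
            os.remove(fname)  # corresponds to the rm in the original
    os.rename(tmp_path, combined_path)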
Example #3
    def predict(self):
        sys.stdout.write("Predicting...\n")
        outfiles = set()

        # if commands should be run in parallel, store them in a list for each output file
        # run each list in parallel afterwards
        command_by_outputfile = {}

        for index in self.classifier.keys():
            if len(self.classifier[index]) == 0:
                continue
            self.classify(index, outfiles, command_by_outputfile)

        if self.config.settings["processors"] != 1:
            for commandlist in command_by_outputfile.values():
                if parallel.reportFailedCmd(parallel.runCmdParallel(commandlist, maxProc=self.config.settings["processors"])) is not None:  # Ivan change
                    sys.exit(-1)  # Ivan change
        # join all outputs
        self.combined_output_file = "{}.out".format(self.fastafile)
        utils.concat_files(outfiles, self.combined_output_file)
Example #4
    def build_models(self):
        my_log = logging.getLogger('train:build_models')
        # now that the training data is ready, build the models;
        # if the C grid contains only a single value, skip cross-validation and build the models directly
        # kernel options
        kernel_opt = "-t {t} -g {g} -d {d} -s {s}".format(t=str(self.config.settings["kernel"]),
                                                           g=str(self.config.settings["kernel_rbf_gamma"]),
                                                           d=str(self.config.settings["kernel_polynomial_degree"]),
                                                           s=str(self.config.settings["kernel_polynomial_s"]))
        loss_opt = "-l {l} --L {L}".format(l=str(self.loss_function),
                                            L=str(self.config.settings["loss_action"]))
        other_opt = "--z {z} --v {v} --t {t}".format(z=str(self.z_standardization),
                                                      v=str(self.misc_nodes),
                                                      t=self.tree_file)
        learn_command = "{bin} {kernel} {loss} {other} " \
                        "-v 1 -o 2".format(bin=utils.path_to_binary(self.workingdir, "svm_phylo_learn"),
                                           kernel=kernel_opt,
                                           loss=loss_opt,
                                           other=other_opt)
        cv_command = "{bin} {kernel} {loss} {other} " \
                     "-x 3 -v 1 -o 2 --r 1 --S 1".format(bin=utils.path_to_binary(self.workingdir, "svm_phylo_cv"),
                                                         kernel=kernel_opt,
                                                         loss=loss_opt,
                                                         other=other_opt)
        if self.config.settings["balance_classes"]:
            learn_command = "{} --c 1".format(learn_command)
            cv_command = "{} --c 1".format(cv_command)

        my_log.debug('basic crossvalidation command:\n{} -c CVAL KMER_FILE'.format(cv_command))
        my_log.debug('basic learning command:\n{} -c CVAL KMER_FILE MODEL_FILE'.format(learn_command))

        tasks = []  # only needed when running in parallel

        if len(self.config.settings["c_grid"]) == 1:
            c_val = self.config.settings["c_grid"][0]
            for fl in self.config.settings["fragment_len"]:
                learn_command_final = "{cmd} -c {cval} " \
                                      "{p}{sep}train_data{sep}{fl}.sl " \
                                      "{p}{sep}models{sep}{fl}_c{cval}.svm".format(cmd=learn_command,
                                                                                   sep=os.path.sep,
                                                                                   cval=c_val,
                                                                                   p=self.config.settings["project_dir"],
                                                                                   fl=fl)
                if self.config.settings["processors"] == 1:
                    my_log.info("build {fl} length model with c={cval}".format(fl=fl, cval=c_val))
                    s = os.system(learn_command_final)
                    if s != 0:
                        my_log.critical("something went wrong with building the model:\n{}".format(learn_command_final))
                        sys.exit(1)
                else:
                    tasks.append(parallel.TaskCmd(learn_command_final))

        else:
            # crossvalidation
            for fl in self.config.settings["fragment_len"]:
                my_log.info("Cross-validatong {} length model.".format(fl))
                cv_loss = []
                cv_zeroone = []
                for c_val in self.config.settings["c_grid"]:
                    my_log.debug("c={}".format(c_val))
                    cv_command_final = "{cmd} -c {cval} " \
                                       "{p}{sep}train_data{sep}{fl}.sl".format(cmd=cv_command,
                                                                               sep=os.path.sep,
                                                                               cval=c_val,
                                                                               p=self.config.settings["project_dir"],
                                                                               fl=fl)
                    # run the cross-validation command and capture its output so the reported losses can be parsed
                    fr = os.popen(cv_command_final)
                    lines = fr.readlines()
                    fr.close()
                    floating_point = re.compile(r'\d+\.\d+')
                    for line in lines:
                        if "Average loss in cross-validation" in line:
                            loss = floating_point.findall(line)
                            try:
                                loss = float(loss[0])
                                cv_loss.append(loss)
                            except IndexError:
                                continue
                        if "one-error in cross-validation" in line:
                            loss = floating_point.findall(line)
                            try:
                                loss = float(loss[0])
                                cv_zeroone.append(loss)
                            except IndexError:
                                continue

                if len(cv_loss) != len(self.config.settings["c_grid"]):
                    my_log.critical("Error, something went wrong with cross-validation "
                                    "of {} length fragment model. Quitting".format(fl))   # exit or continue?
                    sys.exit(1)
                my_log.debug("C grid: " + utils.any_list_to_string(self.config.settings["c_grid"]))
                my_log.debug("CV loss: " + utils.any_list_to_string(cv_loss))
                my_log.debug("CV 0-1: " + utils.any_list_to_string(cv_zeroone))

                loss_min = min(cv_loss)
                i = cv_loss.index(loss_min)
                # build model for minimum loss
                c_val = self.config.settings["c_grid"][i]
                learn_command_final = "{cmd} -c {cval} " \
                                      "{p}{sep}train_data{sep}{fl}.sl " \
                                      "{p}{sep}models{sep}{fl}_c{cval}.svm".format(cmd=learn_command,
                                                                                   sep=os.path.sep,
                                                                                   cval=c_val,
                                                                                   p=self.config.settings["project_dir"],
                                                                                   fl=fl)
                my_log.info("build {fl} length model with c={cval} and CV-loss={loss}".format(fl=fl,
                                                                                                cval=c_val,
                                                                                                loss=loss_min))
                if self.config.settings["processors"] == 1:
                    s = os.system("{}".format(learn_command_final))
                    if s != 0:
                        my_log.critical("something went wrong with building the model:\n{}".format(learn_command_final))
                        sys.exit(1)
                else:
                    tasks.append(parallel.TaskCmd(learn_command_final))

        if self.config.settings["processors"] > 1:
            my_log.info("building models in parallel...")
            if parallel.reportFailedCmd(parallel.runCmdParallel(tasks, self.config.settings["processors"])) is not None:  # Ivan change
                sys.exit(-1)  # Ivan change
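
The cross-validation branch has to capture the solver's stdout in order to parse the reported loss values. Below is a hedged sketch of that capture-and-parse step using subprocess; the output line format is an assumption based on the substring checks and regex in the method above, and the helper name is hypothetical.

# Sketch: run a cross-validation command, capture its output, and return the
# average loss it reports. Assumes the output contains a line with
# "Average loss in cross-validation" followed by a floating point number,
# as in the parsing loop above. Not part of the original module.
import re
import subprocess

def run_cv_and_parse_loss(cv_command):
    result = subprocess.run(cv_command, shell=True,
                            stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                            universal_newlines=True)
    floating_point = re.compile(r'\d+\.\d+')
    for line in result.stdout.splitlines():
        if "Average loss in cross-validation" in line:
            matches = floating_point.findall(line)
            if matches:
                return float(matches[0])
    return None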