Beispiel #1
0
    def generate_kmer_features(self):
        """Generate k-mer feature files from the sampled fasta files, one input file at a time.

        For every fragment length ``fl`` in ``settings["fragment_len"]``, each file found in
        ``<project_dir>/sampled_fasta/<fl>`` is passed to the k-mer command, with
        ``<project_dir>/train_data/<fl>.sl`` as the output file.

        The command is built from ``utils.fasta2kmers2_base`` when
        ``int(settings["kmer_normalization"]) <= 1`` and from ``utils.fasta2kmers_base``
        otherwise (the latter appends to the output via shell ``>>``).

        With ``settings["processors"] > 1`` the commands are run through
        ``parallel.runCmdParallel``; any other value runs them one after another with
        ``os.system``, matching the behaviour of ``generate_kmer_features_concat``.

        Exits the process with status 1 if a serially executed command returns a non-zero
        status, and with status -1 if ``parallel.reportFailedCmd`` returns anything other
        than ``None`` for a parallel batch.
        """
        my_log = logging.getLogger('train:generate_kmers')
        fasta2kmers_command = utils.fasta2kmers_base(self.config, self.workingdir)
        fasta2kmers2_command = utils.fasta2kmers2_base(self.config, self.workingdir)

        settings = self.config.settings
        use_fasta2kmers2 = int(settings["kmer_normalization"]) <= 1
        # Single switch for the execution mode: everything that is not "more than one
        # processor" runs serially, so a value such as 0 no longer skips all work silently.
        run_parallel = settings["processors"] > 1

        if use_fasta2kmers2:
            my_log.debug('basic command for generating kmers:\n{} -i INPUT -f OUTPUT'.format(fasta2kmers2_command))
        else:
            my_log.debug('basic command for generating kmers:\n{} INPUT >> OUTPUT'.format(fasta2kmers_command))

        tasks_by_fl = {}
        for fl in settings["fragment_len"]:
            p = "{d}{sep}sampled_fasta{sep}{fl}".format(d=settings["project_dir"], fl=str(fl), sep=os.path.sep)
            outfile = "{d}{sep}train_data{sep}{fl}.sl".format(d=settings["project_dir"], fl=str(fl), sep=os.path.sep)

            fl_tasks = []
            for f in os.listdir(p):
                fastafile = os.path.join(p, f)
                if use_fasta2kmers2:
                    file_cmd = "{cmd} -i {fasta} -f {out}".format(cmd=fasta2kmers2_command, out=outfile, fasta=fastafile)
                else:
                    file_cmd = "{cmd} {fasta} >> {out}".format(cmd=fasta2kmers_command, out=outfile, fasta=fastafile)

                if run_parallel:
                    # attention: all tasks of one fragment length write to the same file,
                    # so they are only collected here and scheduled in batches below
                    fl_tasks.append(parallel.TaskCmd(file_cmd))
                    continue

                s = os.system(file_cmd)
                if s != 0:
                    my_log.critical("problem with generating kmers..:\n{}".format(file_cmd))
                    sys.exit(1)

            if run_parallel:
                tasks_by_fl[fl] = fl_tasks

        if not run_parallel:
            return

        # Run commands in parallel, but only those that do not write to the same file:
        # batch i holds the i-th task of every fragment length, so at most
        # len(fragment_len) commands run at once.
        # Alternative: concatenate the inputs and use generate_kmer_features_concat().
        max_entries = max([0] + [len(fl_tasks) for fl_tasks in tasks_by_fl.values()])
        for i in range(max_entries):
            # fragment lengths with fewer input files simply drop out of later batches
            batch = [fl_tasks[i] for fl_tasks in tasks_by_fl.values() if i < len(fl_tasks)]
            if parallel.reportFailedCmd(parallel.runCmdParallel(batch, settings["processors"])) is not None:
                sys.exit(-1)
Beispiel #2
0
    def generate_kmer_features_concat(self):
        """Generate k-mer feature files after merging the sampled fasta files per fragment length.

        For every fragment length ``fl`` in ``settings["fragment_len"]`` all ``*.fna`` files in
        ``<project_dir>/sampled_fasta/<fl>`` are concatenated into ``<fl>.all.fna`` in the same
        directory (the individual files are removed), and one k-mer command is built with
        ``<project_dir>/train_data/<fl>.sl`` as the output file.

        The command is built from ``utils.fasta2kmers2_base`` when
        ``int(settings["kmer_normalization"]) <= 1`` and from ``utils.fasta2kmers_base``
        otherwise (the latter appends to the output via shell ``>>``).

        With ``settings["processors"] > 1`` the per-fragment-length commands are collected and
        run together through ``parallel.runCmdParallel``; otherwise each one is run
        immediately with ``os.system``.

        Exits the process with status 1 if a merge step or a serially executed k-mer command
        returns a non-zero status, and with status -1 if ``parallel.reportFailedCmd`` returns
        anything other than ``None`` for the parallel run.
        """
        my_log = logging.getLogger('train:generate_kmers')
        fasta2kmers_command = utils.fasta2kmers_base(self.config, self.workingdir)
        fasta2kmers2_command = utils.fasta2kmers2_base(self.config, self.workingdir)

        settings = self.config.settings
        use_fasta2kmers2 = int(settings["kmer_normalization"]) <= 1
        run_parallel = settings["processors"] > 1

        if use_fasta2kmers2:
            my_log.debug('basic command for generating kmers:\n{} -i INPUT -f OUTPUT'.format(fasta2kmers2_command))
        else:
            my_log.debug('basic command for generating kmers:\n{} INPUT >> OUTPUT'.format(fasta2kmers_command))

        tasks = []
        for fl in settings["fragment_len"]:
            p = "{d}{sep}sampled_fasta{sep}{fl}".format(d=settings["project_dir"], fl=str(fl), sep=os.path.sep)
            combined_fasta = os.path.join(p, "{}.all.fna".format(fl))
            outfile = "{d}{sep}train_data{sep}{fl}.sl".format(d=settings["project_dir"], fl=str(fl), sep=os.path.sep)

            # Merge via a temporary name that does not match *.fna, so the merged file
            # survives the removal of the individual inputs before it is renamed.
            merge_steps = (
                "cat {dir}{sep}*.fna > {dir}{sep}{fl}.all.tmp".format(dir=p, fl=fl, sep=os.path.sep),
                "rm {dir}{sep}*.fna".format(dir=p, sep=os.path.sep),
                "mv {p}{sep}{fl}.all.tmp {combined}".format(p=p, fl=fl, combined=combined_fasta, sep=os.path.sep),
            )
            for merge_cmd in merge_steps:
                # Stop at the first failing step: continuing after a failed "cat" would
                # delete the source files and leave only a truncated merge result.
                if os.system(merge_cmd) != 0:
                    my_log.critical("problem with concatenating fasta files..:\n{}".format(merge_cmd))
                    sys.exit(1)

            if use_fasta2kmers2:
                command = "{cmd} -i {combined} -f {out}".format(cmd=fasta2kmers2_command, combined=combined_fasta, out=outfile)
            else:
                command = "{cmd} {combined} >> {out}".format(cmd=fasta2kmers_command, combined=combined_fasta, out=outfile)

            if run_parallel:
                # each fragment length has its own output file, so these can run concurrently
                tasks.append(parallel.TaskCmd(command))
                continue

            s = os.system(command)
            if s != 0:
                my_log.critical("problem with generating kmers..:\n{}".format(command))
                sys.exit(1)

        if run_parallel:
            if parallel.reportFailedCmd(parallel.runCmdParallel(tasks, settings["processors"])) is not None:
                sys.exit(-1)