Example #1
    def __init__(self, bam, output, context_file):
        """Parameter initialization"""
        self.bam_file = bam
        self.in_bam = pysam.AlignmentFile(bam, "rb")
        self.filtered_bam = pysam.AlignmentFile(output,
                                                "wb",
                                                template=self.in_bam)
        # a list of ints is used to count pairs assigned to different filters
        # 0: count_input_alignments
        # 1: count_input_pairs
        # 2: count_filtered_pairs
        # 3: count_multimapped
        # 4: count_star_chimeric_alignments
        # 5: count_qcd_alignments
        # 6: count_unmapped
        # 7: count_10bp_s_clip
        # 8: count_proper_pair
        # 9: count_not_filtered_but_in_fusion_gene
        self.counter = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        self.last_time = 0
        self.logger = Logger("{}.fusionReadFilterLog".format(output))

        self.coord_dict = {}
        self.get_ranges(context_file)
        print(self.coord_dict)
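
The Logger used in Example #1 (and in the EasyFuse-style snippets of Examples #5, #8, #9, #11 and #12 below) is constructed with a log-file path and later called via info(), debug(), error() and get_path(). Its implementation is not part of these snippets; the following is only a minimal sketch of the interface they appear to assume, built on the standard logging module, and every detail beyond the method names is an assumption.

import logging

class Logger(object):
    """Sketch of the file-path Logger interface assumed by the EasyFuse-style examples."""
    def __init__(self, log_path):
        self._path = log_path
        self._logger = logging.getLogger(log_path)
        self._logger.setLevel(logging.DEBUG)
        if not self._logger.handlers:  # avoid adding duplicate handlers on repeated construction
            handler = logging.FileHandler(log_path)
            handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
            self._logger.addHandler(handler)

    def get_path(self):
        return self._path

    def info(self, msg):
        self._logger.info(msg)

    def debug(self, msg):
        self._logger.debug(msg)

    def error(self, msg):
        self._logger.error(msg)
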
Example #2
 def __init__(self, ftype):
     self.__ftype = ftype
     self.__redis_link = self.__redis_connect()
     mylog = Logger(
         os.path.join(os.path.abspath(os.path.curdir),
                      'misc/spider_log.yaml'))
     self.__logger = mylog.outputLog()
Example #3
def main():
    mylog = Logger(
        os.path.join(os.path.abspath(os.path.curdir), 'misc/spider_log.yaml'))
    logger = mylog.outputLog()
    year = input("Enter the year: ")
    allurl = get_links(year)
    downurl(allurl, logger)
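
Examples #2, #3, #4, #7 and #15 use a different Logger variant: it is constructed from a YAML file (misc/spider_log.yaml) and outputLog() returns a ready-to-use logger that is then passed around. A plausible minimal sketch, assuming the YAML file is a standard logging.config.dictConfig configuration; the logger name 'spider' is a placeholder, not taken from these snippets.

import logging
import logging.config
import yaml

class Logger(object):
    """Sketch of a YAML-configured logger factory; outputLog() returns a logging.Logger."""
    def __init__(self, yaml_path):
        with open(yaml_path, "r") as conf_file:
            logging.config.dictConfig(yaml.safe_load(conf_file))

    def outputLog(self):
        # 'spider' is a hypothetical logger name; the real name would be defined in the YAML file
        return logging.getLogger("spider")
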
Example #4
File: meiju.py Project: zzcv/python
def main():
    print("欢迎使用 美剧天堂 爬取脚本")
    print("=" * 20)
    print("魔幻/科幻:1\n灵异/惊悚:2\n都市/感情:3\n犯罪/历史:4\n选秀/综艺:5\n动漫/卡通:6")
    print("=" * 20)
    ftype = input('请输入需要爬取的类型的代号:')
    start_url = "http://www.meijutt.com/file/list%s.html" % ftype
    ourl = openurl.OpenUrl(start_url, 'gb2312')
    code, doc = ourl.openurl()
    mylog = Logger(
        os.path.join(os.path.abspath(os.path.curdir), 'misc/spider_log.yaml'))
    logger = mylog.outputLog()
    if code == 200:
        selecter = etree.HTML(doc)
        pages = selecter.xpath(
            "//div[@class='page']/span/text()")[0].split()[0].split('/')[1]
        firstpage_links = selecter.xpath("//a[@class='B font_14']/@href")
        for firstpage_link in firstpage_links:
            name, download_links = get_downlink(firstpage_link)
            send_mysql(name, download_links, logger)
            time.sleep(0.5)

        for page in range(2, int(pages)):
            page_url = 'http://www.meijutt.com/file/list%s_%s.html' % (ftype,
                                                                       page)
            for link in page_link(page_url):
                name, download_links = get_downlink(link)
                if name != '' and download_links != '':
                    send_mysql(name, download_links, logger)
                    time.sleep(0.5)
    else:
        print("[%s] error..." % start_url)

    print("Done.")
Example #5
 def __init__(self, cmd, input_paths, working_dir):
     """Parameter initiation and work folder creation. Start of progress logging."""
     self.working_dir = os.path.abspath(working_dir)
     self.logger = Logger(os.path.join(self.working_dir, "easyfuse_processing.log"))
     IOMethods.create_folder(self.working_dir, self.logger)
     copy(os.path.join(cfg.module_dir, "config.py"), working_dir)
     self.logger.info("Starting easyfuse: CMD - {}".format(cmd))
     self.input_paths = [os.path.abspath(file) for file in input_paths]
     self.samples = SamplesDB(os.path.join(self.working_dir, "samples.db"))
Example #6
 def __init__(self, scratch_path, fetchdata_path, sample_id):
     """Parameter initiation and work folder creation."""
     self.scratch_path = scratch_path
     self.fetchdata_path = fetchdata_path
     self.sample_id = sample_id
     #self.tools = Samples(os.path.join(scratch_path, os.path.pardir, os.path.pardir, "samples.csv")).get_tool_list_from_state(self.sample_id)
     self.samples = SamplesDB(
         os.path.join(scratch_path, os.path.pardir, "samples.db"))
     self.logger = Logger(os.path.join(self.fetchdata_path,
                                       "fetchdata.log"))
Example #7
 def __init__(self):
     self.__redis_link = self.__redis_connect()
     mylog = Logger(
         os.path.join(os.path.abspath(os.path.curdir),
                      'misc/spider_log.yaml'))
     self.__logger = mylog.outputLog()
     self.mysql_connect = mysql_connect.MysqlConnect(
         os.path.join(os.path.abspath(os.path.curdir),
                      'misc/mysql_data.yaml'))
     self.main_url = 'http://www.hanfan.cc/'
Example #8
 def __init__(self, scratch_path, fusion_output_path, sample_id,
              tool_num_cutoff, fusiontool_list, sample_log):
     """Parameter initiation and work folder creation."""
     self.scratch_path = scratch_path
     self.fusion_output_path = fusion_output_path
     self.sample_id = sample_id
     self.tool_num_cutoff = int(tool_num_cutoff)
     # urla: if we want to be more generic and allow different annotations, identification of the chr names
     #       (eg "chr1" vs "1" and "chrM" vs "MT") should be performed in advance
     self.chr_list = ("1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
                      "11", "12", "13", "14", "15", "16", "17", "18", "19",
                      "20", "21", "22", "X", "Y", "MT")
     self.tools = fusiontool_list.split(",")
     self.logger = Logger(sample_log)
Example #9
class Fusionreadfilter(object):
    """Select alignments belonging to putative fusions from an s/bam file"""
    def __init__(self, bam, output):
        """Parameter initialization"""
        self.bam_file = bam
        self.in_bam = pysam.AlignmentFile(bam, "rb")
        self.filtered_bam = pysam.AlignmentFile(output,
                                                "wb",
                                                template=self.in_bam)
        # a list of ints is used to count pairs assigned to different filters
        # 0: count_input_alignments
        # 1: count_input_pairs
        # 2: count_filtered_pairs
        # 3: count_multimapped
        # 4: count_star_chimeric_alignments
        # 5: count_qcd_alignments
        # 6: count_unmapped
        # 7: count_10bp_s_clip
        # 8: count_proper_pair
        self.counter = [0, 0, 0, 0, 0, 0, 0, 0, 0]
        self.last_time = 0
        self.logger = Logger("{}.fusionReadFilterLog".format(output))

    def classify_pair(self, read1, read2, read1_flag, read2_flag,
                      member_count):
        """Classify a read pairs into different groups"""
        self.counter[1] += 1

        # count star chimeric
        if member_count > 2:
            self.counter[4] += 1
            return True
        # count multimapping reads
        if read1.get_tag("NH") > 1 or read2.get_tag("NH") > 1:
            self.counter[3] += 1
            return False
        # count pairs containing unmapped mates
        if read1_flag & 0x4 or read2_flag & 0x4:
            self.counter[6] += 1
            return True
        # check if the pair is not a "proper pair" (both mapped, adequate distance)
        if not read1_flag & 0x2 or not read2_flag & 0x2:
            self.counter[8] += 1
            return True
        # check if 10 (default) or more soft-clippings (S) are in the alignment
        if read1.query_alignment_length + 10 <= read1.query_length or read2.query_alignment_length + 10 <= read2.query_length:
            self.counter[7] += 1
            return True
        return False

    def run(self):
        """Walk linewise through a s/bam file and send proper read pairs to classification"""
        self.logger.info("Starting fusion read filtering")
        read1 = read2 = None
        read_flag = read1_flag = read2_flag = None
        self.last_time = time.time()
        last_query = ""
        count_current_query_member = 0
        # read filtering works as follows:
        # urla: I wrote the processing in a way that implementing the analysis of multi-mapping reads should be straightforward
        #       For this, however, one would need to check in the group of multi-mapping alignments whether there is no possible "normal mapping"
        #       I could so far think of two possible solutions: (1) running "classify_pair" on all possible combinations of the multi-mapping pairs
        #       until a "normal mapping" is found. If this is never the case, the pair is written to the filtered output. This, however, would have
        #       a strong effect on runtimes! (2) using the chimeric multi-mapping setting in the most recent star versions. This should actually
        #       work directly, as star chimeric classification outranks multi-mapping disposal. It has, however, not been thoroughly evaluated by the star community
        for read in self.in_bam.fetch(
                until_eof=True
        ):  # iterate through alignments as they appear in the file (this is mandatory because (a) we cannot create an index, (b) we want to include unmapped reads and (c) there are many references in the header during the 2nd filtering)
            self.counter[0] += 1
            read_flag = read.flag

            if last_query != read.query_name and self.counter[0] > 1:
                if self.classify_pair(read1, read2, read1_flag, read2_flag,
                                      count_current_query_member):
                    self.filtered_bam.write(read1)
                    self.filtered_bam.write(read2)
                    self.counter[2] += 1
                read1 = read2 = None
                read1_flag = read2_flag = None
                count_current_query_member = 0

            count_current_query_member += 1
            # ignore all alignments which are either supplemental, vendor qc'd, secondary or duplicates
            # urla: duplicate flagging/removal prior to running this script would most probably lead to several errors
            #       Nevertheless, it is a very fast check and I would also not recommend that someone does deduplication on rna-seq data!
            if read_flag > 255:
                self.counter[5] += 1
            else:
                # urla: the following commented version should be fine, but the other one is still kept in order to be more error-aware
                #                if read.is_read1:
                #                    read1 = read
                #                else:
                #                    read2 = read
                if not read1 and read.is_read1:
                    read1 = read
                    read1_flag = read_flag
                elif not read2 and read.is_read2:
                    read2 = read
                    read2_flag = read_flag
                else:
                    self.logger.error(
                        "Neither r1 nor r2??? Read: {0}; R1: {1}; R2: {2}; bamLine: {3}"
                        .format(read, read1, read2, self.counter[0]))
                    sys.exit(1)
            # urla: uncomment the following, if you'd like to have stats updates during the run and not only at the end


#            if count_input_alignments % 1000000 == 0:
#                self.print_stats(count_input_alignments)
            last_query = read.query_name

        # once EOF is reached, the very last pair has to be classified additionally
        if self.classify_pair(read1, read2, read1_flag, read2_flag,
                              count_current_query_member):
            self.filtered_bam.write(read1)
            self.filtered_bam.write(read2)
            self.counter[2] += 1

        # close reading/writing stream
        self.in_bam.close()
        self.filtered_bam.close()
        self.print_stats()
        self.logger.info("Finished fusion read filtering")

    def print_stats(self):
        """print collected statistics to the log file"""
        this_time = time.time()
        time_taken = this_time - self.last_time
        time_taken_1m = float(time_taken * 1000000) / float(self.counter[0])
        self.last_time = this_time
        self.logger.info(
            "Processed {0} alignments, {1} of {2} pairs remained after filtering ({3:.2%}) ({4:.2f}s / 1M alignments; {5:.2f}s in total)"
            .format(self.counter[0], self.counter[2], self.counter[1],
                    float(self.counter[2]) / float(self.counter[1]),
                    time_taken_1m, time_taken))

        qc1 = False
        if self.get_input_read_count_from_star() == self.counter[1]:
            qc1 = True
        qc2 = False
        if (self.counter[4] == self.counter[5]) and (
            (self.counter[0] - self.counter[5]) * 0.5 == self.counter[1]):
            qc2 = True

        # 0: count_input_alignments
        # 1: count_input_pairs
        # 2: count_filtered_pairs
        # 3: count_multimapped
        # 4: count_star_chimeric_alignments
        # 5: count_qcd_alignments
        # 6: count_unmapped
        # 7: count_10bp_s_clip
        # 8: count_proper_pair
        self.logger.info(
            "Star_chimeric (chim alignment from star):\t{} pairs (filtered)".
            format(self.counter[4]))
        self.logger.info(
            "QC'd (additional Star_chimeric alignment):\t{} alignments (included in above)"
            .format(self.counter[5]))
        self.logger.info(
            "Multimapped (1 < x <= 100 equal mappings):\t{} pairs (discarded)".
            format(self.counter[3]))
        self.logger.info(
            "Unmapped (no mapping or >100 multi map):\t{} pairs (filtered)".
            format(self.counter[6]))
        self.logger.info(
            "No proper pair (unexpected read distance):\t{} pairs (filtered)".
            format(self.counter[8]))
        self.logger.info(
            "10bp_s_clip (>9bp soft-clipped in cigar):\t{} pairs (filtered)".
            format(self.counter[7]))
        self.logger.info(
            "Unlikely chimeric (\"normal\" mappings): \t{} pairs (discarded)".
            format(self.counter[1] - self.counter[4] - self.counter[3] -
                   self.counter[6] - self.counter[8] - self.counter[7]))
        self.logger.info(
            "Filter QC1 (fq reads = bam alignments):\t{}".format(qc1))
        self.logger.info(
            "Filter QC2 (QC'd alignments are chimeric):\t{}".format(qc2))

    def get_input_read_count_from_star(self):
        """Parses a star output log file to get input read counts from the fastq origin"""
        log_file = "{}Log.final.out".format(
            self.bam_file.rstrip("Aligned.out.bam"))
        with open(log_file, "r") as star_log:
            for line in star_log:
                if line.split("|")[0].strip() == "Number of input reads":
                    return int(line.split("|")[1].strip())
        return -1
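
classify_pair() and run() in Example #9 lean on SAM flag bits: 0x2 (read mapped in proper pair), 0x4 (read unmapped), and the shortcut read_flag > 255, which holds exactly when one of the bits 0x100 (secondary), 0x200 (QC fail), 0x400 (duplicate) or 0x800 (supplementary) is set. A small standalone illustration of those checks; the flag values are made up for the demo:

def is_unmapped(flag):
    return bool(flag & 0x4)

def is_proper_pair(flag):
    return bool(flag & 0x2)

def is_secondary_qc_dup_or_supplementary(flag):
    # equivalent to the `read_flag > 255` shortcut above, because
    # 0x100, 0x200, 0x400 and 0x800 are the only SAM flag bits >= 256
    return flag > 255

# 99 = paired + proper pair + mate reverse + first in pair; 77 = paired + both mates unmapped + first in pair;
# 355 = 99 with the secondary-alignment bit (0x100) added
for flag in (99, 77, 355):
    print(flag, is_unmapped(flag), is_proper_pair(flag),
          is_secondary_qc_dup_or_supplementary(flag))
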
Example #10
parser = argparse.ArgumentParser(description='GAN without MI')
parser.add_argument('--config', type=str, default='./configs/spiral_mine.yml',
                    help='Path to config file')
opts = parser.parse_args()
params = get_config(opts.config)
print(params)

train_loader, val_loader = spiral_dataloader(params)

if params['use_mine']:
    model = GAN_MI(params)
else:
    model = GAN(params)

if params['use_cuda']:
    model = model.cuda()

logger = Logger(params['logs'])

exp_logs = params['logs'] + params['exp_name'] + '_' + timestamp + '/' 
exp_results = params['results'] + params['exp_name'] + '_' + timestamp + '/'
mkdir_p(exp_logs)
mkdir_p(exp_results)

if params['use_mine']:
    gan_trainer = GANTrainerMI(model, params, train_loader, val_loader, logger, exp_results, exp_logs)
else:
    gan_trainer = GANTrainerVanilla(model, params, train_loader, val_loader, logger, exp_results, exp_logs)

gan_trainer.train()
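
mkdir_p, timestamp, get_config and the trainer classes in Example #10 come from the surrounding project and are not shown here. mkdir_p in particular is commonly just a thin wrapper around os.makedirs; a sketch of what it presumably does (an assumption, not the project's actual helper):

import os

def mkdir_p(path):
    """Create path (including parents) and do nothing if it already exists."""
    os.makedirs(path, exist_ok=True)
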
Example #11
class Processing(object):
    """Run, monitor and schedule fastq processing for fusion gene prediction"""
    def __init__(self, cmd, input_paths, working_dir):
        """Parameter initiation and work folder creation. Start of progress logging."""
        self.working_dir = os.path.abspath(working_dir)
        self.logger = Logger(os.path.join(self.working_dir, "easyfuse_processing.log"))
        IOMethods.create_folder(self.working_dir, self.logger)
        copy(os.path.join(cfg.module_dir, "config.py"), working_dir)
        self.logger.info("Starting easyfuse: CMD - {}".format(cmd))
        self.input_paths = [os.path.abspath(file) for file in input_paths]
        self.samples = SamplesDB(os.path.join(self.working_dir, "samples.db"))

    # The run method simply greps and organises fastq input files.
    # Fastq pairs (single end input is currently not supported) are then sent to "execute_pipeline"
    def run(self, tool_num_cutoff):
        """General parameter setting, identification of fastq files and initiation of processing"""
        self.logger.info("Pipeline Version: {}".format(cfg.version))
        # Checking dependencies
        #VersCont(os.path.join(cfg.module_dir, "dependency_versions.txt")).get_and_print_tool_versions()
        #self.cfg.run_self_test()
        # urla: organism is currently not used for anything, however, this might change; is mouse processing relevant at some point?
        ref_genome = cfg.ref_genome_build
        ref_trans = cfg.ref_trans_version

        self.logger.info("Reference Genome: {0}, Reference Transcriptome: {1}".format(ref_genome, ref_trans))
 #       if self.overwrite:
 #           self.logger.info("#############################################################################")
 #           self.logger.info("")
 #           self.logger.info("Overwrite flag is set => all previously existing results may be overwritten!")
 #           self.logger.info("")
 #           self.logger.info("#############################################################################")


        sample_list = []
        # get fastq files
        left, right, sample_id = IOMethods.get_fastq_files(self.input_paths, self.logger)
        sample_list = sample_id
        for i, _ in enumerate(left):
            if len(left) == len(right):
                self.logger.info("Processing Sample ID: {} (paired end)".format(sample_id[i]))
                self.logger.info("Sample 1: {}".format(left[i]))
                self.logger.info("Sample 2: {}".format(right[i]))
                self.execute_pipeline(left[i], right[i], sample_id[i], ref_genome, ref_trans, tool_num_cutoff)

        
        # summarize all data if selected
        if "Summary" in cfg.tools:
            #dependency = [Queueing.get_jobs_by_name("Fetchdata-{}".format(sample)) for sample in sample_list]
            # urla - note: would be happy to get the dependencies with a stacked LC, but it is atm too complicated for me ^^
            dependency = []
            for sample in sample_list:
                dependency.extend(Queueing.get_jobs_by_name("Fetchdata-{}".format(sample), cfg.queueing_system))
            modelling_string = ""
            if cfg.other_files["easyfuse_model"]:
                modelling_string = " --model_predictions"
            cmd_summarize = "python {0} --input {1}{2}".format(os.path.join(cfg.module_dir, "summarize_data.py"), self.working_dir, modelling_string)
            self.logger.debug("Submitting slurm job: CMD - {0}; PATH - {1}; DEPS - {2}".format(cmd_summarize, self.working_dir, dependency))
            cpu = cfg.resources["summary"]["cpu"]
            mem = cfg.resources["summary"]["mem"]
            self.submit_job("-".join([cfg.pipeline_name, "Summary", str(int(round(time.time())))]), cmd_summarize, cpu, mem, self.working_dir, dependency, cfg.receiver)

    # Per sample, define input parameters and execution commands, create a folder tree and submit runs to slurm
    def execute_pipeline(self, fq1, fq2, sample_id, ref_genome, ref_trans, tool_num_cutoff):
        """Create sample specific subfolder structure and run tools on fastq files"""
        self.samples.add_sample(sample_id, "NA", fq1, fq2)

        refs = cfg.references

        # Genome/Gene references to use
        genome_sizes_path = refs["genome_sizes"]
        genome_chrs_path = refs["genome_fastadir"]
        genes_fasta_path = refs["genes_fasta"]
        genes_gtf_path = refs["genes_gtf"]

        # Paths to specific indices
        indices = cfg.indices
        other_files = cfg.other_files

        bowtie_index_path = indices["bowtie"]
        star_index_path = indices["star"]
#        kallisto_index_path = indices["kallisto"]
#        pizzly_cache_path = "{}.pizzlyCache.txt".format(genes_gtf_path)
        starfusion_index_path = indices["starfusion"]
        fusioncatcher_index_path = indices["fusioncatcher"]
        infusion_cfg_path = other_files["infusion_cfg"]
#        starchip_param_path = other_files["starchip_param"]

        # Output results folder creation - currently included:
        # 1) Gene/Isoform expression: kallisto, star
        # 2) Fusion prediction: mapsplice, pizzly, fusioncatcher, star-fusion, starchip, infusion
        output_results_path = os.path.join(self.working_dir, "Sample_{}".format(sample_id))
        qc_path = os.path.join(output_results_path, "qc")
        skewer_path = os.path.join(qc_path, "skewer")
        qc_table_path = os.path.join(qc_path, "qc_table.txt")
        overrepresented_path = os.path.join(qc_path, "overrepresented.fa")
        filtered_reads_path = os.path.join(output_results_path, "filtered_reads")
        expression_path = os.path.join(output_results_path, "expression")
#        kallisto_path = os.path.join(expression_path, "kallisto")
        star_path = os.path.join(expression_path, "star")
        fusion_path = os.path.join(output_results_path, "fusion")
        mapsplice_path = os.path.join(fusion_path, "mapsplice")
        pizzly_path = os.path.join(fusion_path, "pizzly")
        fusioncatcher_path = os.path.join(fusion_path, "fusioncatcher")
        starfusion_path = os.path.join(fusion_path, "starfusion")
        starchip_path = os.path.join(fusion_path, "starchip")
        infusion_path = os.path.join(fusion_path, "infusion")
        soapfuse_path = os.path.join(fusion_path, "soapfuse")
        fetchdata_path = os.path.join(self.working_dir, "Sample_{}".format(sample_id), "fetchdata")
        fastqc_1 = os.path.join(qc_path, os.path.basename(fq1).replace(".fastq.gz", "") + "_fastqc", "fastqc_data.txt")
        fastqc_2 = os.path.join(qc_path, os.path.basename(fq2).replace(".fastq.gz", "") + "_fastqc", "fastqc_data.txt")


        for folder in [
                output_results_path, 
                qc_path, 
                skewer_path, 
                filtered_reads_path,
                expression_path, 
#                kallisto_path, 
                star_path,
                fusion_path, 
                mapsplice_path, 
#                pizzly_path, 
                fusioncatcher_path, 
                starfusion_path, 
#                starchip_path, 
                infusion_path, 
                soapfuse_path,
                fetchdata_path
            ]:
            IOMethods.create_folder(folder, self.logger)

        # get a list of tools from the samples.db file that have been run previously on this sample
        state_tools = self.samples.get_tool_list_from_state(sample_id)
        # get a list of tools from the config file which shall be run on this sample
        tools = cfg.tools
        cmds = cfg.commands
        module_dir = cfg.module_dir
        # Define cmd strings for each program
        # urla: mapsplice requires gunzip'd read files and process substitutions don't seem to work in slurm scripts...
        #       process substitutions somehow do not work from this script - c/p the command line to the terminal, however, works w/o issues?!
        cmd_fastqc = "{0} --nogroup --extract -t 6 -o {1} {2} {3}".format(cmds["fastqc"], qc_path, fq1, fq2)
        cmd_qc_parser = "{0} -i {1} {2} -o {3}".format(os.path.join(module_dir, "misc", "qc_parser.py"), fastqc_1, fastqc_2, qc_table_path)
        cmd_skewer = "{0} -q {1} -i {2} {3} -o {4}".format(os.path.join(module_dir, "tool_wrapper", "skewer_wrapper.py"), qc_table_path, fq1, fq2, skewer_path)

        fq0 = ""
        if "QC" in tools:
            fq0 = os.path.join(skewer_path, "out_file-trimmed.fastq.gz")
            fq1 = os.path.join(skewer_path, "out_file-trimmed-pair1.fastq.gz")
            fq2 = os.path.join(skewer_path, "out_file-trimmed-pair2.fastq.gz")
        else:
            qc_table_path = "None"

        # (0) Readfilter
        cmd_star_filter = "{0} --genomeDir {1} --outFileNamePrefix {2}_ --readFilesCommand zcat --readFilesIn {3} {4} --outFilterMultimapNmax 100 --outSAMmultNmax 1 --chimSegmentMin 10 --chimJunctionOverhangMin 10 --alignSJDBoverhangMin 10 --alignMatesGapMax {5} --alignIntronMax {5} --chimSegmentReadGapMax 3 --alignSJstitchMismatchNmax 5 -1 5 5 --seedSearchStartLmax 20 --winAnchorMultimapNmax 50 --outSAMtype BAM Unsorted --chimOutType Junctions WithinBAM --outSAMunmapped Within KeepPairs --runThreadN waiting_for_cpu_number".format(cmds["star"], star_index_path, os.path.join(filtered_reads_path, sample_id), fq1, fq2, cfg.max_dist_proper_pair)
        cmd_read_filter = "{0} --input {1}_Aligned.out.bam --output {1}_Aligned.out.filtered.bam".format(os.path.join(module_dir, "fusionreadfilter.py"), os.path.join(filtered_reads_path, sample_id))
        # re-define fastq's if filtering is on (default)
        fq0 = ""
        if "Readfilter" in tools:
            fq0 = os.path.join(filtered_reads_path, os.path.basename(fq1).replace("R1", "R0").replace(".fastq.gz", "_filtered_singles.fastq.gz"))
            fq1 = os.path.join(filtered_reads_path, os.path.basename(fq1).replace(".fastq.gz", "_filtered.fastq.gz"))
            fq2 = os.path.join(filtered_reads_path, os.path.basename(fq2).replace(".fastq.gz", "_filtered.fastq.gz"))

        cmd_bam_to_fastq = "{0} fastq -0 {1} -1 {2} -2 {3} --threads waiting_for_cpu_number {4}_Aligned.out.filtered.bam".format(cmds["samtools"], fq0, fq1, fq2, os.path.join(filtered_reads_path, sample_id))
        # (1) Kallisto expression quantification (required for pizzly)
#        cmd_kallisto = "{0} quant --threads waiting_for_cpu_number --genomebam --gtf {1} --chromosomes {2} --index {3} --fusion --output-dir waiting_for_output_string {4} {5}".format(cmds["kallisto"], genes_gtf_path, genome_sizes_path, kallisto_index_path, fq1, fq2)
        # (2) Star expression quantification (required for starfusion and starchip)
        cmd_star = "{0} --genomeDir {1} --outFileNamePrefix waiting_for_output_string --runThreadN waiting_for_cpu_number --runMode alignReads --readFilesIn {2} {3} --readFilesCommand zcat --chimSegmentMin 10 --chimJunctionOverhangMin 10 --alignSJDBoverhangMin 10 --alignMatesGapMax {4} --alignIntronMax {4} --chimSegmentReadGapMax 3 --alignSJstitchMismatchNmax 5 -1 5 5 --seedSearchStartLmax 20 --winAnchorMultimapNmax 50 --outSAMtype BAM SortedByCoordinate --chimOutType Junctions SeparateSAMold --chimOutJunctionFormat 1".format(cmds["star"], star_index_path, fq1, fq2, cfg.max_dist_proper_pair)
        # (3) Mapsplice
        # urla: the "keep" parameter requires gunzip >= 1.6
        cmd_extr_fastq1 = "gunzip --keep {0}".format(fq1)
        cmd_extr_fastq2 = "gunzip --keep {0}".format(fq2)
        # Added python interpreter to circumvent external hardcoded shell script
        cmd_mapsplice = "{0} --chromosome-dir {1} -x {2} -1 {3} -2 {4} --threads waiting_for_cpu_number --output {5} --qual-scale phred33 --bam --seglen 20 --min-map-len 40 --gene-gtf {6} --fusion".format(cmds["mapsplice"], genome_chrs_path, bowtie_index_path, fq1[:-3], fq2[:-3], mapsplice_path, genes_gtf_path)
        # (4) Fusioncatcher
        cmd_fusioncatcher = "{0} --input {1} --data {2} --output {3} -p waiting_for_cpu_number".format(cmds["fusioncatcher"], ",".join([fq1, fq2]), fusioncatcher_index_path, fusioncatcher_path)
        # star-fusion and star-chip can be run upon a previous star run (this MUST NOT be the star_filter run, but the star_expression run)
        # (5)
        cmd_starfusion = "{0} --chimeric_junction {1} --genome_lib_dir {2} --CPU waiting_for_cpu_number --output_dir {3}".format(cmds["starfusion"], "{}_Chimeric.out.junction".format(os.path.join(star_path, sample_id)), starfusion_index_path, starfusion_path)
        # (7)
#        cmd_starchip = "{0} {1} {2} {3}".format(cmds["starchip"], os.path.join(starchip_path, "starchip"), "{}_Chimeric.out.junction".format(os.path.join(star_path, sample_id)), starchip_param_path)
        # (6) Pizzly
#        cmd_pizzly = "{0} -k 29 --gtf {1} --cache {2} --fasta {3} --output {4} {5}".format(cmds["pizzly"], genes_gtf_path, pizzly_cache_path, genes_fasta_path, os.path.join(pizzly_path, "kallizzy"), os.path.join(kallisto_path, "fusion.txt"))
#        cmd_pizzly2 = "{0} {1} {2}".format(cmds["pizzly_cmd2"], "{}.json".format(os.path.join(pizzly_path, "kallizzy")), "{}.json.txt".format(os.path.join(pizzly_path, "kallizzy")))
        # (8) Infusion
        cmd_infusion = "{0} -1 {1} -2 {2} --skip-finished --min-unique-alignment-rate 0 --min-unique-split-reads 0 --allow-non-coding --out-dir {3} {4}".format(cmds["infusion"], fq1, fq2, infusion_path, infusion_cfg_path)
        # (x) Soapfuse
        cmd_soapfuse = "{0} -q {1} -i {2} -o {3}".format(os.path.join(module_dir, "tool_wrapper", "soapfuse_wrapper.py"), qc_table_path, " ".join([fq1, fq2]), soapfuse_path)
        # (9) Data collection
        cmd_fetchdata = "{0} -i {1} -o {2} -s {3} --fq1 {4} --fq2 {5} --fusion_support {6}".format(os.path.join(module_dir, "fetchdata.py"), output_results_path, fetchdata_path, sample_id, fq1, fq2, tool_num_cutoff)
        # (10) De novo assembly of fusion transcripts
        # urla: This is currently still under active development and has not been tested thoroughly
#        cmd_denovoassembly = "{0} -i waiting_for_gene_list_input -b {1}_Aligned.out.bam -g {2} -t {3} -o waiting_for_assembly_out_dir".format(os.path.join(module_dir, "denovoassembly.py"), os.path.join(filtered_reads_path, sample_id), ref_genome, ref_trans)
        # (X) Sample monitoring
        cmd_samples = "{0} --db_path={1} --sample_id={2} --action=append_state --tool=".format(os.path.join(module_dir, "misc", "samples.py"), self.samples.db_path, sample_id)

        # set final lists of executable tools and path
        exe_tools = [
            "QC", #0
            "Readfilter", #1
#            "Kallisto", #2
            "Star", #3
            "Mapsplice", #4
            "Fusioncatcher", #5
            "Starfusion", #6
#            "Pizzly", #7
#            "Starchip", #8
            "Infusion", #9
            "Soapfuse", #10
            "Fetchdata" #11
#            "Assembly" #12
            ]
        exe_cmds = [
            " && ".join([cmd_fastqc, cmd_qc_parser, cmd_skewer]), #0
            " && ".join([cmd_star_filter, cmd_read_filter, cmd_bam_to_fastq]), #1
#            cmd_kallisto, #2
            cmd_star, #3
            " && ".join([cmd_extr_fastq1, cmd_extr_fastq2, cmd_mapsplice]), #4
            cmd_fusioncatcher, #5
            cmd_starfusion, #6
#            " && ".join([cmd_pizzly, cmd_pizzly2]), #7
#            cmd_starchip, #8
            cmd_infusion, #9
            cmd_soapfuse, #10
            cmd_fetchdata #11
#            cmd_denovoassembly #12
            ]
        exe_path = [
            qc_path, #0
            filtered_reads_path, #1
#            kallisto_path, #2
            star_path, #3
            mapsplice_path, #4
            fusioncatcher_path, #5
            starfusion_path, #6
#            pizzly_path, #7
#            starchip_path, #8
            infusion_path, #9
            soapfuse_path, #10
            fetchdata_path #11
#            "" #12
            ]

        # create and submit slurm job if the tool is requested and hasn't been run before
        for i, tool in enumerate(exe_tools, 0):
            if tool in tools:
                dependency = []
                # check dependencies of the pipeline.
                # Besides tool dependencies (Pizzly -> Kallisto, Starfusion/Starchip -> Star), read filtering is mandatory
                # Processing will be skipped if a certain dependency was not found (either pre-processed data or the config's tool string are checked)
                if tool in state_tools:
                    # urla: the primary idea behind this flag is to allow multiple fetchdata executions during processing
                    #       nevertheless, re-processing of the same data with a newer version of a tool will also be straightforward (but overwriting previous results, of course)
#                    if self.overwrite:
#                        self.logger.info("Executing {0} although it looks like a previous run finished successfully. Results in {1} may be overwritten".format(tool, exe_path[i]))
#                    else:
                    self.logger.info("Skipping {0} as it looks like a previous run finished successfully. Results should be in {1}".format(tool, exe_path[i]))
                    continue
                else:
                    if tool == "Readfilter" and "Readfilter" not in tools:
                        self.logger.error(
                                """Error 99: Sample {} will be skipped due to missing read filtering.\n
                                Read filtering is currently a mandatory step for the processing.\n
                                Because you haven't run it before for this sample, you have to include \"Readfilter\" in the tool selection in your config.\n
                                """.format(sample_id))
                        print("Error 99: Sample {} will be skipped due to missing read filtering.".format(sample_id))
                        return 0
                    elif tool == "Pizzly" and "Kallisto" not in tools:
                        self.logger.error(
                                """Error 99: Running {0} for Sample {1} will be skipped due to a missing dependency.\n
                                Pizzly builds on Kallisto and it is therefore mandatory to run this first.\n
                                Because you haven't run it before for this sample, you have to include \"Kallisto\" in the tool selection in your config.\n
                                """.format(sample_id))
                        print("Error 99: Running {0} for Sample {1} will be skipped due to a missing dependency.".format(tool, sample_id))
                        continue
                    elif (tool == "Starfusion" or tool == "Starchip") and "Star" not in tools:
                        self.logger.error(
                                """Error 99: Running {0} for Sample {1} will be skipped due to a missing dependency.\n
                                {0} builds on Star and it is therefore mandatory to run this first.\n
                                Because you haven't run it before for this sample, you have to include \"Star\" in the tool selection in your config.\n
                                """.format(sample_id))
                        print("Error 99: Running {0} for Sample {1} will be skipped due to a missing dependency.".format(tool, sample_id))
                        continue

                # prepare slurm jobs: get resources, create uid, set output path and check dependencies
                self.logger.debug("Submitting {} run to slurm".format(tool))
                cpu = cfg.resources[tool.lower()]["cpu"]
                mem = cfg.resources[tool.lower()]["mem"]
                uid = "-".join([cfg.pipeline_name, tool, sample_id])
                if tool == "Star":
                    exe_cmds[i] = exe_cmds[i].replace("waiting_for_output_string", "{}_".format(os.path.join(exe_path[i], sample_id))).replace("waiting_for_cpu_number", str(cpu))
                else:
                    exe_cmds[i] = exe_cmds[i].replace("waiting_for_output_string", exe_path[i]).replace("waiting_for_cpu_number", str(cpu))
                cmd = " && ".join([exe_cmds[i], cmd_samples + tool])
                # Managing slurm dependencies
                que_sys = cfg.queueing_system
                if tool == "Pizzly":
                    dependency = Queueing.get_jobs_by_name("Kallisto-{0}".format(sample_id), que_sys)
                elif tool == "Starfusion" or tool == "Starchip":
                    dependency = Queueing.get_jobs_by_name("Star-{0}".format(sample_id), que_sys)
                elif tool == "Fetchdata":
                    dependency = Queueing.get_jobs_by_name(sample_id, que_sys)
                elif tool == "Assembly":
                    dependency = Queueing.get_jobs_by_name("Fetchdata-{0}".format(sample_id), que_sys)
                elif tool == "ReadFilter":
                    dependency = Queueing.get_jobs_by_name("QC-{0}".format(sample_id), que_sys)
                dependency.extend(Queueing.get_jobs_by_name("Readfilter-{0}".format(sample_id), que_sys))
                dependency.extend(Queueing.get_jobs_by_name("QC-{0}".format(sample_id), que_sys))
                self.logger.debug("Submitting slurm job: CMD - {0}; PATH - {1}; DEPS - {2}".format(cmd, exe_path[i], dependency))
                self.submit_job(uid, cmd, cpu, mem, exe_path[i], dependency, "")
            else:
                self.logger.info("Skipping {0} as it is not selected for execution (Selected are: {1})".format(tool, tools))

    def submit_job(self, uid, cmd, cores, mem_usage, output_results_folder, dependencies, mail):
        """Submit job to slurm scheduling"""
        que_sys = cfg.queueing_system
        already_running = Queueing.get_jobs_by_name(uid, que_sys)
        if not already_running:
            # urla: for compatibility reasons (and to be independent of shell commands), concatenated commands are split again,
            #       dependencies within the split groups are updated and everything is submitted sequentially to the queueing system
            module_file = os.path.join(cfg.module_dir, "build_env.sh")

            for i, cmd_split in enumerate(cmd.split(" && ")):
                if not que_sys in ["slurm", "pbs"]:
                    cmd_split = cmd_split.split(" ")
                dependencies.extend(Queueing.get_jobs_by_name("{0}_CMD{1}".format(uid, i - 1), que_sys))
                Queueing.submit("{0}_CMD{1}".format(uid, i), cmd_split, cores, mem_usage, output_results_folder, dependencies, cfg.partition, cfg.user, cfg.time_limit, mail, module_file, que_sys)
                time.sleep(0.5)
        else:
            self.logger.error("A job with this application/sample combination is currently running. Skipping {} in order to avoid unintended data loss.".format(uid))
Example #12
class Fetching(object):
    """Run, monitor and schedule fastq processing for fusion gene prediction"""
    def __init__(self, scratch_path, fetchdata_path, sample_id):
        """Parameter initiation and work folder creation."""
        self.scratch_path = scratch_path
        self.fetchdata_path = fetchdata_path
        self.sample_id = sample_id
        #self.tools = Samples(os.path.join(scratch_path, os.path.pardir, os.path.pardir, "samples.csv")).get_tool_list_from_state(self.sample_id)
        self.samples = SamplesDB(
            os.path.join(scratch_path, os.path.pardir, "samples.db"))
        self.logger = Logger(os.path.join(self.fetchdata_path,
                                          "fetchdata.log"))

    def get_pseudo_genome_adjustments_for_star(self, num_len_file):  # wrong pylint error due to long name => pylint: disable=C0103
        """Return the genome size of an associated fasta file calculated by urla_GetFusionSequence_latest.R"""
        seq_num = 0
        genome_size = 0
        with open(num_len_file) as lfile:
            seq_num = int(next(lfile))
            genome_size = int(next(lfile))
        star_genome_chr_bin_n_bits = min(
            18, int(math.log(genome_size / seq_num, 2)))
        star_genome_sa_index_n_bases = min(
            14, int(math.log(genome_size, 2) / 2 - 1)) - 2
        self.logger.debug(
            "Custom genome sequence number: {0} => {1} will be used as bin size parameter for genome storage"
            .format(seq_num, star_genome_chr_bin_n_bits))
        self.logger.debug(
            "Custom Genome Size: {0} bp => {1} will be used as length parameter for SA pre-indexing"
            .format(genome_size, star_genome_sa_index_n_bases))
        return (str(star_genome_chr_bin_n_bits),
                str(star_genome_sa_index_n_bases))

    @staticmethod
    def get_input_read_count_from_star(star_out_bam):
        """Parses a star output log file to get input read counts from the fastq origin"""
        log_file = "{}Log.final.out".format(
            star_out_bam.rstrip("Aligned.out.bam"))
        if not os.path.exists(log_file):
            return -1
        with open(log_file, "r") as star_log:
            for line in star_log:
                if line.split("|")[0].strip() == "Number of input reads":
                    return int(line.split("|")[1].strip())
        return -1

    @staticmethod
    def get_input_read_count_from_fastq(fastq):
        """Parses input FASTQ to get read count"""
        ps = subprocess.Popen(("zcat", fastq), stdout=subprocess.PIPE)
        result = subprocess.check_output(("wc", "-l"), stdin=ps.stdout)
        return int(result) // 4

    def run(self, fusion_support, fq1, fq2):
        """Identification of fastq files and initiation of processing"""
        # print sample id
        # execute processing pipe
        # sampleID = ...
        self.logger.info("Fetching in sample {}".format(self.sample_id))
        if not fq1 or not fq2:
            self.logger.debug(
                "Either ReadFile 1 or 2 or both are missing, trying to get original files from samples.csv"
            )
            self.logger.debug(self.sample_id)
            self.logger.debug(self.samples.db_path)
            (fq1, fq2) = self.samples.get_fastq_files(self.sample_id)
        self.execute_pipeline(fq1, fq2, fusion_support)

    # urla: there are a lot of local variables declared in the following method.
    #       Although this could be reduced quite strongly, readability would be strongly reduced as well
    #       pylint:disable=R0914
    def execute_pipeline(self, fq1, fq2, fusion_support):
        """Create sample specific subfolder structuge and run tools on fastq files"""

        # Genome/Gene references to use
        ref_trans = cfg.ref_trans_version
        ref_genome = cfg.ref_genome_build
        genome_fasta_path = cfg.references["genome_fasta"]
        genes_adb_path = cfg.references["genes_adb"]
        genes_tsl_path = cfg.references["genes_tsl"]

        fetchdata_current_path = os.path.join(
            self.fetchdata_path, "fd_{}_tool".format(fusion_support))
        detected_fusions_path = os.path.join(fetchdata_current_path,
                                             "fetched_fusions")
        detected_fusions_file = os.path.join(detected_fusions_path,
                                             "Detected_Fusions.csv")
        context_seq_path = os.path.join(fetchdata_current_path,
                                        "fetched_contextseqs")
        context_seq_file = os.path.join(context_seq_path, "Context_Seqs.csv")
        filtered_reads_path = os.path.join(self.scratch_path, "filtered_reads")
        star_genome_path = os.path.join(context_seq_path, "STAR_idx")
        star_align_path = os.path.join(context_seq_path, "STAR_align")
        star_align_file = os.path.join(star_align_path,
                                       "{}_".format(self.sample_id))
        classification_path = os.path.join(fetchdata_current_path,
                                           "classification")
        classification_file = os.path.join(classification_path,
                                           "classification")

        for folder in [
                fetchdata_current_path, detected_fusions_path,
                context_seq_path, star_genome_path, star_align_path,
                classification_path
        ]:
            IOMethods.create_folder(folder, self.logger)

        # processing steps to perform
        tools = cfg.fd_tools
        fusion_tools = cfg.tools
        module_dir = cfg.module_dir
        cmds = cfg.commands
        # In case of a liftover, some references and paths must be changed accordingly
        cmd_contextseq_org = ""
        if "Liftover" in tools:
            tools.insert(2, "ContextSeqBak")
            # for read grepping, we need the original reference on which the first mapping was performed
            cmd_contextseq_org = "python {0} --detected_fusions {1}.bak --annotation_db {2} --out_csv {3}.bak --genome_fasta {4} --tsl_info {5} --cis_near_dist {6} --context_seq_len {7} --tsl_filter_level {8}".format(
                os.path.join(module_dir, "fusionannotation.py"),
                detected_fusions_file, genes_adb_path, context_seq_file,
                genome_fasta_path, genes_tsl_path, cfg.cis_near_distance,
                cfg.context_seq_len, cfg.tsl_filter)
            # now, references need to be updated according to the target liftover
            crossmap_chain = cfg.liftover["crossmap_chain"]
            ref_genome_dest = os.path.basename(crossmap_chain).replace(
                ".", "_").split("_")[2].lower()
            self.logger.debug(
                "Creating a copy of the detected fusions file due to selection of liftover. Old ({0}) data will be kept in \"{1}.bak\""
                .format(ref_genome, detected_fusions_file))
            genome_fasta_path = cfg.references["genome_fasta_hg37"]
            genes_adb_path = cfg.references["genes_adb_hg37"]

        # urla - note: tmp hack to get original star input reads for normalization
        with open(
                os.path.join(classification_path, "Star_org_input_reads.txt"),
                "w") as infile:
            read_count = self.get_input_read_count_from_star(
                os.path.join(filtered_reads_path,
                             "{}_Aligned.out.bam".format(self.sample_id)))
            if read_count == -1:
                read_count = self.get_input_read_count_from_fastq(fq1)
            infile.write(str(read_count))
        # Define cmd strings for each program
        cmd_fusiondata = "{0} -i {1} -o {2} -s {3} -t {4} -f {5} -l {6}".format(
            os.path.join(module_dir, "fusiontoolparser.py"), self.scratch_path,
            detected_fusions_path, self.sample_id, fusion_support,
            ",".join(cfg.fusiontools), self.logger.get_path())
        cmd_liftover = "{0} -i {1} -l {2}".format(
            os.path.join(module_dir, "misc", "liftover.py"),
            detected_fusions_file, self.logger.get_path())
        cmd_contextseq = "{0} --detected_fusions {1} --annotation_db {2} --out_csv {3} --genome_fasta {4} --tsl_info {5} --cis_near_dist {6} --context_seq_len {7} --tsl_filter_level {8}".format(
            os.path.join(module_dir, "fusionannotation.py"),
            detected_fusions_file, genes_adb_path, context_seq_file,
            genome_fasta_path, genes_tsl_path, cfg.cis_near_distance,
            cfg.context_seq_len, cfg.tsl_filter)
        cpu = cfg.resources["fetchdata"]["cpu"]
        #        mem = cfg.resources["fetchdata"]["mem"]
        cmd_starindex = "{0} --runMode genomeGenerate --runThreadN {1} --limitGenomeGenerateRAM 48000000000 --genomeChrBinNbits waiting_for_bin_size_input --genomeSAindexNbases waiting_for_sa_idx_input --genomeDir {2} --genomeFastaFiles {3}".format(
            cmds["star"], cpu, star_genome_path,
            "{0}{1}".format(context_seq_file, ".fasta"))
        cmd_staralign_fltr = "{0} --genomeDir {1} --readFilesCommand zcat --readFilesIn {2} {3} --outSAMtype BAM SortedByCoordinate --outFilterMultimapNmax -1 --outSAMattributes Standard --outSAMunmapped None --outFilterMismatchNoverLmax 0.02 --runThreadN {4} --outFileNamePrefix {5}fltr_ --limitBAMsortRAM 48000000000".format(
            cmds["star"], star_genome_path, fq1, fq2, cpu, star_align_file)
        cmd_bamindex_fltr = "{0} index {1}fltr_Aligned.sortedByCoord.out.bam".format(
            cmds["samtools"], star_align_file)
        cmd_requantify_fltr = "{0} -i {1}fltr_Aligned.sortedByCoord.out.bam -o {2}_fltr.tdt -d 10".format(
            os.path.join(module_dir, "requantify.py"), star_align_file,
            classification_file)
        (fq1, fq2) = self.samples.get_fastq_files(self.sample_id)
        cmd_staralign_org = "{0} --genomeDir {1} --readFilesCommand zcat --readFilesIn {2} {3} --outSAMtype BAM SortedByCoordinate --outFilterMultimapNmax -1 --outSAMattributes Standard --outSAMunmapped None --outFilterMismatchNoverLmax 0.02 --runThreadN {4} --outFileNamePrefix {5}org_ --limitBAMsortRAM 48000000000".format(
            cmds["star"], star_genome_path, fq1, fq2, cpu, star_align_file)
        cmd_bamindex_org = "{0} index {1}org_Aligned.sortedByCoord.out.bam".format(
            cmds["samtools"], star_align_file)
        cmd_requantify_org = "{0} -i {1}org_Aligned.sortedByCoord.out.bam -o {2}_org.tdt -d 10".format(
            os.path.join(module_dir, "requantify.py"), star_align_file,
            classification_file)
        # for testing, based on debug. should be removed if merged to original
        cmd_read_filter2 = "{0} --input {1}_Aligned.out.bam --input2 {2}.debug --output {1}_Aligned.out.filtered2.bam".format(
            os.path.join(module_dir, "getRequantReads.py"),
            os.path.join(filtered_reads_path, self.sample_id),
            context_seq_file)
        # re-define fastq's if filtering is on (default)
        fq0 = ""
        if "Readfilter" in fusion_tools:
            fq0 = os.path.join(
                filtered_reads_path,
                os.path.basename(fq1).replace("R1", "R0").replace(
                    ".fastq.gz", "_filtered2_singles.fastq.gz"))
            fq1 = os.path.join(
                filtered_reads_path,
                os.path.basename(fq1).replace(".fastq.gz",
                                              "_filtered2.fastq.gz"))
            fq2 = os.path.join(
                filtered_reads_path,
                os.path.basename(fq2).replace(".fastq.gz",
                                              "_filtered2.fastq.gz"))
        cmd_bam_to_fastq = "{0} fastq -0 {1} -1 {2} -2 {3} --threads {5} {4}_Aligned.out.filtered2.bam".format(
            cmds["samtools"], fq0, fq1, fq2,
            os.path.join(filtered_reads_path, self.sample_id), cpu)
        # allow soft-clipping? Specificity? --alignEndsType EndToEnd
        cmd_staralign_best = "{0} --genomeDir {1} --readFilesCommand zcat --readFilesIn {2} {3} --outSAMtype BAM SortedByCoordinate --outFilterMultimapNmax -1 --outSAMattributes Standard --outSAMunmapped None --outFilterMismatchNoverLmax 0.02 --runThreadN {4} --outFileNamePrefix {5}best_ --limitBAMsortRAM 48000000000".format(
            cmds["star"], star_genome_path, fq1, fq2, cpu, star_align_file)
        cmd_bamindex_best = "{0} index {1}best_Aligned.sortedByCoord.out.bam".format(
            cmds["samtools"], star_align_file)
        cmd_requantify_best = "{0} -i {1}best_Aligned.sortedByCoord.out.bam -o {2}_best.tdt -d 10".format(
            os.path.join(module_dir, "requantify.py"), star_align_file,
            classification_file)

        # set final lists of executable tools and path
        exe_tools = [
            "Fusiongrep",  #1
            "Liftover",  #2
            "ContextSeqBak",
            "Contextseq",  #3
            "Starindex",  #4
            "StaralignFltr",  #5
            "BamindexFltr",  #6
            "RequantifyFltr",  #7
            "StaralignOrg",  #8
            "BamindexOrg",  #9
            "RequantifyOrg",  #10
            "ReadFilter2",  #11
            "ReadFilter2b",  #12
            "StaralignBest",  #13
            "BamindexBest",  #14
            "RequantifyBest"  #15
        ]
        exe_cmds = [
            cmd_fusiondata,  #1
            cmd_liftover,  #2
            cmd_contextseq_org,
            cmd_contextseq,  #3
            cmd_starindex,  #4
            cmd_staralign_fltr,  #5
            cmd_bamindex_fltr,  #6
            cmd_requantify_fltr,  #7
            cmd_staralign_org,  #8
            cmd_bamindex_org,  #9
            cmd_requantify_org,  #10
            cmd_read_filter2,  #11
            cmd_bam_to_fastq,  #12
            cmd_staralign_best,  #13
            cmd_bamindex_best,  #14
            cmd_requantify_best  #15
        ]
        exe_dependencies = [
            "",  #1
            detected_fusions_file,  #2
            detected_fusions_file,
            detected_fusions_file,  #3
            "{0}{1}".format(context_seq_file, ".fasta.info"),  #4
            star_genome_path,  #5
            "{}fltr_Aligned.sortedByCoord.out.bam".format(star_align_file),  #6
            "",  #7
            star_genome_path,  #8
            "{}org_Aligned.sortedByCoord.out.bam".format(star_align_file),  #9
            "",  #10
            "",  #11
            "",  #12
            star_genome_path,  #13
            "{}best_Aligned.sortedByCoord.out.bam".format(
                star_align_file),  #14
            ""  #15
        ]

        # create and submit slurm job if the tool is requested and hasn't been run before
        module_file = os.path.join(module_dir, "build_env.sh")
        for i, tool in enumerate(exe_tools, 0):
            if tool in tools:
                if not exe_dependencies[i] or os.path.exists(
                        exe_dependencies[i]):
                    self.logger.info("Starting {}".format(tool))
                    if tool == "Starindex":  # the genome size required for the genomeSAindexNbases parameter is not known before now
                        (star_bin, star_sa
                         ) = self.get_pseudo_genome_adjustments_for_star(
                             "{0}{1}".format(context_seq_file, ".fasta.info"))
                        exe_cmds[i] = exe_cmds[i].replace(
                            "waiting_for_bin_size_input", star_bin)
                        exe_cmds[i] = exe_cmds[i].replace(
                            "waiting_for_sa_idx_input", star_sa)
                    self.logger.debug("Executing: {}".format(exe_cmds[i]))
                    Queueing.submit("", exe_cmds[i].split(" "), "", "", "", "",
                                    "", "", "", "", module_file, "none")
                else:
                    self.logger.error(
                        "Could not run {0} due to the missing dependency {1}".
                        format(tool, exe_dependencies[i]))
                    sys.exit(1)
            else:
                self.logger.debug(
                    "Skipping {0} as it is not selected for execution (Selected are: {1})"
                    .format(tool, tools))
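
get_pseudo_genome_adjustments_for_star() in Example #12 derives two STAR indexing parameters from the pseudo-genome written by the context-sequence step: genomeChrBinNbits is log2 of the mean sequence length capped at 18, and genomeSAindexNbases is min(14, log2(genome size)/2 - 1) lowered by a further 2. A worked example with made-up numbers (500 context sequences totalling 400,000 bp):

import math

seq_num = 500          # hypothetical number of context sequences
genome_size = 400000   # hypothetical total pseudo-genome size in bp

star_genome_chr_bin_n_bits = min(18, int(math.log(genome_size / seq_num, 2)))
star_genome_sa_index_n_bases = min(14, int(math.log(genome_size, 2) / 2 - 1)) - 2

print(star_genome_chr_bin_n_bits)    # log2(800) ~ 9.6 -> 9
print(star_genome_sa_index_n_bases)  # log2(400000)/2 - 1 ~ 8.3 -> 8, then minus 2 -> 6
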
Example #13
        '%d_%m_%Y_%H_%M_%S')

    parser = argparse.ArgumentParser(description='mine')
    parser.add_argument('--config',
                        type=str,
                        default='./configs/mine.yml',
                        help='Path to config file')
    opts = parser.parse_args()
    params = get_config(opts.config)
    print(params)

    model = Mine(params)
    if params['use_cuda']:
        model = model.cuda()

    if params['training'] and not params['visualize']:
        exp_logs = params['logs'] + params['exp_name'] + '_' + timestamp + '/'
        exp_results = params['results'] + params[
            'exp_name'] + '_' + timestamp + '/'
        mkdir_p(exp_logs)
        mkdir_p(exp_results)

        config_logfile = exp_logs + 'config.json'
        with open(config_logfile, 'w+') as cf:
            json.dump(params, cf)

        optimizer = optim.Adam(model.parameters(), lr=params['lr'])
        logger = Logger(exp_logs)

        train(params)
Example #14
    model = DeepCross(opt=opt)
    model = model.cuda()

    if opt.loader:
        print("load checkpoint file .")
        model.load_state_dict(
            torch.load(os.path.join('models', 'model-1.ckpt')))

    current_lr = 1e-3
    optimizer = optim.Adam(model.parameters(), lr=current_lr)

    # criterion = nn.BCEWithLogitsLoss()
    criterion = FocalLoss()
    # criterion = nn.BCELoss()
    logger = Logger('./logs/')

    for epoch in range(2, opt.num_epoches):
        # schedule learning rate
        frac = epoch // 2
        decay_factor = 0.9**frac
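        # note: current_lr is reused across epochs, so each epoch multiplies the already-decayed rate
        # by 0.9**frac again, i.e. the decay compounds faster than a plain 0.9**(epoch // 2) schedule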
        current_lr = current_lr * decay_factor
        utils.set_lr(optimizer, current_lr)

        # training
        model.train()
        start = time.time()

        for i, data in enumerate(train_loader):
            # prepare data and corresponding label(which is 'click')
            user_id = data['user_id'].cuda()
Example #15
def main():
    mylog = Logger(os.path.join(os.path.abspath(os.path.curdir), 'misc/spider_log.yaml'))
    logger = mylog.outputLog()
    items = spiderman()
    for item in items:
        send_mysql(item, logger)
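
send_mysql is not shown here. A hedged sketch of such a helper using pymysql; the connection settings, table and column names are placeholders, not the project's actual schema:

import pymysql


def send_mysql(item, logger):
    """Insert one scraped item into MySQL and log the outcome (hypothetical schema)."""
    conn = pymysql.connect(host='127.0.0.1', user='spider', password='secret',
                           database='spider_db', charset='utf8mb4')
    try:
        with conn.cursor() as cursor:
            cursor.execute("INSERT INTO items (name, link) VALUES (%s, %s)",
                           (item.get('name'), item.get('link')))
        conn.commit()
        logger.info("inserted item: %s", item.get('name'))
    except Exception as exc:
        conn.rollback()
        logger.error("insert failed: %s", exc)
    finally:
        conn.close()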
Example #16
0
def train(model, vocab, cfg):
    seqtree_coco = SeqtreeCOCO()
    loader = DataLoader(seqtree_coco,
                        batch_size=16,
                        shuffle=True,
                        num_workers=4)
    logdir = os.path.join(cfg.checkpoint_path, cfg.id)
    if not os.path.isdir(logdir):
        os.mkdir(logdir)
    logger = Logger(logdir)

    with open(os.path.join(logdir, 'config.txt'), 'w') as f:
        f.write(str(cfg))
    with open('data/idx2caps.json', 'r') as f:
        cocoid2caps = json.load(f)
    cocoid2caps = {int(k): v for k, v in cocoid2caps.items()}
    init_scorer('coco-train-idxs')

    infos = {}
    # if cfg.start_from is not None:
    #     with open(os.path.join(cfg.start_from, 'infos_' + cfg.start_from + '_best.pkl'), 'rb') as f:
    #         infos = pickle.load(f)

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    val_result_history = infos.get('val_result_history', {})
    loss_history = infos.get('loss_history', {})
    lr_history = infos.get('lr_history', {})

    best_val_score = 0

    update_lr_flag = True
    if cfg.caption_model in ('att_model', 'tree_model', 'tree_model_1',
                             'tree_model_md', 'tree_model_2', 'tree_model_md_att',
                             'tree_model_md_sob', 'tree_model_md_in', 'drnn'):
        # crit = nn.CrossEntropyLoss()
        crit = LanguageModelCriterion()
        rl_crit = RewardCriterion()
    else:
        raise Exception("Caption model not supported: {}".format(
            cfg.caption_model))

    optimizer = torch.optim.Adam(model.parameters(), lr=cfg.learning_rate)

    num_period_best = 0
    current_score = 0
    start = time.time()

    print("start training...")

    while True:
        if update_lr_flag:
            if epoch > cfg.learning_rate_decay_start >= 0:
                frac = (epoch - cfg.learning_rate_decay_start
                        ) // cfg.learning_rate_decay_every
                decay_factor = cfg.learning_rate_decay_rate**frac
                cfg.current_lr = cfg.learning_rate * decay_factor
                utils.set_lr(optimizer, cfg.current_lr)
            else:
                cfg.current_lr = cfg.learning_rate

        optimizer.zero_grad()
        for data in loader:
            if cfg.use_cuda:
                torch.cuda.synchronize()

            if cfg.caption_model == 'tree_model_md_att':
                temp = [
                    data['word_idx'], data['father_idx'], data['masks'],
                    data['fc_feats'], data['att_feats']
                ]
                temp = [_.cuda() for _ in temp]
                word_idx, father_idx, masks, fc_feats, att_feats = temp

            elif cfg.caption_model in ('tree_model_md', 'tree_model_md_sob',
                                       'tree_model_md_in', 'drnn'):
                temp = [
                    data['word_idx'], data['father_idx'], data['masks'],
                    data['fc_feats']
                ]
                temp = [_.cuda() for _ in temp]
                word_idx, father_idx, masks, fc_feats = temp
                # no attention features for these models; pass None to _sample below
                # (assumes the model accepts att_feats=None)
                att_feats = None
                # words = [[vocab.idx2word[word_idx[batch_index][i].item()] for i in range(40)]
                #          for batch_index in range(2)]

            else:
                raise Exception("Caption model not supported: {}".format(
                    cfg.caption_model))

            optimizer.zero_grad()
            # if cfg.caption_model == 'tree_model_md_att':
            #     logprobs = model(word_idx, father_idx, fc_feats, att_feats)
            #     loss = crit(logprobs, word_idx, masks)
            if cfg.caption_model in ('tree_model_md', 'tree_model_md_sob',
                                     'tree_model_md_in', 'drnn',
                                     'tree_model_md_att'):
                word_idx, father_idx, mask, seqLogprobs = model._sample(
                    fc_feats, att_feats, max_seq_length=40)
                gen_result = utils.decode_sequence(vocab, word_idx, father_idx,
                                                   mask)
                ratio = utils.seq2ratio(word_idx, father_idx, mask)
                reward = get_self_critical_reward(model, fc_feats, att_feats,
                                                  data, gen_result,
                                                  vocab, cocoid2caps,
                                                  word_idx.size(1), cfg)
                loss = rl_crit(seqLogprobs, mask,
                               torch.from_numpy(reward).float().cuda(), ratio)

            else:
                raise Exception("Caption model not supported: {}".format(
                    cfg.caption_model))

            loss.backward()
            utils.clip_gradient(optimizer, cfg.grad_clip)
            optimizer.step()
            train_loss = loss.item()

            if cfg.use_cuda:
                torch.cuda.synchronize()

            if iteration % cfg.losses_log_every == 0:
                end = time.time()
                logger.scalar_summary('train_loss', train_loss, iteration)
                logger.scalar_summary('learning_rate', cfg.current_lr,
                                      iteration)
                loss_history[iteration] = train_loss
                lr_history[iteration] = cfg.current_lr
                print(
                    "iter {} (epoch {}), learning_rate: {:.6f}, train_loss: {:.6f}, current_cider: {:.3f}, best_cider: {:.3f}, time/log = {:.3f}" \
                        .format(iteration, epoch, cfg.current_lr, train_loss, current_score, best_val_score,
                                end - start))
                start = time.time()

            if (iteration + 1) % cfg.save_checkpoint_every == 0:
                eval_kwargs = {'eval_split': 'val', 'eval_time': False}
                eval_kwargs.update(vars(cfg))
                # lang_stats = eval_utils.eval_split(model, vocab, eval_kwargs)
                lang_stats = eval_seqtree.eval_split(model, vocab, eval_kwargs)
                if cfg.use_cuda:
                    model = model.cuda()

                for k, v in lang_stats.items():
                    logger.scalar_summary(k, v, iteration)

                val_result_history[iteration] = {'lang_stats': lang_stats}

                current_score = lang_stats['CIDEr']
                best_flag = False

                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                    num_period_best = 1
                else:
                    num_period_best += 1

                if best_flag:
                    infos['iter'] = iteration
                    infos['epoch'] = epoch
                    infos['val_result_history'] = val_result_history
                    infos['loss_history'] = loss_history
                    infos['lr_history'] = lr_history

                    checkpoint_path = os.path.join(
                        logdir, 'model_' + cfg.id + '_best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    optimizer_path = os.path.join(
                        logdir, 'optimizer_' + cfg.id + '_best.pth')
                    torch.save(optimizer.state_dict(), optimizer_path)
                    print("model saved to {}".format(logdir))
                    with open(
                            os.path.join(logdir,
                                         'infos_' + cfg.id + '_best.pkl'),
                            'wb') as f:
                        pickle.dump(infos, f)

                if num_period_best >= cfg.num_eval_no_improve:
                    print('no improvement, exit({})'.format(best_val_score))
                    sys.exit()

            iteration += 1

        epoch += 1
        if cfg.max_epoches != -1 and epoch >= cfg.max_epoches:
            break
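
RewardCriterion above is the self-critical (REINFORCE-style) sequence loss, but its definition is not part of the snippet. A sketch of the usual formulation that matches the call rl_crit(seqLogprobs, mask, reward, ratio), assuming the reward tensor is broadcastable to the log-probability tensor and that ratio acts as an extra per-token weight (its exact meaning is not shown):

import torch.nn as nn


class RewardCriterion(nn.Module):
    """Sketch of a self-critical sequence training loss: negative log-probabilities of the
    sampled tokens weighted by the baseline-subtracted reward, masked and averaged."""

    def forward(self, seq_logprobs, mask, reward, ratio=None):
        mask = mask.float()
        weight = mask if ratio is None else mask * ratio.float()  # 'ratio' semantics assumed
        loss = -seq_logprobs * reward * weight
        return loss.sum() / weight.sum().clamp(min=1.0)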
Example #18
0
class Trainer:
    """
    Trainer class with hyperparams, log, train function etc.
    """
    def __init__(self, opt):
        lopt = opt.logger
        topt = opt.trainer
        mopt = opt.model
        gopt = opt.model.gen
        copt = opt.model.crit
        goopt = opt.optim.gen
        coopt = opt.optim.crit

        #CUDA configuration
        if opt.device == 'cuda' and torch.cuda.is_available():
            os.environ['CUDA_VISIBLE_DEVICES'] = opt.deviceId
            torch.backends.cudnn.benchmark = True
        else:
            opt.device = 'cpu'

        self.device = torch.device(opt.device)

        #logger
        self.logger_ = Logger(self, gopt.latentSize, topt.resumeTraining,
                              opt.tick, opt.loops, lopt.logPath, lopt.logStep,
                              lopt.saveImageEvery, lopt.saveModelEvery,
                              lopt.logLevel, self.device)
        self.logger = self.logger_.logger

        #Logging configuration parameters
        if opt.device == 'cuda':
            num_gpus = len(opt.deviceId.split(','))
            self.logger.info("Using {} GPUs.".format(num_gpus))
            self.logger.info("Training on {}.\n".format(
                torch.cuda.get_device_name(0)))

        #data loader
        dlopt = opt.dataLoader

        self.dataLoader = DataLoader(dlopt.dataPath, dlopt.resolution,
                                     dlopt.noChannels, dlopt.batchSize,
                                     dlopt.numWorkers)

        self.resolution, self.nCh = self.dataLoader.resolution, self.dataLoader.nCh

        # training opt
        assert opt.tick > 0, self.logger.error(
            f'The number of ticks should be a positive integer, got {opt.tick} instead'
        )
        self.tick = float(opt.tick)

        assert opt.loops > 0, self.logger.error(
            f'The number of loops should be a positive integer, got {opt.loops} instead'
        )
        self.loops = int(opt.loops)

        self.imShown = 0
        self.batchShown = self.imShown // self.dataLoader.batchSize

        assert topt.lossFunc in ['NSL', 'WD'], self.logger.error(
            'The specified loss function is not supported. Please choose either "NSL" or "WD"'
        )
        self.lossFunc = topt.lossFunc
        self.criterion = NonSaturatingLoss if self.lossFunc == 'NSL' else WassersteinLoss

        self.applyLossScaling = bool(topt.applyLossScaling)

        self.paterm = topt.paterm
        self.lambg = float(topt.lambg)
        self.gLazyReg = max(topt.gLazyReg, 1)
        self.styleMixingProb = float(topt.styleMixingProb)

        self.meanPathLength = 0.

        self.plDecay = topt.meanPathLengthDecay

        self.pathRegWeight = topt.pathLengthRWeight

        assert topt.nCritPerGen > 0, self.logger.error(
            f'Trainer ERROR: The number of critic training loops per generator loop should be an integer >= 1 (got {topt.nCritPerGen})'
        )
        self.nCritPerGen = int(topt.nCritPerGen)

        self.lambR2 = float(topt.lambR2) if topt.lambR2 else 0  #lambda R2
        self.obj = float(topt.obj) if topt.obj else 1  #objective value (1-GP)

        self.lambR1 = float(topt.lambR1) if topt.lambR1 else 0  #lambda R1

        self.epsilon = float(
            topt.epsilon) if topt.epsilon else 0  #epsilon (drift loss)

        self.cLazyReg = max(topt.cLazyReg, 1)

        self.kUnroll = int(topt.unrollCritic) if topt.unrollCritic else 0

        assert self.kUnroll >= 0, self.logger.error(
            f'Trainer ERROR: The unroll parameter is less than zero ({self.kUnroll})'
        )

        #Common model parameters
        common = {
            'fmapMax': mopt.fmapMax,
            'fmapMin': mopt.fmapMin,
            'fmapDecay': mopt.fmapDecay,
            'fmapBase': mopt.fmapBase,
            'activation': mopt.activation,
            'upsample': mopt.sampleMode,
            'downsample': mopt.sampleMode
        }

        #Generator model parameters
        self.gen = Generator(**common, **gopt).to(self.device)
        self.latentSize = self.gen.mapping.latentSize

        self.logger.info(
            f'Generator constructed. Number of parameters {sum([np.prod([*p.size()]) for p in self.gen.parameters()])}'
        )

        #Critic model parameters
        self.crit = Critic(**mopt, **copt).to(self.device)

        self.logger.info(
            f'Critic constructed. Number of parameters {sum([np.prod([*p.size()]) for p in self.crit.parameters()])}'
        )

        #Generator optimizer parameters
        glr, beta1, beta2, epsilon, lrDecay, lrDecayEvery, lrWDecay = list(
            goopt.values())

        assert lrDecay >= 0 and lrDecay <= 1, self.logger.error(
            'Trainer ERROR: The decay constant for the learning rate of the generator must be a constant between [0, 1]'
        )
        assert lrWDecay >= 0 and lrWDecay <= 1, self.logger.error(
            'Trainer ERROR: The weight decay constant for the generator must be a constant between [0, 1]'
        )
        self.gOptimizer = Adam(filter(lambda p: p.requires_grad,
                                      self.gen.parameters()),
                               lr=glr,
                               betas=(beta1, beta2),
                               weight_decay=lrWDecay,
                               eps=epsilon)

        if lrDecayEvery and lrDecay:
            self.glrScheduler = lr_scheduler.StepLR(self.gOptimizer,
                                                    step_size=lrDecayEvery *
                                                    self.tick,
                                                    gamma=lrDecay)
        else:
            self.glrScheduler = None

        self.logger.info(f'Generator optimizer constructed')

        #Critic optimizer parameters
        clr, beta1, beta2, epsilon, lrDecay, lrDecayEvery, lrWDecay = list(
            coopt.values())

        assert lrDecay >= 0 and lrDecay <= 1, self.logger.error(
            'Trainer ERROR: The decay constant for the learning rate of the critic must be a constant between [0, 1]'
        )
        assert lrWDecay >= 0 and lrWDecay <= 1, self.logger.error(
            'Trainer ERROR: The weight decay constant for the critic must be a constant between [0, 1]'
        )

        self.cOptimizer = Adam(filter(lambda p: p.requires_grad,
                                      self.crit.parameters()),
                               lr=clr,
                               betas=(beta1, beta2),
                               weight_decay=lrWDecay,
                               eps=epsilon)

        if lrDecayEvery and lrDecay:
            self.clrScheduler = lr_scheduler.StepLR(self.cOptimizer,
                                                    step_size=lrDecayEvery *
                                                    self.tick,
                                                    gamma=lrDecay)
        else:
            self.clrScheduler = None

        self.logger.info(f'Critic optimizer constructed')

        self.preWtsFile = opt.preWtsFile
        self.resumeTraining = bool(topt.resumeTraining)
        self.loadPretrainedWts(resumeTraining=self.resumeTraining)

        self.logger.info(
            f'The trainer has been instantiated.... Starting step: {self.imShown}. Resolution: {self.resolution}'
        )

        self.logArchitecture(clr, glr)

    def logArchitecture(self, clr, glr):
        """
        This function will print hyperparameters and architecture and save the in the log directory under the architecture.txt file
        """

        cstFcn = f'Cost function model: {self.lossFunc}\n'

        hyperParams = (
            f'HYPERPARAMETERS - res = {self.resolution}|bs = {self.dataLoader.batchSize}|cLR = {clr}|gLR = {glr}|lambdaR2 = {self.lambR2}|'
            f'obj = {self.obj}|lambdaR1 = {self.lambR1}|epsilon = {self.epsilon}|{self.loops} loops, showing {self.tick} images per loop'
            f'|Using pulling away regularization? {"Yes" if self.paterm else "No"}'
        )

        architecture = '\n' + str(self.crit) + '\n\n' + str(self.gen) + '\n\n'

        self.logger.info(cstFcn + hyperParams)

        f = os.path.join(self.logger_.logPath, self.logger_.archFile)

        self.logger.debug(architecture)

        utils.writeFile(f, cstFcn + hyperParams + architecture, 'w')

    def loadPretrainedWts(self, resumeTraining=False):
        """
        Search for weight file in the experiment directory, and loads it if found
        """
        dir = self.preWtsFile
        if os.path.isfile(dir):
            try:
                stateDict = torch.load(
                    dir, map_location=lambda storage, loc: storage)
                self.crit.load_state_dict(stateDict['crit'])
                self.gen.load_state_dict(
                    stateDict['gen'], strict=False
                )  #Since the cached noise buffers are initialized at None
                self.logger.debug(f'Loaded pre-trained weights from {dir}')

                if resumeTraining:
                    self.imShown = stateDict['imShown']
                    self.loops = stateDict['loops']
                    self.tick = stateDict['tick']
                    self.logger_.genLoss = stateDict['genLoss']
                    self.logger_.criticLoss = stateDict['criticLoss']
                    self.logger_.criticLossReals = stateDict['criticLossReals']
                    self.logger_.criticLossFakes = stateDict['criticLossFakes']
                    self.logger_.logCounter = stateDict['logCounter']
                    self.logger_.ncAppended = stateDict['ncAppended']
                    self.logger_.ngAppended = stateDict['ngAppended']
                    self.logger_.snapCounter = stateDict['snapCounter']
                    self.logger_.imgCounter = stateDict['imgCounter']
                    self.cOptimizer.load_state_dict(stateDict['cOptimizer'])
                    self.gOptimizer.load_state_dict(stateDict['gOptimizer'])
                    self.clrScheduler.load_state_dict(
                        stateDict['clrScheduler'])
                    self.glrScheduler.load_state_dict(
                        stateDict['glrScheduler'])
                    self.batchShown = stateDict['batchShown']
                    self.meanPathLength = stateDict['meanPathLength']
                    self.logger.debug('And the optimizer states as well')

                return True
            except Exception as e:
                self.logger.error(
                    f'ERROR: The weights in {dir} could not be loaded\n {str(e)}\n Proceeding from zero...'
                )
                return False
        else:
            self.logger.error(
                f'ERROR: The file {dir} does not exist. Proceeding from zero...'
            )

        return False

    def getReals(self, n=None):
        """
        Returns n real images
        """
        return self.dataLoader.get(n).to(device=self.device)

    def getFakes(self, n=None, z=None):
        """
        Returns n fake images and their latent vectors
        """
        if n is None: n = self.dataLoader.batchSize

        if z is None:
            z = utils.getNoise(bs=n,
                               latentSize=self.latentSize,
                               device=self.device)

            if self.styleMixingProb and random() < self.styleMixingProb:
                zmix = utils.getNoise(bs=n,
                                      latentSize=self.latentSize,
                                      device=self.device)
                zmix = (zmix - zmix.mean(dim=1, keepdim=True)) / (
                    zmix.std(dim=1, keepdim=True) + 1e-8)
                output = self.gen(z, zmix=zmix)

            else:
                output = self.gen(z)

        else:
            output = self.gen(z)

        if isinstance(output, list):
            return [*output, z]
        else:
            return [output, z]

    def getBatchReals(self):
        """
        Returns a batch of real images
        """
        return self.dataLoader.get_batch().to(device=self.device)

    def getBatchFakes(self):
        """
        Returns a batch of fake images and the latent vector which generated it
        """
        return self.getFakes()

    def R2GradientPenalization(self, reals, fakes):
        alpha = torch.rand(reals.size(0), 1, 1, 1, device=reals.device)
        interpols = (alpha * reals +
                     (1 - alpha) * fakes).detach().requires_grad_(True)
        cOut = self.crit(interpols).sum()

        if self.applyLossScaling:
            cOut = applyLossScaling(cOut)

        ddx = autograd.grad(outputs=cOut,
                            inputs=interpols,
                            grad_outputs=torch.ones_like(cOut,
                                                         device=self.device),
                            create_graph=True,
                            retain_graph=True,
                            only_inputs=True)[0]

        ddx = ddx.view(ddx.size(0), -1)

        if self.applyLossScaling:
            ddx = undoLossScaling(ddx)

        return (
            (ddx.norm(dim=1) - self.obj).pow(2)).mean() / (self.obj + 1e-8)**2

    def R1GradientPenalization(self, reals):
        reals.requires_grad_(True)
        cOut = self.crit(reals).sum()

        if self.applyLossScaling:
            cOut = applyLossScaling(cOut)

        ddx = autograd.grad(outputs=cOut,
                            inputs=reals,
                            grad_outputs=torch.ones_like(cOut,
                                                         device=self.device),
                            create_graph=True,
                            retain_graph=True,
                            only_inputs=True)[0]

        ddx = ddx.view(ddx.size(0), -1)

        if self.applyLossScaling:
            ddx = undoLossScaling(ddx)

        return 0.5 * (ddx.pow(2).sum(dim=1)).mean()

    def GradientPathRegularization(self, fakes, latents):
        noise = torch.randn_like(fakes) / math.sqrt(
            fakes.size(2) * fakes.size(3))

        ddx = autograd.grad(outputs=(fakes * noise).sum(),
                            inputs=latents,
                            create_graph=True)[0]

        pathLengths = ddx.norm(dim=1)

        if self.meanPathLength == 0:
            self.meanPathLength = pathLengths.mean()

        else:
            self.meanPathLength = self.meanPathLength + self.plDecay * (
                pathLengths.mean() - self.meanPathLength)

        self.meanPathLength = self.meanPathLength.detach()

        return (pathLengths - self.meanPathLength).pow(2).mean()

    def trainCritic(self):
        """
        Train the critic for one step and store outputs in logger
        """
        utils.switchTrainable(self.crit, True)
        utils.switchTrainable(self.gen, False)

        # real
        real = self.dataLoader.get_batch().to(self.device)
        cRealOut = self.crit(x=real)

        # fake
        fake, *_ = self.getBatchFakes()
        cFakeOut = self.crit(x=fake.detach())

        lossReals = self.criterion(cRealOut, truth=1)
        lossFakes = self.criterion(cFakeOut, truth=-1)

        loss = lossReals + lossFakes

        if self.batchShown % self.cLazyReg == self.cLazyReg - 1:
            if self.lambR2:
                loss += self.cLazyReg * self.lambR2 * self.R2GradientPenalization(
                    real, fake)
            if self.epsilon:
                loss += self.epsilon * (cRealOut**2).mean()
            if self.lambR1:
                loss += self.lambR1 * self.R1GradientPenalization(real)

        self.cOptimizer.zero_grad()
        loss.backward()
        self.cOptimizer.step()

        if self.clrScheduler is not None:
            self.clrScheduler.step()  #Reduce learning rate

        self.logger_.appendCLoss(loss, lossReals, lossFakes)

    def trainGenerator(self):
        """
        Train Generator for 1 step and store outputs in logger
        """
        utils.switchTrainable(self.gen, True)
        utils.switchTrainable(self.crit, False)

        fake, *latents = self.getBatchFakes()
        cFakeOut = self.crit(fake)

        loss = self.criterion(cFakeOut, truth=1)

        if self.batchShown % self.gLazyReg == self.gLazyReg - 1:
            if self.pathRegWeight > 0:
                dlatent = latents[0]
                loss += self.GradientPathRegularization(
                    fake, dlatent) * self.gLazyReg * self.pathRegWeight

            if self.lambg > 0 and self.paterm:
                latent = latents[-1]
                pat = self.gen.paTerm(latent) * self.lambg * self.gLazyReg
                loss += pat

        self.gOptimizer.zero_grad()
        loss.backward()
        self.gOptimizer.step()

        if self.glrScheduler is not None:
            self.glrScheduler.step()  #Reduce learning rate

        self.logger_.appendGLoss(loss)

        return fake.size(0)

    def train(self):
        """
        Main train loop
        """

        self.logger.info('Starting training...')
        self.logger_.startLogging()  #Start the  logger

        # loop over images
        while self.imShown < self.tick * self.loops:
            if self.kUnroll:
                for i in range(self.nCritPerGen):
                    self.trainCritic()
                    if i == 0:
                        self.cBackup = copy.deepcopy(self.crit)
            else:
                for i in range(self.nCritPerGen):
                    self.trainCritic()

            # use the generator's training batches to count the images shown, not the critic's
            shown = self.trainGenerator()

            if self.kUnroll:
                self.crit.load(self.cBackup)

            self.imShown = self.imShown + int(shown)
            self.batchShown = self.batchShown + 1

            if self.batchShown > max(self.gLazyReg, self.cLazyReg):
                self.batchShown = 0

        self.logger_.saveSnapshot(
            f'{self.resolution}x{self.resolution}_final_{self.latentSize}')
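
NonSaturatingLoss, WassersteinLoss and utils.switchTrainable are imported elsewhere and not shown. A hedged sketch that is consistent with how they are called above (criterion(critic_output, truth=+1/-1) and freezing or unfreezing a network), not necessarily the repository's own definitions:

import torch.nn.functional as F


def NonSaturatingLoss(out, truth=1):
    """Sketch: non-saturating logistic GAN loss; truth=+1 pushes the critic output towards
    'real', truth=-1 towards 'fake' (softplus form as popularized by StyleGAN2)."""
    return F.softplus(-truth * out).mean()


def WassersteinLoss(out, truth=1):
    """Sketch: Wasserstein critic-score loss with the same truth=+1/-1 convention."""
    return (-truth * out).mean()


def switchTrainable(net, flag):
    """Sketch of utils.switchTrainable: (un)freeze all parameters of a module."""
    for p in net.parameters():
        p.requires_grad_(bool(flag))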
Example #19
0
parser.add_argument(
    '--config',
    type=str,
    default=
    '/home/rudra/Downloads/rudra/relationship_modeling/o2p2/physics_engine/configs/pre-planning.yml',
    help='Path to config file')
opts = parser.parse_args()
params = get_config(opts.config)
pp = pprint.PrettyPrinter(indent=2)
pp.pprint(params)

# Define models and dataloaders
train_loader, val_loader = initial_final_dataloader(params)
model = O2P2Model(params)

if params['use_cuda']:
    model = model.cuda()

exp_results_path = params['project_root'] + '/results/' + params[
    'exp_name'] + '_' + timestamp + '/'
exp_logs_path = params['project_root'] + '/logs/' + params[
    'exp_name'] + '_' + timestamp + '/'
mkdir_p(exp_logs_path)
mkdir_p(exp_results_path)

logger = Logger(exp_logs_path)

trainer = O2P2Trainer(params, model, train_loader, val_loader, logger,
                      exp_results_path, exp_logs_path)

trainer.train()
class FusionParser(object):
    """Get and parse results from previously run programs (fusion prediction, hla typing, expression estimation)"""

    # Initialization of parameters
    # urla: todo: pylint convention - too many arguments
    #             are all vars required to be class vars?
    def __init__(self, scratch_path, fusion_output_path, sample_id,
                 tool_num_cutoff, fusiontool_list, sample_log):
        """Parameter initiation and work folder creation."""
        self.scratch_path = scratch_path
        self.fusion_output_path = fusion_output_path
        self.sample_id = sample_id
        self.tool_num_cutoff = int(tool_num_cutoff)
        # urla: if we want to be more generic and allow different annotations, identification of the chr names
        #       (eg "chr1" vs "1" and "chrM" vs "MT") should be performed in advance
        self.chr_list = ("1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
                         "11", "12", "13", "14", "15", "16", "17", "18", "19",
                         "20", "21", "22", "X", "Y", "MT")
        self.tools = fusiontool_list.split(",")
        self.logger = Logger(sample_log)

    #####
    ### fusion tool parser
    #
    # fusioncatcher - results file is "summary_candidate_fusions.txt"
    def get_fusioncatcher_results(self):
        """Load and parse results from fusioncatcher"""
        fusioncatcher_predict_summary = os.path.join(
            self.scratch_path, "fusion", "fusioncatcher",
            "summary_candidate_fusions.txt")
        fusioncatcher_predict_detail = os.path.join(
            self.scratch_path, "fusion", "fusioncatcher",
            "final-list_candidate-fusion-genes.txt")
        reciprocal_fusions = []
        with open(fusioncatcher_predict_summary) as predict_summary:
            for line in predict_summary:
                if line.strip().startswith("*"):
                    # urla: todo: pylint warning - anomalous backslash
                    #             I don't really understand the problem and the RE is simple and working very fine
                    fusion_gene = re.search(r'\*\s([\S]*)', line).group(1)
                    if "reciprocal" in line:
                        reciprocal_fusions.append(
                            fusion_gene.replace("--", "_").upper())
        fusion_map = {}
        with open(fusioncatcher_predict_detail) as prediction:
            next(prediction)  # skip header line
            for line in prediction:
                elements = line.rstrip().split("\t")
                # Currently relevant fields (marked *) in the output file are:
                # * elements[0] ~ Gene_1_symbol(5end_fusion_partner)
                # * elements[1] ~ Gene_2_symbol(3end_fusion_partner)
                #   elements[2] ~ Fusion_description
                #   elements[3] ~ Counts_of_common_mapping_reads
                # * elements[4] ~ Spanning_pairs
                # * elements[5] ~ Spanning_unique_reads
                #   elements[6] ~ Longest_anchor_found
                #   elements[7] ~ Fusion_finding_method
                # * elements[8] ~ Fusion_point_for_gene_1(5end_fusion_partner)
                # * elements[9] ~ Fusion_point_for_gene_2(3end_fusion_partner)
                #   elements[10] ~ Gene_1_id(5end_fusion_partner)
                #   elements[11] ~ Gene_2_id(3end_fusion_partner)
                #   elements[12] ~ Exon_1_id(5end_fusion_partner)
                #   elements[13] ~ Exon_2_id(3end_fusion_partner)
                #   elements[14] ~ Fusion_sequence
                #   elements[15] ~ Predicted_effect
                fusion_gene = (elements[0] + "_" + elements[1]).upper()
                # if the fusion gene is reciprocal, the fusion id is reversed? <- what for??
                if fusion_gene in reciprocal_fusions:
                    fusion_gene = (elements[1] + "_" + elements[0]).upper()


                # for key in self.sub_dict:
                #     fusion_gene = fusion_gene.replace(key, self.sub_dict[key])

                # urla: why not catching "Counts_of_common_mapping_reads" which indicate how similar the fusion partners are? <- FP's; should be 0 for max specificity
                # common_map_num = elements[3]
                # urla: according to online manual, junction reads are in elements[5] and all supporting (junction and spanning?) in elements[4]?!
                # urla_c: I changed it here

                # skip all predictions not on standard chromosomes
                if elements[8].split(":")[0] not in self.chr_list or elements[
                        9].split(":")[0] not in self.chr_list:
                    continue
                fgid = fusion_gene.split("_")[0] + "_" + elements[
                    8] + "_" + fusion_gene.split("_")[1] + "_" + elements[9]

                fusion_map[fgid] = [
                    fusion_gene,  # fusion_gene
                    elements[8],  # up_gene_bp
                    elements[9],  # dn_gene_bp
                    elements[
                        5],  # junc_reads_num - urla: todo: verify that this is correct
                    elements[
                        4],  # span_reads_num - urla: todo: verify that this is correct
                    self.sample_id,
                    "Fusioncatcher"
                ]
        return fusion_map

    # starfusion - results file was "star-fusion.fusion_candidates.final" (not any more, now: "star-fusion.fusion_predictions.abridged.tsv")
    # in order to be compatible with old and new starfusion versions, the new file name is checked first; if not available, the old one is used.
    # urla - note: this is rather a hack than a proper solution, but proper version handling would actually require to dig through all versions
    #              and look for changes in file names and output columns and maybe additional stuff... this would probably require a tremendous
    #              amount of time and is therefore out of scope of this project
    def get_starfusion_results(self):
        """Load and parse results from star-fusion"""
        starfusion_predict = os.path.join(
            self.scratch_path, "fusion", "starfusion",
            "star-fusion.fusion_predictions.abridged.tsv")
        if not os.path.isfile(starfusion_predict):
            starfusion_predict = os.path.join(
                self.scratch_path, "fusion", "starfusion",
                "star-fusion.fusion_candidates.final")
        fusion_map = {}
        with open(starfusion_predict, "r") as prediction:
            next(prediction)  # skip header line
            for line in prediction:
                elements = line.rstrip().split("\t")
                # Currently relevant fields (marked *) in the output file are:
                # * elements[0] ~ FusionName
                # * elements[1] ~ JunctionReadCount
                # * elements[2] ~ SpanningFragCount
                #   elements[3] ~ SpliceType
                #   elements[4] ~ LeftGene
                # * elements[5] ~ LeftBreakpoint
                #   elements[6] ~ RightGene
                # * elements[7] ~ RightBreakpoint
                #   elements[8] ~ LargeAnchorSupport
                #   elements[9] ~ FFPM
                #   elements[10] ~ LeftBreakDinuc
                #   elements[11] ~ LeftBreakEntropy
                #   elements[12] ~ RightBreakDinuc
                #   elements[13] ~ RightBreakEntropy
                #   elements[14] ~ annots
                fusion_gene = elements[0].replace("--", "_").upper()
                # check whether fusion gene is not on primary chr
                if elements[5].split(":")[0] not in self.chr_list or elements[
                        7].split(":")[0] not in self.chr_list:
                    continue
                fgid = fusion_gene.split("_")[0] + "_" + elements[
                    5] + "_" + fusion_gene.split("_")[1] + "_" + elements[7]
                #self.logger.debug("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(fgid, fusion_gene, elements[5], elements[7], elements[1], elements[2], self.sample_id))
                fusion_map[fgid] = [
                    fusion_gene,  # fusion_gene
                    elements[5],  # up_gene_bp
                    elements[7],  # dn_gene_bp
                    elements[1],  # junc_reads_num
                    elements[2],  # span_reads_num
                    self.sample_id,
                    "Starfusion"
                ]
        return fusion_map

    # mapsplice2 - results file is "fusions_well_annotated.txt"
    def get_mapsplice_results(self):
        """Load and parse results from mapsplice"""
        mapsplice_predict = os.path.join(self.scratch_path, "fusion",
                                         "mapsplice",
                                         "fusions_well_annotated.txt")
        fusion_map = {}
        with open(mapsplice_predict) as prediction:
            # next(prediction) # mapsplice final result table, doesn't have a header!
            for line in prediction:
                elements = line.rstrip().split("\t")
                # Currently relevant fields (marked *) in the output file are:
                # * elements[0] ~ chrom: the two chromosomes involved in fusion junction
                # * elements[1] ~ doner_end: The end position of doner site of splicing on chromosome
                # * elements[2] ~ acceptor_start: The start position of acceptor site of splicing on chromosome
                #   elements[3] ~ id: The id of fusion junction
                # * elements[4] ~ coverage: number of reads aligned to the fusion junction
                # * elements[5] ~ strand: strand of the reads mapped to the two chromosomes
                #   elements[6] ~ rgb: An RGB value of the form R,G,B
                #   elements[7] ~ block_count:  The number of blocks in the BED line
                #   elements[8] ~ block_size: A comma-separated list of the block sizes.
                #   elements[9] ~ block_distance: A comma-separated list of block distance.
                #   elements[10] ~ entropy: entropy of the fusion junction.
                #   elements[11] ~ flank_case: non-zero for canonical and semi-canonical junctions (ATAC 1;GTAT 2;CTGC 3;GCAG 4;GTAG 5;CTAC 6;others 0)
                #   elements[12] ~ flank_string: the two basepairs after doner site combined the two basepairs before acceptor site
                #   elements[13] ~ min_mismatch: Minimal mismatch of read mapped to the fusion junction
                #   elements[14] ~ max_mismatch: Maximal mismatch of read mapped to the fusion junction
                #   elements[15] ~ ave_mismatch: Average mismatch of all reads mapped to the junction
                #   elements[16] ~ max_min_suffix:  if doner site is shorter than acceptor site, and if the doner site is longer than current maximal doner site length, then update current maximal doner site length
                #   elements[17] ~ max_min_prefix: if acceptor site is shorter than doner site, and if the doner site is longer than current maximal acceptor site length, then update current maximal acceptor site length
                #   elements[18] ~ min_anchor_difference: Minimal difference between doner site and acceptor site
                # * elements[19] ~ unique_read_count: Number of uniquely mapped reads mapped to the fusion
                #   elements[20] ~ multi_read_count: Number of multiple mapped reads mapped to the fusion
                #   elements[21] ~ paired_read_count: Number of reads mapped to fusion and can be paired with their mates near the fusion
                #   elements[22] ~ left_paired_read_count: Number of paired reads that the read itself is mapped to the left of its mate on genome
                #   elements[23] ~ right_paired_read_count: Number of paired reads that the read itself is mapped to the right of its mate on genome
                #   elements[24] ~ multiple_paired_read_count: Number of multiple mapped reads mapped to the fusion and are paired with their mates
                #   elements[25] ~ unique_paired_read_count: Number of uniquely mapped reads mapped to the fusion and are paired with their mates
                #   elements[26] ~ single_read_count: Number of reads mapped to the fusion but can't be paired with their mates
                #   elements[27] ~ encompassing_read pair_count: Number of reads pairs surround the fusion(but not cross the fusion)
                #   elements[28] ~ doner_start: The start of doner site of splicing on chromosome
                #   elements[29] ~ acceptor_end: The end of acceptor site of splicing on chromosome
                #   elements[30] ~ doner_iosforms: The isoform(transcript) structure on the doner site. each isoform structure is separated by '|'. The format of each isoform structure is the "start_of_the_isoform,CIGAR_string_of_structure". E.g. 59445681,180M12006N66M8046N47M|59445681,180M20118N47M| Two isoforms start at 59445681
                #   elements[31] ~ acceptor_isoforms: The isoform(transcript) structure on the acceptor site.
                #   elements[32] ~ doner uniformity score (obsolete): The p-value of T-test against the hypothesis that the start of spanning read pairs and encompassing read pairs distribute uniformly on the doner site
                #   elements[33] ~ acceptor uniformity score (obsolete): The p-value of Kolmogorov-Smirnov test against the hypothesis that the end of spanning read pairs and encompassing read pairs distribute uniformly on the acceptor site
                #   elements[34] ~ doner uniformity KS-test score (obsolete): The score of Kolmogorov-Smirnov test against the hypothesis that the start of spanning read pairs and encompassing read pairs distribute uniformly on the doner site
                #   elements[35] ~ acceptor uniformity KS-test score (obsolete): The score of Kolmogorov-Smirnov test against the hypothesis that the end of spanning read pairs and encompassing read pairs distribute uniformly on the acceptor site
                #   elements[36] ~ minimal_doner_isoform_length: Minimal length of isoform structure on the doner site
                #   elements[37] ~ maximal_doner_isoform_length: Maximal length of isoform structure on the doner site
                #   elements[38] ~ minimal_acceptor_isoform_length: Minimal length of isoform structure on the acceptor site
                #   elements[39] ~ maximal_acceptor_isoform_length: Maximal length of isoform structure on the acceptor site
                #   elements[40] ~ paired_reads_entropy: entropy of different read pairs
                #   elements[41] ~ mismatch_per_bp: Average mismatch per base.
                #   elements[42] ~ anchor_score: Anchor score.
                #   elements[43] ~ max_doner_fragment: Maximum doner fragment size.
                #   elements[44] ~ max_acceptor_fragment: Maximum acceptor fragment size.
                #   elements[45] ~ max_cur_fragment: Maximum total fragment length of doner and acceptor.
                #   elements[46] ~ min_cur_fragment: Minimum total fragment length of doner and acceptor
                #   elements[47] ~ ave_cur_fragment: Average total fragment length of doner and acceptor.
                #   elements[48] ~ doner_encompass_unique: Number of uniquely mapped reads surround the donor site of the fusion
                #   elements[49] ~ doner_encompass_multiple: Number of multiple mapped reads surround the donor site of the fusion
                #   elements[50] ~ acceptor_encompass_unique: Number of uniquely mapped reads surround the acceptor site of the fusion
                #   elements[51] ~ acceptor_encompass_multiple: Number of multiple mapped reads surround the acceptor site of the fusion
                #   elements[52] ~ doner_match_to_normal: If the fusion doner site is matched to a normal splice junction. 1 is matched, 0 is not matched
                #   elements[53] ~ acceptor_match_to_normal: If the fusion doner site is matched to a normal splice junction. 1 is matched, 0 is not matched
                #   elements[54] ~ doner_seq: The 25bp sequence at doner site matched to the fusion reads, doner end base included.
                #                  if doner strand is +, it is chrom1[donerEnd-24:donerEnd]
                #                  if doner strand  is -, it is revcomp(chrom1[donerEnd:donerEnd+24]))
                #   elements[55] ~ acceptor_seq: The 25bp sequence at acceptor site matched to the fusion reads, acceptor start base included.
                #                  if acceptor strand is +, it is chrom2[acceptorStart:acceptorStart+24]
                #                  if acceptor strand is -, it is revcomp(chrom2[acceptorStart-24:acceptorStart])
                #                  Note: due to our internal use purposes, the acceptor_seq is always reverse complemented again in MapSplice fusion junction file, which makes it effectively as the following. You can do a reverse completement to acceptor_seq to make the sequence exactly the same as it is mentioned above:
                #                  if acceptor strand is +, it is revcom(chrom2[acceptorStart:acceptorStart+24])
                #                  if acceptor strand is -, it is chrom2[acceptorStart-24:acceptorStart]
                #   elements[56] ~ match_gene_strand (only if --gene-gtf specified): If the fusion strand matched with the annotated gene strand. 1 is matched, 0 is not matched
                #   elements[57] ~ annotated_type (only if --gene-gtf specified): The source of fusion
                #                  from_fusion: The fusion is from fusion alignments
                #                  from_normal: The fusion is from normal alignments, which is normal junction cross two genes(read through fusions)
                #   elements[58] ~ fusion_type (only if --gene-gtf specified): The type of fusion based on the annotated gene
                #                  fusion: The start and end of the fusion is annotated to two distinct genes
                #                  normal: The start and end of the fusion is annotated to same gene(Circular RNAs)
                #                  intergenic: Either the start or end has no gene annotated
                #                  overlapped?
                #   elements[59] ~ gene_strand (only if --gene-gtf specified): The annotated genes strands, if there are
                # * elements[60] ~ annotated_gene_donor (only if --gene-gtf specified): The name of the gene annotated to the doner site of the fusion
                # * elements[61] ~ annotated_gene_acceptor (only if --gene-gtf specified): The name of the gene annotated to the acceptor site of the fusion
                fusion_gene = (elements[60].split(",")[0] + "_" +
                               elements[61].split(",")[0]).upper()

                # element[0] = chr num, [1/2] = breakpoints, [5] = strands
                up_gene_id = elements[0].split(
                    "~")[0] + ":" + elements[1] + ":" + elements[5][0]
                dn_gene_id = elements[0].split(
                    "~")[1] + ":" + elements[2] + ":" + elements[5][1]

                if up_gene_id.split(
                        ":")[0] not in self.chr_list or dn_gene_id.split(
                            ":")[0] not in self.chr_list:
                    continue
                fgid = fusion_gene.split(
                    "_")[0] + "_" + up_gene_id + "_" + fusion_gene.split(
                        "_")[1] + "_" + dn_gene_id

                fusion_map[fgid] = [
                    fusion_gene,  # fusion_gene
                    up_gene_id,  # up_gene_bp
                    dn_gene_id,  # dn_gene_bp
                    elements[4],  # junc_reads_num
                    elements[27],  # span_reads_num
                    self.sample_id,
                    "Mapsplice"
                ]
        return fusion_map

    # starchip - results file is "starchip.summary"
    def get_starchip_results(self):
        """Load and parse results from starchip"""
        starchip_predict = os.path.join(self.scratch_path, "fusion",
                                        "starchip", "starchip.summary")
        fusion_map = {}
        with open(starchip_predict, "r") as prediction:
            next(prediction)  # skip header line
            for line in prediction:
                elements = line.rstrip().split("\t")
                # Currently relevant fields (marked *) in the output file are:
                # * elements[0] ~ Partner1
                # * elements[1] ~ Partner2
                # * elements[2] ~ SpanningReads
                # * elements[3] ~ SplitReads
                #   elements[4] ~ AvgAS
                # * elements[5] ~ NearGene1
                #   elements[6] ~ Distance1
                # * elements[7] ~ NearGene2
                #   elements[8] ~ Distance2
                #   elements[9] ~ ConsensusSeq
                fusion_gene = "{0}_{1}".format(elements[5],
                                               elements[7]).upper()

                # check whether fusion gene is not on primary chr
                if elements[0].split(":")[0] not in self.chr_list or elements[
                        1].split(":")[0] not in self.chr_list:
                    continue
                fgid = elements[5] + "_" + elements[0] + "_" + elements[
                    7] + "_" + elements[1]

                fusion_map[fgid] = [
                    fusion_gene,  # fusion_gene
                    elements[0],  # up_gene_bp
                    elements[1],  # dn_gene_bp
                    elements[3],  # junc_reads_num
                    elements[2],  # span_reads_num
                    self.sample_id,
                    "Starchip"
                ]
        return fusion_map

    # infusion - results file is "fusions.detailed.txt"
    def get_infusion_results(self):
        """Load and parse results from starchip"""
        infusion_predict = os.path.join(self.scratch_path, "fusion",
                                        "infusion", "fusions.detailed.txt")
        fusion_map = {}
        with open(infusion_predict, "r") as prediction:
            next(prediction)  # skip header line
            for line in prediction:
                elements = line.rstrip().split("\t")
                # Currently relevant fields (marked *) in the output file are:
                #   elements[0] ~ id
                #   elements[1] ~ ref1
                #   elements[2] ~ break_pos1
                #   elements[3] ~ region1
                #   elements[4] ~ ref2
                #   elements[5] ~ break_pos2
                #   elements[6] ~ region2
                #   elements[7] ~ num_split
                #   elements[8] ~ num_paired
                #   elements[9] ~ num_split_with_pair
                #   elements[10] ~ num_split_rescued
                #   elements[11] ~ num_uniq_starts
                #   elements[12] ~ pap_rate
                #   elements[13] ~ mean_split_pos
                #   elements[14] ~ split_pos_std
                #   elements[15] ~ homogeneity
                #   elements[16] ~ coverage_context
                #   elements[17] ~ ssp
                #   elements[18] ~ fusion_class
                #   elements[19] ~ break_on_exon
                #   elements[20] ~ feature_1
                #   elements[21] ~ gene_1
                #   elements[22] ~ transcript_1
                #   elements[23] ~ gene_1_strand
                #   elements[24] ~ biotype_1
                #   elements[25] ~ expression_1
                #   elements[26] ~ feature_2
                #   elements[27] ~ gene_2
                #   elements[28] ~ transcript_2
                #   elements[29] ~ gene_2_strand
                #   elements[30] ~ biotype_2
                #   elements[31] ~ expression_2
                #   elements[32] ~ splice_motif
                #   elements[33] ~ filters
                fusion_gene = "{0}_{1}".format(
                    self.get_fusion_gene_id_infusion(elements[21],
                                                     elements[22]),
                    self.get_fusion_gene_id_infusion(elements[27],
                                                     elements[28])).upper()

                # element[1/4] = chr num, [2/5] = breakpoints, [23/29] = strands
                up_gene_id = elements[1] + ":" + elements[2] + ":" + elements[
                    23]
                dn_gene_id = elements[4] + ":" + elements[5] + ":" + elements[
                    29]

                # check whether fusion gene is not on primary chr
                if elements[1] not in self.chr_list or elements[
                        4] not in self.chr_list:
                    continue
                fgid = fusion_gene.split(
                    "_")[0] + "_" + up_gene_id + "_" + fusion_gene.split(
                        "_")[1] + "_" + dn_gene_id

                fusion_map[fgid] = [
                    fusion_gene,  # fusion_gene
                    up_gene_id,  # up_gene_bp
                    dn_gene_id,  # dn_gene_bp
                    elements[7],  # junc_reads_num
                    elements[8],  # span_reads_num
                    self.sample_id,
                    "Infusion"
                ]
        return fusion_map

    @staticmethod
    def get_fusion_gene_id_infusion(gene_id, transcript_list_field):
        """Helper method for infusion data parsing. Returns the most frequently listed fusion id in a list of fusion ids"""
        # if only 1 gene id is returned as fusion partner, return this
        if not ";" in gene_id:
            return gene_id.replace("_", "-")
        # if 2 or more gene ids are given, take the most frequent from the transcript list
        gid_dict = {}
        for gid in transcript_list_field.split(";"):
            id_test = gid.split(":")[0][:-4]
            if not id_test in gid_dict:
                gid_dict[id_test] = 0
            gid_dict[id_test] += 1

        best_hit = max(gid_dict.iteritems(), key=operator.itemgetter(1))[0]

        return best_hit.replace("_", "-")
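
    # Worked example (hypothetical ids, not real InFusion output) for
    # get_fusion_gene_id_infusion: with
    #   gene_id = "GENE1;GENE2"
    #   transcript_list_field = "GENE1-001:500;GENE1-002:300;GENE2-001:200"
    # the part after ":" is ignored and the last 4 characters of each transcript
    # id are trimmed ("GENE1-001" -> "GENE1"), so GENE1 is counted twice, GENE2
    # once, and "GENE1" would be returned (with any "_" replaced by "-").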

    # soapfuse - results file is "*.final.Fusion.specific.for.genes" in "final_fusion_genes"
    # urla - note: it seems like soapfuse prefixes chromosome ids with "chr" even if e.g. Ensembl data, which does not use this prefix, is supplied
    #              I'm replacing this here because this is our recommended dataset...
    def get_soapfuse_results(self):
        """Load and parse results from soapfuse"""
        soapfuse_predict = ""
        folder_to_scan = os.path.join(self.scratch_path, "fusion", "soapfuse",
                                      "final_fusion_genes")
        for filename in os.listdir(folder_to_scan):
            folder_path = os.path.join(folder_to_scan, filename)
            if os.path.isdir(folder_path):
                for res in os.listdir(folder_path):
                    if res.endswith(".final.Fusion.specific.for.genes"):
                        soapfuse_predict = os.path.join(folder_path, res)
        if not soapfuse_predict:
            soapfuse_predict = os.path.join(
                self.scratch_path, "fusion", "soapfuse", "final_fusion_genes",
                self.sample_id,
                self.sample_id + ".final.Fusion.specific.for.genes")
        fusion_map = {}
        with open(soapfuse_predict) as prediction:
            next(prediction)  # skip header line
            for line in prediction:
                elements = line.rstrip().split("\t")
                # Currently relevant fields (marked *) in the output file are:
                # * elements[0] ~ up_gene
                # * elements[1] ~ up_chr
                # * elements[2] ~ up_strand
                # * elements[3] ~ up_Genome_pos
                #   elements[4] ~ up_loc
                # * elements[5] ~ dw_gene
                # * elements[6] ~ dw_chr
                # * elements[7] ~ dw_strand
                # * elements[8] ~ dw_Genome_pos
                #   elements[9] ~ dw_loc
                # * elements[10] ~ Span_reads_num
                # * elements[11] ~ Junc_reads_num
                #   elements[12] ~ Fusion_Type
                #   elements[13] ~ down_fusion_part_frame-shift_or_not
                fusion_gene = (elements[0] + "_" + elements[5]).upper()

                # element[1/6] = chr num, [3/8] = breakpoints, [2/7] = strands
                up_gene_id = elements[1] + ":" + elements[3] + ":" + elements[2]
                dn_gene_id = elements[6] + ":" + elements[8] + ":" + elements[7]

                # check whether fusion gene is not on primary chr
                if elements[1] not in self.chr_list or elements[
                        6] not in self.chr_list:
                    continue
                fgid = fusion_gene.split(
                    "_")[0] + "_" + up_gene_id + "_" + fusion_gene.split(
                        "_")[1] + "_" + dn_gene_id

                fusion_map[fgid] = [
                    fusion_gene,  # fusion_gene
                    up_gene_id,  # up_gene_bp
                    dn_gene_id,  # dn_gene_bp
                    elements[11],  # junc_reads_num
                    elements[10],  # span_reads_num
                    self.sample_id,
                    "Soapfuse"
                ]
        return fusion_map

    # pizzly - results file is "kallizzy.json.txt"
    def get_pizzly_results(self):
        """Load and parse results from pizzly"""
        pizzly_predict = os.path.join(self.scratch_path, "fusion", "pizzly",
                                      "kallizzy.json.txt")
        fusion_map = {}
        with open(pizzly_predict, "r") as prediction:
            next(prediction)  # skip header line
            for line in prediction:
                elements = line.rstrip().split("\t")
                # Currently relevant fields (marked *) in the output file are:
                # * elements[0] ~ geneA.name
                #   elements[1] ~ geneA.id
                # * elements[2] ~ geneB.name
                #   elements[3] ~ geneB.id
                # * elements[4] ~ paircount
                # * elements[5] ~ splitcount
                #   elements[6] ~ transcripts.list

                # Pizzly is an overpredictor with a high false-positive rate in its current version
                # Therefore, only events with sufficient support (paircount + splitcount >= 3) are considered
                if int(elements[4]) + int(elements[5]) < 3:
                    continue
                fusion_gene = "{0}_{1}".format(elements[0],
                                               elements[2]).upper()

                # check whether fusion gene is not on primary chr - not possible for pizzly, as the exact breakpoints cannot be determined directly from the pizzly output
                #                    if elements[0].split(":")[0] not in self.chr_list or elements[1].split(":")[0] not in self.chr_list:
                #                        continue
                fgid = elements[0] + "_1:100:+_" + elements[2] + "_2:100:+"

                fusion_map[fgid] = [
                    fusion_gene,  # fusion_gene
                    "1:100:+",  # up_gene_bp
                    "2:100:+",  # dn_gene_bp
                    elements[5],  # junc_reads_num
                    elements[4],  # span_reads_num
                    self.sample_id,
                    "Pizzly"
                ]
        return fusion_map

    #####
    ### aggregation of the individual fusion tool parser outputs
    #
    # urla: writing to tool_state_path is essentially obsolete, as it is never used downstream of here
    def concatenate_fusion_results(self, tool_state_path, fusion_output_path):
        """Return tuple of (dict of results dicts, dict of errors) and writes error/pass to summary file"""
        with open(tool_state_path, "a") as outf:
            fusion_result_dict = {}  # stores the complete fusion map per tool
            # stores booleans indicating whether an exception was raised during tool execution
            results_with_errors_dict = {}

            self.logger.info("Processing " + self.sample_id)
            sample_string = self.sample_id
            for tool in self.tools:
                fusion_result_dict[tool], results_with_errors_dict[
                    tool] = self.get_tool_results(fusion_output_path, tool)

                if results_with_errors_dict[tool]:
                    sample_string += ";0"
                else:
                    sample_string += ";1"
            sample_string.rstrip(";")
            outf.write(sample_string + "\n")
        return (fusion_result_dict, results_with_errors_dict)
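
    # Sketch (hypothetical sample id and tool set): with self.tools set to
    # ["Fusioncatcher", "Starfusion", "Pizzly"] and a failed Pizzly parse, the
    # line appended to tool_state_path would read "sample1;1;1;0".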

    def get_tool_results(self, output_folder_path, tool):
        """Return tuple of (dict of results from individual fusion tools, error type)"""
        pred_res_dict = {}  # dictionary of prediction results
        self.logger.info("Parsing results for " + tool)
        try:
            if tool == "Fusioncatcher":
                pred_res_dict = self.get_fusioncatcher_results()
            elif tool == "Starfusion":
                pred_res_dict = self.get_starfusion_results()
            elif tool == "Mapsplice":
                pred_res_dict = self.get_mapsplice_results()
            elif tool == "Starchip":
                pred_res_dict = self.get_starchip_results()
            elif tool == "Infusion":
                pred_res_dict = self.get_infusion_results()
            elif tool == "Soapfuse":
                pred_res_dict = self.get_soapfuse_results()
            elif tool == "Pizzly":
                pred_res_dict = self.get_pizzly_results()
        # pylint exceptions:
        #        the caught exception is not further specified as several different exceptions may be raised during processing
        #        the type of exception is, however, unimportant for further processing, because any exception must be manually reviewed
        except Exception as ukn_err:  # pylint: disable=W0703
            self.logger.error(
                "Couldn't fetch results from {0}, please check data in {1}. Error message: {2}"
                .format(tool, output_folder_path, ukn_err))
            return (pred_res_dict, True)

        tool_res_file = os.path.join(output_folder_path, tool + "_res.csv")
        with open(tool_res_file, "w") as tool_outf:
            tool_outf.write(
                "fgid;fusion_gene;breakpoint1;breakpoint2;junc_reads;span_reads;sample_id;tool\n"
            )
            for key in pred_res_dict:
                tool_outf.write(key + ";" + ";".join(pred_res_dict[key]) +
                                "\n")
        return (pred_res_dict, False)

    # pylint: enable=W0703

    # urla: naming is terribly complicated to understand and follow, changed every single var name!
    #       todo: the method seems to be very complicated for a relatively simple task => is there a more simple solution?
    # @param dict_of_fusion_results: This is a dictionary of dictionaries.
    #        For each fusion prediction tool, a dictionary is created with keys=fuid
    #        and value=fusion_info; these are themselves organised in a dict
    #        with keys=fusion_tool and values=fusion_tool_dict
    def lookup_fusions_in_prediction(self, dict_of_fusion_dicts):
        """UKN"""
        # @param dict_of_fusionid_lists: This is a dictionary of lists.
        #        For each fusion tool (=keys of the dict) it contains a
        #        list (=value of the dict) the keys from the respective
        #        fusion tool dict
        dict_of_fusionid_lists = {}

        # for every tool in the results dict
        for fusion_tool in dict_of_fusion_dicts:
            # if no fusions were predicted by a tool, set an empty list
            # urla: is this actually required? it would anyway become an empty
            #       list in the for loop, wouldn't it?
            if len(dict_of_fusion_dicts[fusion_tool]) == 0:  # this is a "number of elements" check => pylint: disable=C1801
                dict_of_fusionid_lists[fusion_tool] = []

            # for each fusion id (=keys of the respective fusion dict),
            # append them to a list in the new dict
            # urla: todo: the exception should only be raised, if the list is
            #             not existing (remove generalization)
            #       todo2: why not initializing an empty list for all in advance?
            #              i.e.: putting "dict_of_fusionid_lists[fusion_tool] = []"
            #              at the start of the for loop (would also eliminate the if len(...))
            for fusion_id in dict_of_fusion_dicts[fusion_tool]:
                try:
                    dict_of_fusionid_lists[fusion_tool].append(fusion_id)
                except KeyError:
                    print(
                        "Error when trying to append to list of tool {0}. Trying to create new list with {1} at start"
                        .format(fusion_tool, fusion_id))
                    dict_of_fusionid_lists[fusion_tool] = [fusion_id]

        # create a list of unique fusion ids
        list_of_all_fusion_ids = []
        for tool in dict_of_fusionid_lists:
            list_of_all_fusion_ids += dict_of_fusionid_lists[tool]
        list_of_unique_fusion_ids = list(set(list_of_all_fusion_ids))

        # @param dict_of_found_uniq_fusions: A dictionary of lists, where keys
        #        are the unique fusion ids and values a list of booleans indicating
        #        whether or not the fusion was found in a fusion prediction tool
        dict_of_found_uniq_fusions = {}
        for uniq_fusion_id in list_of_unique_fusion_ids:
            # split the fusion id into [0]=gene1, [1]=breakpoint of gene1,
            # [2]=gene2, [3]=breakpoint of gene2
            uniq_fusion_id_split = uniq_fusion_id.split("_")

            list_of_found_fusion_booleans = []
            # for each fusion tool
            for fusion_tool in dict_of_fusion_dicts:
                found_fusion = False
                # for each fusion id in the list of fusions per tool
                for fusion_id in dict_of_fusionid_lists[fusion_tool]:
                    fusion_id_split = fusion_id.split("_")
                    # check if gene1 and gene2 of the unique fusion which is being tested
                    # is present in the current fusion (independent of the orientation)
                    # urla: this will lead to false positive results:
                    #       eg: fusion with gene1 "AB1" and gene2 "AB2" will match to fusion "AB11"-"AB22"
                    #if uniq_fusion_id_split[0] in fusion_id_split and uniq_fusion_id_split[2] in fusion_id_split:
                    # urla: possible solution:
                    if ((uniq_fusion_id_split[0] == fusion_id_split[0]
                         and uniq_fusion_id_split[2] == fusion_id_split[2]) or
                        (uniq_fusion_id_split[0] == fusion_id_split[2]
                         and uniq_fusion_id_split[2] == fusion_id_split[0])):
                        found_fusion = True
                        break  # we don't need to look further if it was found at least once
                list_of_found_fusion_booleans.append(found_fusion)

            if sum(list_of_found_fusion_booleans) >= self.tool_num_cutoff:
                dict_of_found_uniq_fusions[
                    uniq_fusion_id] = list_of_found_fusion_booleans
        return dict_of_found_uniq_fusions
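
    # Worked example (hypothetical data): with self.tool_num_cutoff = 2 and
    #   dict_of_fusion_dicts = {
    #       "Starfusion":    {"BCR_22:100:+_ABL1_9:200:+": [...]},
    #       "Fusioncatcher": {"ABL1_9:200:+_BCR_22:100:+": [...]},
    #       "Pizzly":        {},
    #   }
    # each of the two ids is matched in both tools (the comparison also accepts
    # the swapped partner order), giving the boolean list [True, True, False];
    # its sum of 2 reaches the cutoff, so both equivalent ids are returned as
    # keys of dict_of_found_uniq_fusions.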

    def run(self):
        """ asd """
        tool_state_path = os.path.join(self.fusion_output_path,
                                       "tool_state.csv")
        with open(tool_state_path, "w") as tool_state:
            tool_state.write("Sample ID")
            for tool in self.tools:
                tool_state.write(", {}".format(tool))
            tool_state.write("\n")

        detected_fusions_file = os.path.join(self.fusion_output_path,
                                             "Detected_Fusions.csv")
        with open(detected_fusions_file, "w") as fus_file:
            # write header
            fus_file.write(
                "FGID;Fusion_Gene;Breakpoint1;Breakpoint2;Junction_Reads;Spanning_Reads;Sample;Tool\n"
            )
            count_fusions = 0
            self.logger.debug("Generating Detected Fusions table")

            fusion_result_dict, results_with_errors_dict = self.concatenate_fusion_results(
                tool_state_path, self.fusion_output_path)
            self.logger.debug("Got results from {} tools".format(
                len(fusion_result_dict)))
            if sum(results_with_errors_dict.values()) == len(
                    fusion_result_dict):
                self.logger.error(
                    "Fusion parsing failed completely. Revision required. Aborting."
                )
                sys.exit(1)
            elif sum(results_with_errors_dict.values()) != 0:
                self.logger.error(
                    "Results incomplete. Please make sure that all tools have run completely on dataset."
                )

            dict_of_boolean_list_of_found_uniq_fusions = self.lookup_fusions_in_prediction(
                fusion_result_dict)  # this is snake case, although too long... pylint: disable=C0103
            # for each unique fusion
            for uniq_fusion_id in dict_of_boolean_list_of_found_uniq_fusions:
                # for each fusion tool
                for fusion_tool_num_in_list, fusion_tool in enumerate(
                        fusion_result_dict, 0):
                    # if the unique fusion was found in a fusion tool
                    if dict_of_boolean_list_of_found_uniq_fusions[
                            uniq_fusion_id][fusion_tool_num_in_list]:
                        # for each fusion id of fusion tool X
                        for fusion_id in fusion_result_dict[fusion_tool]:
                            # if the fusion id of the tool matches the unique fusion id, write everything to file
                            # urla: it is probably better to iterate over all fusions, instead of using "get" on the dict,
                            #       because a fusion gene can be called more than once with slightly different breakpoints
                            if fusion_id == uniq_fusion_id:
                                count_fusions += 1
                                fus_file.write(
                                    uniq_fusion_id + ";" +
                                    ";".join(fusion_result_dict[fusion_tool]
                                             [fusion_id]) + "\n")

        self.logger.info("Wrote {0} detected fusion genes to {1}.".format(
            count_fusions, detected_fusions_file))
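
# A minimal sketch (hypothetical values) of a Detected_Fusions.csv written by run():
#   FGID;Fusion_Gene;Breakpoint1;Breakpoint2;Junction_Reads;Spanning_Reads;Sample;Tool
#   BCR_22:23632600:+_ABL1_9:133729451:+;BCR_ABL1;22:23632600:+;9:133729451:+;12;8;sample1;Fusioncatcher
#   BCR_22:23632600:+_ABL1_9:133729451:+;BCR_ABL1;22:23632600:+;9:133729451:+;10;7;sample1;Starfusion

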
def eval_patch_shuffle(model,
                       dataset_builder,
                       max_num_devide: int,
                       num_samples: int,
                       batch_size: int,
                       num_workers: int,
                       top_k: int,
                       log_dir: str,
                       log_params: dict = {},
                       suffix: str = '',
                       shuffle: bool = False,
                       **kwargs):
    """
    Args
    - model: NN model
    - dataset_builder: DatasetBuilder class object
    - max_num_devide: max number of division
    - num_samples: number of sample to use. if -1, all samples are used
    - batch_size: size of batch
    - num_workers: number of workers
    - top_k: use top_k accuracy
    - log_dir: log directory
    - log_params: params which is logged in dataframe. these params are useful for legend.
    - suffix: suffix of log
    - shuffle: shuffle data
    """
    assert max_num_devide >= 1
    assert num_samples >= 1 or num_samples == -1
    assert batch_size >= 1
    assert num_workers >= 1
    assert top_k >= 1

    log_path = os.path.join(log_dir,
                            'patch_shuffle_result' + suffix + '.csv')
    logger = Logger(path=log_path, mode='test')

    # log params
    # logger.log(log_params)

    acc_dict = {}
    images_list = []

    for num_devide in tqdm.tqdm(range(1, max_num_devide + 1)):
        log_dict = collections.OrderedDict()

        # build Patch Shuffled dataset
        patch_shuffle_transform = PatchShuffle(num_devide, num_devide)
        dataset = dataset_builder(train=False,
                                  normalize=True,
                                  optional_transform=[patch_shuffle_transform])
        if num_samples != -1:
            num_samples = min(num_samples, len(dataset))
            indices = list(range(num_samples))
            dataset = torch.utils.data.Subset(dataset, indices)
        loader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=shuffle,
                                             num_workers=num_workers,
                                             pin_memory=True)

        with torch.autograd.no_grad():
            num_correct = 0.0
            for i, (x, t) in enumerate(loader):
                model.eval()
                x = x.to('cuda', non_blocking=True)
                t = t.to('cuda', non_blocking=True)

                model.zero_grad()
                logit = model(x)
                num_correct += get_num_correct(logit, t, topk=top_k)

                if i == 0:
                    # keep one example image from the first batch for later visualization
                    # (assumes batch_size > 10 so that index 10 exists)
                    images_list.append(x[10])

        acc = num_correct / float(len(dataset))
        key = str(num_devide)
        acc_dict[key] = acc

        log_dict['num_devide'] = num_devide
        log_dict['accuracy'] = acc
        logger.log(log_dict)
        print(acc_dict)

    # save data
    torch.save(
        acc_dict,
        os.path.join(log_dir, 'patch_shuffle_acc_dict' + suffix + '.pth'))
    torchvision.utils.save_image(torch.stack(images_list, dim=0),
                                 os.path.join(
                                     log_dir,
                                     'example_images' + suffix + '.png'),
                                 nrow=max_num_devide)
    plot(csv_path=log_path,
         x='num_devide',
         y='accuracy',
         hue=None,
         log_path=os.path.join(log_dir, 'plot.png'),
         save=True)
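
# Hypothetical usage sketch: the model and dataset builder construction below is
# an assumption that depends on the surrounding project (resnet56 and the
# DatasetBuilder arguments are illustrative, not code from this module):
#
#   model = resnet56(num_classes=10).to('cuda')
#   builder = DatasetBuilder(name='cifar10', root_path='./data')
#   eval_patch_shuffle(model, builder,
#                      max_num_devide=8, num_samples=1024, batch_size=128,
#                      num_workers=4, top_k=1, log_dir='./logs', suffix='_cifar10')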