def __init__(self, bam, output, context_file):
    """Parameter initialization"""
    self.bam_file = bam
    self.in_bam = pysam.AlignmentFile(bam, "rb")
    self.filtered_bam = pysam.AlignmentFile(output, "wb", template=self.in_bam)
    # a list of ints is used to count pairs assigned to different filters
    # 0: count_input_alignments
    # 1: count_input_pairs
    # 2: count_filtered_pairs
    # 3: count_multimapped
    # 4: count_star_chimeric_alignments
    # 5: count_qcd_alignments
    # 6: count_unmapped
    # 7: count_10bp_s_clip
    # 8: count_proper_pair
    # 9: count_not_filtered_but_in_fusion_gene
    self.counter = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    self.last_time = 0
    self.logger = Logger("{}.fusionReadFilterLog".format(output))
    self.coord_dict = {}
    self.get_ranges(context_file)
    print(self.coord_dict)
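# The index-to-name mapping documented above is easy to get out of sync as filters are
# added; a minimal, self-contained sketch of an equivalent name-based counter is shown
# below. Only the category names are taken from the snippet above; the dict/Counter
# approach itself is an assumption, not part of the original class.
from collections import Counter

FILTER_CATEGORIES = (
    "input_alignments", "input_pairs", "filtered_pairs", "multimapped",
    "star_chimeric_alignments", "qcd_alignments", "unmapped",
    "10bp_s_clip", "proper_pair", "not_filtered_but_in_fusion_gene",
)

counter = Counter({name: 0 for name in FILTER_CATEGORIES})
counter["input_alignments"] += 1    # equivalent of self.counter[0] += 1
print(counter["input_alignments"])  # -> 1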
def __init__(self, ftype):
    self.__ftype = ftype
    self.__redis_link = self.__redis_connect()
    mylog = Logger(
        os.path.join(os.path.abspath(os.path.curdir), 'misc/spider_log.yaml'))
    self.__logger = mylog.outputLog()
def main():
    mylog = Logger(
        os.path.join(os.path.abspath(os.path.curdir), 'misc/spider_log.yaml'))
    logger = mylog.outputLog()
    year = input("请输入年份:")  # prompt: "Please enter the year:"
    allurl = get_links(year)
    downurl(allurl, logger)
def main():
    print("欢迎使用 美剧天堂 爬取脚本")  # "Welcome to the meijutt.com scraping script"
    print("=" * 20)
    # category codes: 1 fantasy/sci-fi, 2 supernatural/thriller, 3 urban/romance,
    # 4 crime/history, 5 talent/variety shows, 6 anime/cartoon
    print("魔幻/科幻:1\n灵异/惊悚:2\n都市/感情:3\n犯罪/历史:4\n选秀/综艺:5\n动漫/卡通:6")
    print("=" * 20)
    ftype = input('请输入需要爬取的类型的代号:')  # prompt: "Enter the code of the category to crawl:"
    start_url = "http://www.meijutt.com/file/list%s.html" % ftype
    ourl = openurl.OpenUrl(start_url, 'gb2312')
    code, doc = ourl.openurl()
    mylog = Logger(
        os.path.join(os.path.abspath(os.path.curdir), 'misc/spider_log.yaml'))
    logger = mylog.outputLog()
    if code == 200:
        selecter = etree.HTML(doc)
        pages = selecter.xpath(
            "//div[@class='page']/span/text()")[0].split()[0].split('/')[1]
        firstpage_links = selecter.xpath("//a[@class='B font_14']/@href")
        for firstpage_link in firstpage_links:
            name, download_links = get_downlink(firstpage_link)
            send_mysql(name, download_links, logger)
            time.sleep(0.5)
        for page in range(2, int(pages)):
            page_url = 'http://www.meijutt.com/file/list%s_%s.html' % (ftype, page)
            for link in page_link(page_url):
                name, download_links = get_downlink(link)
                if name != '' and download_links != '':
                    send_mysql(name, download_links, logger)
                    time.sleep(0.5)
    else:
        print("[%s] error..." % start_url)
    print("Done.")
def __init__(self):
    self.__redis_link = self.__redis_connect()
    mylog = Logger(
        os.path.join(os.path.abspath(os.path.curdir), 'misc/spider_log.yaml'))
    self.__logger = mylog.outputLog()
    self.mysql_connect = mysql_connect.MysqlConnect(
        os.path.join(os.path.abspath(os.path.curdir), 'misc/mysql_data.yaml'))
    self.main_url = 'http://www.hanfan.cc/'
def __init__(self, scratch_path, fusion_output_path, sample_id,
             tool_num_cutoff, fusiontool_list, sample_log):
    """Parameter initiation and work folder creation."""
    self.scratch_path = scratch_path
    self.fusion_output_path = fusion_output_path
    self.sample_id = sample_id
    self.tool_num_cutoff = int(tool_num_cutoff)
    # urla: if we want to be more generic and allow different annotations, identification of the chr names
    #       (eg "chr1" vs "1" and "chrM" vs "MT") should be performed in advance
    self.chr_list = (
        "1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
        "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
        "21", "22", "X", "Y", "MT")
    self.tools = fusiontool_list.split(",")
    self.logger = Logger(sample_log)
class Fusionreadfilter(object):
    """Select alignments belonging to putative fusions from an s/bam file"""

    def __init__(self, bam, output):
        """Parameter initialization"""
        self.bam_file = bam
        self.in_bam = pysam.AlignmentFile(bam, "rb")
        self.filtered_bam = pysam.AlignmentFile(output, "wb", template=self.in_bam)
        # a list of ints is used to count pairs assigned to different filters
        # 0: count_input_alignments
        # 1: count_input_pairs
        # 2: count_filtered_pairs
        # 3: count_multimapped
        # 4: count_star_chimeric_alignments
        # 5: count_qcd_alignments
        # 6: count_unmapped
        # 7: count_10bp_s_clip
        # 8: count_proper_pair
        self.counter = [0, 0, 0, 0, 0, 0, 0, 0, 0]
        self.last_time = 0
        self.logger = Logger("{}.fusionReadFilterLog".format(output))

    def classify_pair(self, read1, read2, read1_flag, read2_flag, member_count):
        """Classify a read pair into different groups"""
        self.counter[1] += 1
        # count star chimeric
        if member_count > 2:
            self.counter[4] += 1
            return True
        # count multimapping reads
        if read1.get_tag("NH") > 1 or read2.get_tag("NH") > 1:
            self.counter[3] += 1
            return False
        # count pairs containing unmapped mates
        if read1_flag & 0x4 or read2_flag & 0x4:
            self.counter[6] += 1
            return True
        # check if the pair is a "proper pair" (both mapped, adequate distance)
        if not read1_flag & 0x2 or not read2_flag & 0x2:
            self.counter[8] += 1
            return True
        # check if 10 (default) or more soft-clippings (S) are in the alignment
        if (read1.query_alignment_length + 10 <= read1.query_length
                or read2.query_alignment_length + 10 <= read2.query_length):
            self.counter[7] += 1
            return True
        return False

    def run(self):
        """Walk linewise through a s/bam file and send proper read pairs to classification"""
        self.logger.info("Starting fusion read filtering")
        read1 = read2 = None
        read_flag = read1_flag = read2_flag = None
        self.last_time = time.time()
        last_query = ""
        count_current_query_member = 0
        # read filtering works as follows:
        # urla: I wrote the processing in a way that implementing the analysis of multi-mapping reads should be straightforward.
        #       For this, however, one would need to check in the group of multi-mapping alignments whether there is no possible "normal mapping".
        #       I could so far think of two possible solutions: (1) running "classify_pair" on all possible combinations of the multimapping pairs,
        #       until a "normal mapping" was found. If this is never the case, the pair is written to the filtered output. This, however, will have
        #       a strong effect on runtimes! (2) with the chimeric multi-mapping setting in the most recent star versions. This should actually
        #       work directly as star chimeric classification outranks multi-mapping disposal. It has, however, not been thoroughly evaluated by the star community.
        # iterate through alignments as they appear in the file (this is mandatory because (a) we cannot create an index,
        # (b) we want to include unmapped reads and (c) we have many references in the header during 2nd filtering)
        for read in self.in_bam.fetch(until_eof=True):
            self.counter[0] += 1
            read_flag = read.flag
            if last_query != read.query_name and self.counter[0] > 1:
                if self.classify_pair(read1, read2, read1_flag, read2_flag, count_current_query_member):
                    self.filtered_bam.write(read1)
                    self.filtered_bam.write(read2)
                    self.counter[2] += 1
                read1 = read2 = None
                read1_flag = read2_flag = None
                count_current_query_member = 0
            count_current_query_member += 1
            # ignore all alignments which are either supplemental, vendor qc'd, secondary or duplicates
            # urla: duplicate flagging/removal prior to running this script would most probably lead to several errors.
            #       Nevertheless, it is a very fast check and I would also not recommend that someone does deduplication on rna-seq data!
            if read_flag > 255:
                self.counter[5] += 1
            else:
                # urla: the following commented version should be fine, but the other one is still kept in order to be more error-aware
                # if read.is_read1:
                #     read1 = read
                # else:
                #     read2 = read
                if not read1 and read.is_read1:
                    read1 = read
                    read1_flag = read_flag
                elif not read2 and read.is_read2:
                    read2 = read
                    read2_flag = read_flag
                else:
                    self.logger.error(
                        "Neither r1 nor r2??? Read: {0}; R1: {1}; R2: {2}; bamLine: {3}"
                        .format(read, read1, read2, self.counter[0]))
                    sys.exit(1)
            # urla: uncomment the following, if you'd like to have stats updates during the run and not only at the end
            # if count_input_alignments % 1000000 == 0:
            #     self.print_stats(count_input_alignments)
            last_query = read.query_name
        # once EOF is reached, the very last pair has to be classified additionally
        if self.classify_pair(read1, read2, read1_flag, read2_flag, count_current_query_member):
            self.filtered_bam.write(read1)
            self.filtered_bam.write(read2)
            self.counter[2] += 1
        # close reading/writing streams
        self.in_bam.close()
        self.filtered_bam.close()
        self.print_stats()
        self.logger.info("Finished fusion read filtering")

    def print_stats(self):
        """Print collected statistics to the log file"""
        this_time = time.time()
        time_taken = this_time - self.last_time
        time_taken_1m = float(time_taken * 1000000) / float(self.counter[0])
        self.last_time = this_time
        self.logger.info(
            "Processed {0} alignments, {1} of {2} pairs remained after filtering ({3:.2%}) ({4:.2f}s / 1M alignments; {5:.2f}s in total)"
            .format(self.counter[0], self.counter[2], self.counter[1],
                    float(self.counter[2]) / float(self.counter[1]), time_taken_1m, time_taken))
        qc1 = False
        if self.get_input_read_count_from_star() == self.counter[1]:
            qc1 = True
        qc2 = False
        if (self.counter[4] == self.counter[5]) and ((self.counter[0] - self.counter[5]) * 0.5 == self.counter[1]):
            qc2 = True
        # 0: count_input_alignments
        # 1: count_input_pairs
        # 2: count_filtered_pairs
        # 3: count_multimapped
        # 4: count_star_chimeric_alignments
        # 5: count_qcd_alignments
        # 6: count_unmapped
        # 7: count_10bp_s_clip
        # 8: count_proper_pair
        self.logger.info("Star_chimeric (chim alignment from star):\t{} pairs (filtered)".format(self.counter[4]))
        self.logger.info("QC'd (additional Star_chimeric alignment):\t{} alignments (included in above)".format(self.counter[5]))
        self.logger.info("Multimapped (1 < x <= 100 equal mappings):\t{} pairs (discarded)".format(self.counter[3]))
        self.logger.info("Unmapped (no mapping or >100 multi map):\t{} pairs (filtered)".format(self.counter[6]))
        self.logger.info("No proper pair (unexpected read distance):\t{} pairs (filtered)".format(self.counter[8]))
        self.logger.info("10bp_s_clip (>9bp soft-clipped in cigar):\t{} pairs (filtered)".format(self.counter[7]))
        self.logger.info("Unlikely chimeric (\"normal\" mappings): \t{} pairs (discarded)".format(
            self.counter[1] - self.counter[4] - self.counter[3] - self.counter[6] - self.counter[8] - self.counter[7]))
        self.logger.info("Filter QC1 (fq reads = bam alignments):\t{}".format(qc1))
        self.logger.info("Filter QC2 (QC'd alignments are chimeric):\t{}".format(qc2))

    def get_input_read_count_from_star(self):
        """Parse a star output log file to get input read counts from the fastq origin"""
        # rstrip strips a character set; this works here because the sample prefix ends with "_"
        log_file = "{}Log.final.out".format(self.bam_file.rstrip("Aligned.out.bam"))
        with open(log_file, "r") as star_log:
            for line in star_log:
                if line.split("|")[0].strip() == "Number of input reads":
                    return int(line.split("|")[1].strip())
        return -1
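# Elsewhere in the pipeline the filter is invoked as a script with "--input" and
# "--output" flags (see cmd_read_filter in the Processing class below). A minimal,
# assumed entry point wrapping Fusionreadfilter could look like this; the flag names
# mirror that command line, everything else is an illustrative sketch rather than the
# project's actual main().
import argparse


def fusion_read_filter_cli():
    parser = argparse.ArgumentParser(description="Filter putative fusion read pairs from a STAR BAM file")
    parser.add_argument("--input", required=True, help="unsorted STAR alignment BAM")
    parser.add_argument("--output", required=True, help="filtered output BAM")
    args = parser.parse_args()
    Fusionreadfilter(args.input, args.output).run()


if __name__ == "__main__":
    fusion_read_filter_cli()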
parser = argparse.ArgumentParser(description='GAN without MI')
parser.add_argument('--config', type=str, default='./configs/spiral_mine.yml',
                    help='Path to config file')
opts = parser.parse_args()
params = get_config(opts.config)
print(params)

train_loader, val_loader = spiral_dataloader(params)

if params['use_mine']:
    model = GAN_MI(params)
else:
    model = GAN(params)
if params['use_cuda']:
    model = model.cuda()

logger = Logger(params['logs'])
exp_logs = params['logs'] + params['exp_name'] + '_' + timestamp + '/'
exp_results = params['results'] + params['exp_name'] + '_' + timestamp + '/'
mkdir_p(exp_logs)
mkdir_p(exp_results)

if params['use_mine']:
    gan_trainer = GANTrainerMI(model, params, train_loader, val_loader,
                               logger, exp_results, exp_logs)
else:
    gan_trainer = GANTrainerVanilla(model, params, train_loader, val_loader,
                                    logger, exp_results, exp_logs)
gan_trainer.train()
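# The script above reads its settings from a YAML file via get_config(); a stand-in for
# the parsed result, covering only the keys the snippet actually accesses, is sketched
# below. All values (and the idea that get_config returns a plain dict) are assumptions
# for illustration.
params = {
    "use_mine": True,         # choose GAN_MI instead of the vanilla GAN
    "use_cuda": False,        # move the model to the GPU when available
    "logs": "./logs/",        # base directory handed to Logger and the per-run log folder
    "results": "./results/",  # base directory for per-run results
    "exp_name": "spiral",     # combined with the timestamp to name the run folders
}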
class Processing(object):
    """Run, monitor and schedule fastq processing for fusion gene prediction"""

    def __init__(self, cmd, input_paths, working_dir):
        """Parameter initiation and work folder creation. Start of progress logging."""
        self.working_dir = os.path.abspath(working_dir)
        self.logger = Logger(os.path.join(self.working_dir, "easyfuse_processing.log"))
        IOMethods.create_folder(self.working_dir, self.logger)
        copy(os.path.join(cfg.module_dir, "config.py"), working_dir)
        self.logger.info("Starting easyfuse: CMD - {}".format(cmd))
        self.input_paths = [os.path.abspath(file) for file in input_paths]
        self.samples = SamplesDB(os.path.join(self.working_dir, "samples.db"))

    # The run method simply greps and organises fastq input files.
    # Fastq pairs (single end input is currently not supported) are then sent to "execute_pipeline".
    def run(self, tool_num_cutoff):
        """General parameter setting, identification of fastq files and initiation of processing"""
        self.logger.info("Pipeline Version: {}".format(cfg.version))
        # Checking dependencies
        #VersCont(os.path.join(cfg.module_dir, "dependency_versions.txt")).get_and_print_tool_versions()
        #self.cfg.run_self_test()
        # urla: organism is currently not used for anything, however, this might change; is mouse processing relevant at some point?
        ref_genome = cfg.ref_genome_build
        ref_trans = cfg.ref_trans_version
        self.logger.info("Reference Genome: {0}, Reference Transcriptome: {1}".format(ref_genome, ref_trans))
        # if self.overwrite:
        #     self.logger.info("#############################################################################")
        #     self.logger.info("")
        #     self.logger.info("Overwrite flag is set => all previously existing results may be overwritten!")
        #     self.logger.info("")
        #     self.logger.info("#############################################################################")
        sample_list = []
        # get fastq files
        left, right, sample_id = IOMethods.get_fastq_files(self.input_paths, self.logger)
        sample_list = sample_id
        for i, _ in enumerate(left):
            if len(left) == len(right):
                self.logger.info("Processing Sample ID: {} (paired end)".format(sample_id[i]))
                self.logger.info("Sample 1: {}".format(left[i]))
                self.logger.info("Sample 2: {}".format(right[i]))
                self.execute_pipeline(left[i], right[i], sample_id[i], ref_genome, ref_trans, tool_num_cutoff)
        # summarize all data if selected
        if "Summary" in cfg.tools:
            #dependency = [Queueing.get_jobs_by_name("Fetchdata-{}".format(sample)) for sample in sample_list]
            # urla - note: would be happy to get the dependencies with a stacked LC, but it is atm too complicated for me ^^
            dependency = []
            for sample in sample_list:
                dependency.extend(Queueing.get_jobs_by_name("Fetchdata-{}".format(sample), cfg.queueing_system))
            modelling_string = ""
            if cfg.other_files["easyfuse_model"]:
                modelling_string = " --model_predictions"
            cmd_summarize = "python {0} --input {1}{2}".format(
                os.path.join(cfg.module_dir, "summarize_data.py"), self.working_dir, modelling_string)
            self.logger.debug("Submitting slurm job: CMD - {0}; PATH - {1}; DEPS - {2}".format(
                cmd_summarize, self.working_dir, dependency))
            cpu = cfg.resources["summary"]["cpu"]
            mem = cfg.resources["summary"]["mem"]
            self.submit_job("-".join([cfg.pipeline_name, "Summary", str(int(round(time.time())))]),
                            cmd_summarize, cpu, mem, self.working_dir, dependency, cfg.receiver)

    # Per sample, define input parameters and execution commands, create a folder tree and submit runs to slurm
    def execute_pipeline(self, fq1, fq2, sample_id, ref_genome, ref_trans, tool_num_cutoff):
        """Create sample specific subfolder structure and run tools on fastq files"""
        self.samples.add_sample(sample_id, "NA", fq1, fq2)

        refs = cfg.references
        # Genome/Gene references to use
        genome_sizes_path = refs["genome_sizes"]
        genome_chrs_path = refs["genome_fastadir"]
        genes_fasta_path = refs["genes_fasta"]
        genes_gtf_path = refs["genes_gtf"]

        # Paths to specific indices
        indices = cfg.indices
        other_files = cfg.other_files

        bowtie_index_path = indices["bowtie"]
        star_index_path = indices["star"]
        # kallisto_index_path = indices["kallisto"]
        # pizzly_cache_path = "{}.pizzlyCache.txt".format(genes_gtf_path)
        starfusion_index_path = indices["starfusion"]
        fusioncatcher_index_path = indices["fusioncatcher"]
        infusion_cfg_path = other_files["infusion_cfg"]
        # starchip_param_path = other_files["starchip_param"]

        # Output results folder creation - currently included:
        # 1) Gene/Isoform expression: kallisto, star
        # 2) Fusion prediction: mapsplice, pizzly, fusioncatcher, star-fusion, starchip, infusion
        output_results_path = os.path.join(self.working_dir, "Sample_{}".format(sample_id))
        qc_path = os.path.join(output_results_path, "qc")
        skewer_path = os.path.join(qc_path, "skewer")
        qc_table_path = os.path.join(qc_path, "qc_table.txt")
        overrepresented_path = os.path.join(qc_path, "overrepresented.fa")
        filtered_reads_path = os.path.join(output_results_path, "filtered_reads")
        expression_path = os.path.join(output_results_path, "expression")
        # kallisto_path = os.path.join(expression_path, "kallisto")
        star_path = os.path.join(expression_path, "star")
        fusion_path = os.path.join(output_results_path, "fusion")
        mapsplice_path = os.path.join(fusion_path, "mapsplice")
        pizzly_path = os.path.join(fusion_path, "pizzly")
        fusioncatcher_path = os.path.join(fusion_path, "fusioncatcher")
        starfusion_path = os.path.join(fusion_path, "starfusion")
        starchip_path = os.path.join(fusion_path, "starchip")
        infusion_path = os.path.join(fusion_path, "infusion")
        soapfuse_path = os.path.join(fusion_path, "soapfuse")
        fetchdata_path = os.path.join(self.working_dir, "Sample_{}".format(sample_id), "fetchdata")
        fastqc_1 = os.path.join(qc_path, os.path.basename(fq1).rstrip(".fastq.gz") + "_fastqc", "fastqc_data.txt")
        fastqc_2 = os.path.join(qc_path, os.path.basename(fq2).rstrip(".fastq.gz") + "_fastqc", "fastqc_data.txt")

        for folder in [
                output_results_path, qc_path, skewer_path, filtered_reads_path,
                expression_path,
                # kallisto_path,
                star_path, fusion_path, mapsplice_path,
                # pizzly_path,
                fusioncatcher_path, starfusion_path,
                # starchip_path,
                infusion_path, soapfuse_path, fetchdata_path
        ]:
            IOMethods.create_folder(folder, self.logger)

        # get a list of tools from the samples.db file that have been run previously on this sample
        state_tools = self.samples.get_tool_list_from_state(sample_id)
        # get a list of tools from the config file which shall be run on this sample
        tools = cfg.tools
        cmds = cfg.commands
        module_dir = cfg.module_dir

        # Define cmd strings for each program
        # urla: mapsplice requires gunzip'd read files and process substitutions don't seem to work in slurm scripts...
        #       process substitutions do somehow not work from this script - c/p the command line to the terminal, however, works w/o issues?!
        cmd_fastqc = "{0} --nogroup --extract -t 6 -o {1} {2} {3}".format(cmds["fastqc"], qc_path, fq1, fq2)
        cmd_qc_parser = "{0} -i {1} {2} -o {3}".format(
            os.path.join(module_dir, "misc", "qc_parser.py"), fastqc_1, fastqc_2, qc_table_path)
        cmd_skewer = "{0} -q {1} -i {2} {3} -o {4}".format(
            os.path.join(module_dir, "tool_wrapper", "skewer_wrapper.py"), qc_table_path, fq1, fq2, skewer_path)

        fq0 = ""
        if "QC" in tools:
            fq0 = os.path.join(skewer_path, "out_file-trimmed.fastq.gz")
            fq1 = os.path.join(skewer_path, "out_file-trimmed-pair1.fastq.gz")
            fq2 = os.path.join(skewer_path, "out_file-trimmed-pair2.fastq.gz")
        else:
            qc_table_path = "None"

        # (0) Readfilter
        cmd_star_filter = "{0} --genomeDir {1} --outFileNamePrefix {2}_ --readFilesCommand zcat --readFilesIn {3} {4} --outFilterMultimapNmax 100 --outSAMmultNmax 1 --chimSegmentMin 10 --chimJunctionOverhangMin 10 --alignSJDBoverhangMin 10 --alignMatesGapMax {5} --alignIntronMax {5} --chimSegmentReadGapMax 3 --alignSJstitchMismatchNmax 5 -1 5 5 --seedSearchStartLmax 20 --winAnchorMultimapNmax 50 --outSAMtype BAM Unsorted --chimOutType Junctions WithinBAM --outSAMunmapped Within KeepPairs --runThreadN waiting_for_cpu_number".format(
            cmds["star"], star_index_path, os.path.join(filtered_reads_path, sample_id), fq1, fq2, cfg.max_dist_proper_pair)
        cmd_read_filter = "{0} --input {1}_Aligned.out.bam --output {1}_Aligned.out.filtered.bam".format(
            os.path.join(module_dir, "fusionreadfilter.py"), os.path.join(filtered_reads_path, sample_id))

        # re-define fastq's if filtering is on (default)
        fq0 = ""
        if "Readfilter" in tools:
            fq0 = os.path.join(filtered_reads_path,
                               os.path.basename(fq1).replace("R1", "R0").replace(".fastq.gz", "_filtered_singles.fastq.gz"))
            fq1 = os.path.join(filtered_reads_path, os.path.basename(fq1).replace(".fastq.gz", "_filtered.fastq.gz"))
            fq2 = os.path.join(filtered_reads_path, os.path.basename(fq2).replace(".fastq.gz", "_filtered.fastq.gz"))
        cmd_bam_to_fastq = "{0} fastq -0 {1} -1 {2} -2 {3} --threads waiting_for_cpu_number {4}_Aligned.out.filtered.bam".format(
            cmds["samtools"], fq0, fq1, fq2, os.path.join(filtered_reads_path, sample_id))

        # (1) Kallisto expression quantification (required for pizzly)
        # cmd_kallisto = "{0} quant --threads waiting_for_cpu_number --genomebam --gtf {1} --chromosomes {2} --index {3} --fusion --output-dir waiting_for_output_string {4} {5}".format(cmds["kallisto"], genes_gtf_path, genome_sizes_path, kallisto_index_path, fq1, fq2)

        # (2) Star expression quantification (required for starfusion and starchip)
        cmd_star = "{0} --genomeDir {1} --outFileNamePrefix waiting_for_output_string --runThreadN waiting_for_cpu_number --runMode alignReads --readFilesIn {2} {3} --readFilesCommand zcat --chimSegmentMin 10 --chimJunctionOverhangMin 10 --alignSJDBoverhangMin 10 --alignMatesGapMax {4} --alignIntronMax {4} --chimSegmentReadGapMax 3 --alignSJstitchMismatchNmax 5 -1 5 5 --seedSearchStartLmax 20 --winAnchorMultimapNmax 50 --outSAMtype BAM SortedByCoordinate --chimOutType Junctions SeparateSAMold --chimOutJunctionFormat 1".format(
            cmds["star"], star_index_path, fq1, fq2, cfg.max_dist_proper_pair)

        # (3) Mapsplice
        # urla: the "keep" parameter requires gunzip >= 1.6
        cmd_extr_fastq1 = "gunzip --keep {0}".format(fq1)
        cmd_extr_fastq2 = "gunzip --keep {0}".format(fq2)
        # Added python interpreter to circumvent external hardcoded shell script
        cmd_mapsplice = "{0} --chromosome-dir {1} -x {2} -1 {3} -2 {4} --threads waiting_for_cpu_number --output {5} --qual-scale phred33 --bam --seglen 20 --min-map-len 40 --gene-gtf {6} --fusion".format(
            cmds["mapsplice"], genome_chrs_path, bowtie_index_path, fq1[:-3], fq2[:-3], mapsplice_path, genes_gtf_path)

        # (4) Fusioncatcher
        cmd_fusioncatcher = "{0} --input {1} --data {2} --output {3} -p waiting_for_cpu_number".format(
            cmds["fusioncatcher"], ",".join([fq1, fq2]), fusioncatcher_index_path, fusioncatcher_path)

        # star-fusion and star-chip can be run upon a previous star run (this MUST NOT be the star_filter run, but the star_expression run)
        # (5) Starfusion
        cmd_starfusion = "{0} --chimeric_junction {1} --genome_lib_dir {2} --CPU waiting_for_cpu_number --output_dir {3}".format(
            cmds["starfusion"], "{}_Chimeric.out.junction".format(os.path.join(star_path, sample_id)),
            starfusion_index_path, starfusion_path)

        # (7) Starchip
        # cmd_starchip = "{0} {1} {2} {3}".format(cmds["starchip"], os.path.join(starchip_path, "starchip"), "{}_Chimeric.out.junction".format(os.path.join(star_path, sample_id)), starchip_param_path)

        # (6) Pizzly
        # cmd_pizzly = "{0} -k 29 --gtf {1} --cache {2} --fasta {3} --output {4} {5}".format(cmds["pizzly"], genes_gtf_path, pizzly_cache_path, genes_fasta_path, os.path.join(pizzly_path, "kallizzy"), os.path.join(kallisto_path, "fusion.txt"))
        # cmd_pizzly2 = "{0} {1} {2}".format(cmds["pizzly_cmd2"], "{}.json".format(os.path.join(pizzly_path, "kallizzy")), "{}.json.txt".format(os.path.join(pizzly_path, "kallizzy")))

        # (8) Infusion
        cmd_infusion = "{0} -1 {1} -2 {2} --skip-finished --min-unique-alignment-rate 0 --min-unique-split-reads 0 --allow-non-coding --out-dir {3} {4}".format(
            cmds["infusion"], fq1, fq2, infusion_path, infusion_cfg_path)

        # (x) Soapfuse
        cmd_soapfuse = "{0} -q {1} -i {2} -o {3}".format(
            os.path.join(module_dir, "tool_wrapper", "soapfuse_wrapper.py"),
            qc_table_path, " ".join([fq1, fq2]), soapfuse_path)

        # (9) Data collection
        cmd_fetchdata = "{0} -i {1} -o {2} -s {3} --fq1 {4} --fq2 {5} --fusion_support {6}".format(
            os.path.join(module_dir, "fetchdata.py"), output_results_path, fetchdata_path,
            sample_id, fq1, fq2, tool_num_cutoff)

        # (10) De novo assembly of fusion transcripts
        # urla: This is currently still under active development and has not been tested thoroughly
        # cmd_denovoassembly = "{0} -i waiting_for_gene_list_input -b {1}_Aligned.out.bam -g {2} -t {3} -o waiting_for_assembly_out_dir".format(os.path.join(module_dir, "denovoassembly.py"), os.path.join(filtered_reads_path, sample_id), ref_genome, ref_trans)

        # (X) Sample monitoring
        cmd_samples = "{0} --db_path={1} --sample_id={2} --action=append_state --tool=".format(
            os.path.join(module_dir, "misc", "samples.py"), self.samples.db_path, sample_id)

        # set final lists of executable tools and paths
        exe_tools = [
            "QC",             #0
            "Readfilter",     #1
            # "Kallisto",     #2
            "Star",           #3
            "Mapsplice",      #4
            "Fusioncatcher",  #5
            "Starfusion",     #6
            # "Pizzly",       #7
            # "Starchip",     #8
            "Infusion",       #9
            "Soapfuse",       #10
            "Fetchdata"       #11
            # "Assembly"      #12
        ]
        exe_cmds = [
            " && ".join([cmd_fastqc, cmd_qc_parser, cmd_skewer]),               #0
            " && ".join([cmd_star_filter, cmd_read_filter, cmd_bam_to_fastq]),  #1
            # cmd_kallisto,                                                     #2
            cmd_star,                                                           #3
            " && ".join([cmd_extr_fastq1, cmd_extr_fastq2, cmd_mapsplice]),     #4
            cmd_fusioncatcher,                                                  #5
            cmd_starfusion,                                                     #6
            # " && ".join([cmd_pizzly, cmd_pizzly2]),                           #7
            # cmd_starchip,                                                     #8
            cmd_infusion,                                                       #9
            cmd_soapfuse,                                                       #10
            cmd_fetchdata                                                       #11
            # cmd_denovoassembly                                                #12
        ]
        exe_path = [
            qc_path,              #0
            filtered_reads_path,  #1
            # kallisto_path,      #2
            star_path,            #3
            mapsplice_path,       #4
            fusioncatcher_path,   #5
            starfusion_path,      #6
            # pizzly_path,        #7
            # starchip_path,      #8
            infusion_path,        #9
            soapfuse_path,        #10
            fetchdata_path        #11
            # ""                  #12
        ]

        # create and submit slurm job if the tool is requested and hasn't been run before
        for i, tool in enumerate(exe_tools, 0):
            if tool in tools:
                dependency = []
                # check dependencies of the pipeline.
                # Besides tool dependencies (Pizzly -> Kallisto, Starfusion/Starchip -> Star), read filtering is mandatory.
                # Processing will be skipped if a certain dependency was not found (either pre-processed data or the config's tool string are checked)
                if tool in state_tools:
                    # urla: the primary idea behind this flag is to allow multiple fetchdata executions during processing.
                    #       Nevertheless, re-processing of the same data with a newer version of a tool will also be straightforward (but overwriting previous results, of course)
                    # if self.overwrite:
                    #     self.logger.info("Executing {0} although it looks like a previous run finished successfully. Results in {1} may be overwritten".format(tool, exe_path[i]))
                    # else:
                    self.logger.info("Skipping {0} as it looks like a previous run finished successfully. Results should be in {1}".format(tool, exe_path[i]))
                    continue
                else:
                    if tool == "Readfilter" and "Readfilter" not in tools:
                        self.logger.error(
                            """Error 99: Sample {} will be skipped due to missing read filtering.\n
                            Read filtering is currently a mandatory step for the processing.\n
                            Because you haven't run it before for this sample, you have to include \"Readfilter\" in the tool selection in your config.\n
                            """.format(sample_id))
                        print("Error 99: Sample {} will be skipped due to missing read filtering.".format(sample_id))
                        return 0
                    elif tool == "Pizzly" and "Kallisto" not in tools:
                        self.logger.error(
                            """Error 99: Running {0} for Sample {1} will be skipped due to a missing dependency.\n
                            Pizzly builds on Kallisto and it is therefore mandatory to run this first.\n
                            Because you haven't run it before for this sample, you have to include \"Kallisto\" in the tool selection in your config.\n
                            """.format(tool, sample_id))
                        print("Error 99: Running {0} for Sample {1} will be skipped due to a missing dependency.".format(tool, sample_id))
                        continue
                    elif (tool == "Starfusion" or tool == "Starchip") and "Star" not in tools:
                        self.logger.error(
                            """Error 99: Running {0} for Sample {1} will be skipped due to a missing dependency.\n
                            {0} builds on Star and it is therefore mandatory to run this first.\n
                            Because you haven't run it before for this sample, you have to include \"Star\" in the tool selection in your config.\n
                            """.format(tool, sample_id))
                        print("Error 99: Running {0} for Sample {1} will be skipped due to a missing dependency.".format(tool, sample_id))
                        continue
                # prepare slurm jobs: get resources, create uid, set output path and check dependencies
                self.logger.debug("Submitting {} run to slurm".format(tool))
                cpu = cfg.resources[tool.lower()]["cpu"]
                mem = cfg.resources[tool.lower()]["mem"]
                uid = "-".join([cfg.pipeline_name, tool, sample_id])
                if tool == "Star":
                    exe_cmds[i] = exe_cmds[i].replace("waiting_for_output_string", "{}_".format(os.path.join(exe_path[i], sample_id))).replace("waiting_for_cpu_number", str(cpu))
                else:
                    exe_cmds[i] = exe_cmds[i].replace("waiting_for_output_string", exe_path[i]).replace("waiting_for_cpu_number", str(cpu))
                cmd = " && ".join([exe_cmds[i], cmd_samples + tool])
                # Managing slurm dependencies
                que_sys = cfg.queueing_system
                if tool == "Pizzly":
                    dependency = Queueing.get_jobs_by_name("Kallisto-{0}".format(sample_id), que_sys)
                elif tool == "Starfusion" or tool == "Starchip":
                    dependency = Queueing.get_jobs_by_name("Star-{0}".format(sample_id), que_sys)
                elif tool == "Fetchdata":
                    dependency = Queueing.get_jobs_by_name(sample_id, que_sys)
                elif tool == "Assembly":
                    dependency = Queueing.get_jobs_by_name("Fetchdata-{0}".format(sample_id), que_sys)
                elif tool == "ReadFilter":
                    dependency = Queueing.get_jobs_by_name("QC-{0}".format(sample_id), que_sys)
                dependency.extend(Queueing.get_jobs_by_name("Readfilter-{0}".format(sample_id), que_sys))
                dependency.extend(Queueing.get_jobs_by_name("QC-{0}".format(sample_id), que_sys))
                self.logger.debug("Submitting slurm job: CMD - {0}; PATH - {1}; DEPS - {2}".format(cmd, exe_path[i], dependency))
                self.submit_job(uid, cmd, cpu, mem, exe_path[i], dependency, "")
            else:
                self.logger.info("Skipping {0} as it is not selected for execution (Selected are: {1})".format(tool, tools))

    def submit_job(self, uid, cmd, cores, mem_usage, output_results_folder, dependencies, mail):
        """Submit job to slurm scheduling"""
        que_sys = cfg.queueing_system
        already_running = Queueing.get_jobs_by_name(uid, que_sys)
        if not already_running:
            # urla: for compatibility reasons (and to be independent of shell commands), concatenated commands are split again,
            #       dependencies within the split groups updated and everything submitted sequentially to the queueing system
            module_file = os.path.join(cfg.module_dir, "build_env.sh")
            for i, cmd_split in enumerate(cmd.split(" && ")):
                if que_sys not in ["slurm", "pbs"]:
                    cmd_split = cmd_split.split(" ")
                dependencies.extend(Queueing.get_jobs_by_name("{0}_CMD{1}".format(uid, i - 1), que_sys))
                Queueing.submit("{0}_CMD{1}".format(uid, i), cmd_split, cores, mem_usage, output_results_folder,
                                dependencies, cfg.partition, cfg.user, cfg.time_limit, mail, module_file, que_sys)
                time.sleep(0.5)
        else:
            self.logger.error("A job with this application/sample combination is currently running. Skipping {} in order to avoid unintended data loss.".format(uid))
class Fetching(object):
    """Run, monitor and schedule fastq processing for fusion gene prediction"""

    def __init__(self, scratch_path, fetchdata_path, sample_id):
        """Parameter initiation and work folder creation."""
        self.scratch_path = scratch_path
        self.fetchdata_path = fetchdata_path
        self.sample_id = sample_id
        #self.tools = Samples(os.path.join(scratch_path, os.path.pardir, os.path.pardir, "samples.csv")).get_tool_list_from_state(self.sample_id)
        self.samples = SamplesDB(os.path.join(scratch_path, os.path.pardir, "samples.db"))
        self.logger = Logger(os.path.join(self.fetchdata_path, "fetchdata.log"))

    def get_pseudo_genome_adjustments_for_star(self, num_len_file):  # wrong pylint error due to long name => pylint: disable=C0103
        """Return the genome size of an associated fasta file calculated by urla_GetFusionSequence_latest.R"""
        seq_num = 0
        genome_size = 0
        with open(num_len_file) as lfile:
            seq_num = int(next(lfile))
            genome_size = int(next(lfile))
        star_genome_chr_bin_n_bits = min(18, int(math.log(genome_size / seq_num, 2)))
        star_genome_sa_index_n_bases = min(14, int(math.log(genome_size, 2) / 2 - 1)) - 2
        self.logger.debug(
            "Custom genome sequence number: {0} => {1} will be used as bin size parameter for genome storage"
            .format(seq_num, star_genome_chr_bin_n_bits))
        self.logger.debug(
            "Custom Genome Size: {0} bp => {1} will be used as length parameter for SA pre-indexing"
            .format(genome_size, star_genome_sa_index_n_bases))
        return (str(star_genome_chr_bin_n_bits), str(star_genome_sa_index_n_bases))

    @staticmethod
    def get_input_read_count_from_star(star_out_bam):
        """Parse a star output log file to get input read counts from the fastq origin"""
        log_file = "{}Log.final.out".format(star_out_bam.rstrip("Aligned.out.bam"))
        if not os.path.exists(log_file):
            return -1
        with open(log_file, "r") as star_log:
            for line in star_log:
                if line.split("|")[0].strip() == "Number of input reads":
                    return int(line.split("|")[1].strip())
        return -1

    @staticmethod
    def get_input_read_count_from_fastq(fastq):
        """Parse the input FASTQ to get the read count"""
        ps = subprocess.Popen(("zcat", fastq), stdout=subprocess.PIPE)
        result = subprocess.check_output(("wc", "-l"), stdin=ps.stdout)
        return int(result) // 4

    def run(self, fusion_support, fq1, fq2):
        """Identification of fastq files and initiation of processing"""
        # print sample id
        # execute processing pipe
        # sampleID = ...
        self.logger.info("Fetching in sample {}".format(self.sample_id))
        if not fq1 or not fq2:
            self.logger.debug("Either ReadFile 1 or 2 or both are missing, trying to get original files from samples.csv")
            self.logger.debug(self.sample_id)
            self.logger.debug(self.samples.db_path)
            (fq1, fq2) = self.samples.get_fastq_files(self.sample_id)
        self.execute_pipeline(fq1, fq2, fusion_support)

    # urla: there are a lot of local variables declared in the following method.
    #       Although this could be reduced quite strongly, readability would be strongly reduced as well.
    # pylint:disable=R0914
    def execute_pipeline(self, fq1, fq2, fusion_support):
        """Create sample specific subfolder structure and run tools on fastq files"""
        # Genome/Gene references to use
        ref_trans = cfg.ref_trans_version
        ref_genome = cfg.ref_genome_build
        genome_fasta_path = cfg.references["genome_fasta"]
        genes_adb_path = cfg.references["genes_adb"]
        genes_tsl_path = cfg.references["genes_tsl"]

        fetchdata_current_path = os.path.join(self.fetchdata_path, "fd_{}_tool".format(fusion_support))
        detected_fusions_path = os.path.join(fetchdata_current_path, "fetched_fusions")
        detected_fusions_file = os.path.join(detected_fusions_path, "Detected_Fusions.csv")
        context_seq_path = os.path.join(fetchdata_current_path, "fetched_contextseqs")
        context_seq_file = os.path.join(context_seq_path, "Context_Seqs.csv")
        filtered_reads_path = os.path.join(self.scratch_path, "filtered_reads")
        star_genome_path = os.path.join(context_seq_path, "STAR_idx")
        star_align_path = os.path.join(context_seq_path, "STAR_align")
        star_align_file = os.path.join(star_align_path, "{}_".format(self.sample_id))
        classification_path = os.path.join(fetchdata_current_path, "classification")
        classification_file = os.path.join(classification_path, "classification")

        for folder in [
                fetchdata_current_path, detected_fusions_path, context_seq_path,
                star_genome_path, star_align_path, classification_path
        ]:
            IOMethods.create_folder(folder, self.logger)

        # processing steps to perform
        tools = cfg.fd_tools
        fusion_tools = cfg.tools
        module_dir = cfg.module_dir
        cmds = cfg.commands
        # In case of a liftover, some references and paths must be changed accordingly
        cmd_contextseq_org = ""
        if "Liftover" in tools:
            tools.insert(2, "ContextSeqBak")
            # for read grepping, we need the original reference on which the first mapping was performed
            cmd_contextseq_org = "python {0} --detected_fusions {1}.bak --annotation_db {2} --out_csv {3}.bak --genome_fasta {4} --tsl_info {5} --cis_near_dist {6} --context_seq_len {7} --tsl_filter_level {8}".format(
                os.path.join(module_dir, "fusionannotation.py"), detected_fusions_file, genes_adb_path,
                context_seq_file, genome_fasta_path, genes_tsl_path, cfg.cis_near_distance,
                cfg.context_seq_len, cfg.tsl_filter)
            # now, references need to be updated according to the target liftover
            crossmap_chain = cfg.liftover["crossmap_chain"]
            ref_genome_dest = os.path.basename(crossmap_chain).replace(".", "_").split("_")[2].lower()
            self.logger.debug(
                "Creating a copy of the detected fusions file due to selection of liftover. Old ({0}) data will be kept in \"{1}.bak\""
                .format(ref_genome, detected_fusions_file))
            genome_fasta_path = cfg.references["genome_fasta_hg37"]
            genes_adb_path = cfg.references["genes_adb_hg37"]

        # urla - note: tmp hack to get original star input reads for normalization
        with open(os.path.join(classification_path, "Star_org_input_reads.txt"), "w") as infile:
            read_count = self.get_input_read_count_from_star(
                os.path.join(filtered_reads_path, "{}_Aligned.out.bam".format(self.sample_id)))
            if read_count == -1:
                read_count = self.get_input_read_count_from_fastq(fq1)
            infile.write(str(read_count))

        # Define cmd strings for each program
        cmd_fusiondata = "{0} -i {1} -o {2} -s {3} -t {4} -f {5} -l {6}".format(
            os.path.join(module_dir, "fusiontoolparser.py"), self.scratch_path, detected_fusions_path,
            self.sample_id, fusion_support, ",".join(cfg.fusiontools), self.logger.get_path())
        cmd_liftover = "{0} -i {1} -l {2}".format(
            os.path.join(module_dir, "misc", "liftover.py"), detected_fusions_file, self.logger.get_path())
        cmd_contextseq = "{0} --detected_fusions {1} --annotation_db {2} --out_csv {3} --genome_fasta {4} --tsl_info {5} --cis_near_dist {6} --context_seq_len {7} --tsl_filter_level {8}".format(
            os.path.join(module_dir, "fusionannotation.py"), detected_fusions_file, genes_adb_path,
            context_seq_file, genome_fasta_path, genes_tsl_path, cfg.cis_near_distance,
            cfg.context_seq_len, cfg.tsl_filter)
        cpu = cfg.resources["fetchdata"]["cpu"]
        # mem = cfg.resources["fetchdata"]["mem"]
        cmd_starindex = "{0} --runMode genomeGenerate --runThreadN {1} --limitGenomeGenerateRAM 48000000000 --genomeChrBinNbits waiting_for_bin_size_input --genomeSAindexNbases waiting_for_sa_idx_input --genomeDir {2} --genomeFastaFiles {3}".format(
            cmds["star"], cpu, star_genome_path, "{0}{1}".format(context_seq_file, ".fasta"))
        cmd_staralign_fltr = "{0} --genomeDir {1} --readFilesCommand zcat --readFilesIn {2} {3} --outSAMtype BAM SortedByCoordinate --outFilterMultimapNmax -1 --outSAMattributes Standard --outSAMunmapped None --outFilterMismatchNoverLmax 0.02 --runThreadN {4} --outFileNamePrefix {5}fltr_ --limitBAMsortRAM 48000000000".format(
            cmds["star"], star_genome_path, fq1, fq2, cpu, star_align_file)
        cmd_bamindex_fltr = "{0} index {1}fltr_Aligned.sortedByCoord.out.bam".format(cmds["samtools"], star_align_file)
        cmd_requantify_fltr = "{0} -i {1}fltr_Aligned.sortedByCoord.out.bam -o {2}_fltr.tdt -d 10".format(
            os.path.join(module_dir, "requantify.py"), star_align_file, classification_file)
        (fq1, fq2) = self.samples.get_fastq_files(self.sample_id)
        cmd_staralign_org = "{0} --genomeDir {1} --readFilesCommand zcat --readFilesIn {2} {3} --outSAMtype BAM SortedByCoordinate --outFilterMultimapNmax -1 --outSAMattributes Standard --outSAMunmapped None --outFilterMismatchNoverLmax 0.02 --runThreadN {4} --outFileNamePrefix {5}org_ --limitBAMsortRAM 48000000000".format(
            cmds["star"], star_genome_path, fq1, fq2, cpu, star_align_file)
        cmd_bamindex_org = "{0} index {1}org_Aligned.sortedByCoord.out.bam".format(cmds["samtools"], star_align_file)
        cmd_requantify_org = "{0} -i {1}org_Aligned.sortedByCoord.out.bam -o {2}_org.tdt -d 10".format(
            os.path.join(module_dir, "requantify.py"), star_align_file, classification_file)
        # for testing, based on debug. Should be removed if merged to original
        cmd_read_filter2 = "{0} --input {1}_Aligned.out.bam --input2 {2}.debug --output {1}_Aligned.out.filtered2.bam".format(
            os.path.join(module_dir, "getRequantReads.py"), os.path.join(filtered_reads_path, self.sample_id), context_seq_file)

        # re-define fastq's if filtering is on (default)
        fq0 = ""
        if "Readfilter" in fusion_tools:
            fq0 = os.path.join(filtered_reads_path,
                               os.path.basename(fq1).replace("R1", "R0").replace(".fastq.gz", "_filtered2_singles.fastq.gz"))
            fq1 = os.path.join(filtered_reads_path, os.path.basename(fq1).replace(".fastq.gz", "_filtered2.fastq.gz"))
            fq2 = os.path.join(filtered_reads_path, os.path.basename(fq2).replace(".fastq.gz", "_filtered2.fastq.gz"))
        cmd_bam_to_fastq = "{0} fastq -0 {1} -1 {2} -2 {3} --threads {5} {4}_Aligned.out.filtered2.bam".format(
            cmds["samtools"], fq0, fq1, fq2, os.path.join(filtered_reads_path, self.sample_id), cpu)
        # allow soft-clipping? Specificity? --alignEndsType EndToEnd
        cmd_staralign_best = "{0} --genomeDir {1} --readFilesCommand zcat --readFilesIn {2} {3} --outSAMtype BAM SortedByCoordinate --outFilterMultimapNmax -1 --outSAMattributes Standard --outSAMunmapped None --outFilterMismatchNoverLmax 0.02 --runThreadN {4} --outFileNamePrefix {5}best_ --limitBAMsortRAM 48000000000".format(
            cmds["star"], star_genome_path, fq1, fq2, cpu, star_align_file)
        cmd_bamindex_best = "{0} index {1}best_Aligned.sortedByCoord.out.bam".format(cmds["samtools"], star_align_file)
        cmd_requantify_best = "{0} -i {1}best_Aligned.sortedByCoord.out.bam -o {2}_best.tdt -d 10".format(
            os.path.join(module_dir, "requantify.py"), star_align_file, classification_file)

        # set final lists of executable tools and paths
        exe_tools = [
            "Fusiongrep",      #1
            "Liftover",        #2
            "ContextSeqBak",
            "Contextseq",      #3
            "Starindex",       #4
            "StaralignFltr",   #5
            "BamindexFltr",    #6
            "RequantifyFltr",  #7
            "StaralignOrg",    #8
            "BamindexOrg",     #9
            "RequantifyOrg",   #10
            "ReadFilter2",     #11
            "ReadFilter2b",    #12
            "StaralignBest",   #13
            "BamindexBest",    #14
            "RequantifyBest"   #15
        ]
        exe_cmds = [
            cmd_fusiondata,       #1
            cmd_liftover,         #2
            cmd_contextseq_org,
            cmd_contextseq,       #3
            cmd_starindex,        #4
            cmd_staralign_fltr,   #5
            cmd_bamindex_fltr,    #6
            cmd_requantify_fltr,  #7
            cmd_staralign_org,    #8
            cmd_bamindex_org,     #9
            cmd_requantify_org,   #10
            cmd_read_filter2,     #11
            cmd_bam_to_fastq,     #12
            cmd_staralign_best,   #13
            cmd_bamindex_best,    #14
            cmd_requantify_best   #15
        ]
        exe_dependencies = [
            "",                                                              #1
            detected_fusions_file,                                           #2
            detected_fusions_file,
            detected_fusions_file,                                           #3
            "{0}{1}".format(context_seq_file, ".fasta.info"),                #4
            star_genome_path,                                                #5
            "{}fltr_Aligned.sortedByCoord.out.bam".format(star_align_file),  #6
            "",                                                              #7
            star_genome_path,                                                #8
            "{}org_Aligned.sortedByCoord.out.bam".format(star_align_file),   #9
            "",                                                              #10
            "",                                                              #11
            "",                                                              #12
            star_genome_path,                                                #13
            "{}best_Aligned.sortedByCoord.out.bam".format(star_align_file),  #14
            ""                                                               #15
        ]

        # create and submit slurm job if the tool is requested and hasn't been run before
        module_file = os.path.join(module_dir, "build_env.sh")
        for i, tool in enumerate(exe_tools, 0):
            if tool in tools:
                if not exe_dependencies[i] or os.path.exists(exe_dependencies[i]):
                    self.logger.info("Starting {}".format(tool))
                    if tool == "Starindex":
                        # the genome size required for the genomeSAindexNbases parameter is not known before now
                        (star_bin, star_sa) = self.get_pseudo_genome_adjustments_for_star(
                            "{0}{1}".format(context_seq_file, ".fasta.info"))
                        exe_cmds[i] = exe_cmds[i].replace("waiting_for_bin_size_input", star_bin)
                        exe_cmds[i] = exe_cmds[i].replace("waiting_for_sa_idx_input", star_sa)
                    self.logger.debug("Executing: {}".format(exe_cmds[i]))
                    Queueing.submit("", exe_cmds[i].split(" "), "", "", "", "", "", "", "", "", module_file, "none")
                else:
                    self.logger.error("Could not run {0} due to the missing dependency {1}".format(tool, exe_dependencies[i]))
                    sys.exit(1)
            else:
                self.logger.debug("Skipping {0} as it is not selected for execution (Selected are: {1})".format(tool, tools))
'%d_%m_%Y_%H_%M_%S')

parser = argparse.ArgumentParser(description='mine')
parser.add_argument('--config', type=str, default='./configs/mine.yml',
                    help='Path to config file')
opts = parser.parse_args()
params = get_config(opts.config)
print(params)

model = Mine(params)
if params['use_cuda']:
    model = model.cuda()

if params['training'] == True and params['visualize'] == False:
    exp_logs = params['logs'] + params['exp_name'] + '_' + timestamp + '/'
    exp_results = params['results'] + params['exp_name'] + '_' + timestamp + '/'
    mkdir_p(exp_logs)
    mkdir_p(exp_results)
    config_logfile = exp_logs + 'config.json'
    with open(config_logfile, 'w+') as cf:
        json.dump(params, cf)
    optimizer = optim.Adam(model.parameters(), lr=params['lr'])
    logger = Logger(exp_logs)
    train(params)
model = DeepCross(opt=opt)
model = model.cuda()
if opt.loader:
    print("load checkpoint file .")
    model.load_state_dict(
        torch.load(os.path.join('models', 'model-1.ckpt')))

current_lr = 1e-3
optimizer = optim.Adam(model.parameters(), lr=current_lr)
# criterion = nn.BCEWithLogitsLoss()
criterion = FocalLoss()
# criterion = nn.BCELoss()
logger = Logger('./logs/')

for epoch in range(2, opt.num_epoches):
    # schedule learning rate (note: current_lr is updated cumulatively, so the decay compounds each epoch)
    frac = epoch // 2
    decay_factor = 0.9 ** frac
    current_lr = current_lr * decay_factor
    utils.set_lr(optimizer, current_lr)
    # training
    model.train()
    start = time.time()
    for i, data in enumerate(train_loader):
        # prepare data and corresponding label (which is 'click')
        user_id = data['user_id'].cuda()
def main():
    mylog = Logger(os.path.join(os.path.abspath(os.path.curdir), 'misc/spider_log.yaml'))
    logger = mylog.outputLog()
    items = spiderman()
    for item in items:
        send_mysql(item, logger)
def train(model, vocab, cfg):
    seqtree_coco = SeqtreeCOCO()
    loader = DataLoader(seqtree_coco, batch_size=16, shuffle=True, num_workers=4)
    logdir = os.path.join(cfg.checkpoint_path, cfg.id)
    if not os.path.isdir(logdir):
        os.mkdir(logdir)
    logger = Logger(logdir)
    with open(os.path.join(logdir, 'config.txt'), 'w') as f:
        f.write(str(cfg))
    with open('data/idx2caps.json', 'r') as f:
        cocoid2caps = json.load(f)
    cocoid2caps = {int(k): v for k, v in cocoid2caps.items()}
    init_scorer('coco-train-idxs')

    infos = {}
    # if cfg.start_from is not None:
    #     with open(os.path.join(cfg.start_from, 'infos_' + cfg.start_from + '_best.pkl'), 'rb') as f:
    #         infos = pickle.load(f)
    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    val_result_history = infos.get('val_result_history', {})
    loss_history = infos.get('loss_history', {})
    lr_history = infos.get('lr_history', {})
    best_val_score = 0
    update_lr_flag = True

    if cfg.caption_model == 'att_model' or cfg.caption_model == 'tree_model' \
            or cfg.caption_model == 'tree_model_1' or cfg.caption_model == 'tree_model_md' \
            or cfg.caption_model == 'tree_model_2' or cfg.caption_model == 'tree_model_md_att' \
            or cfg.caption_model == 'tree_model_md_sob' or cfg.caption_model == 'tree_model_md_in' \
            or cfg.caption_model == 'drnn':
        # crit = nn.CrossEntropyLoss()
        crit = LanguageModelCriterion()
        rl_crit = RewardCriterion()
    else:
        raise Exception("Caption model not supported: {}".format(cfg.caption_model))

    optimizer = torch.optim.Adam(model.parameters(), lr=cfg.learning_rate)
    num_period_best = 0
    current_score = 0
    start = time.time()
    print("start training...")

    while True:
        if update_lr_flag:
            if epoch > cfg.learning_rate_decay_start >= 0:
                frac = (epoch - cfg.learning_rate_decay_start) // cfg.learning_rate_decay_every
                decay_factor = cfg.learning_rate_decay_rate ** frac
                cfg.current_lr = cfg.learning_rate * decay_factor
                utils.set_lr(optimizer, cfg.current_lr)
            else:
                cfg.current_lr = cfg.learning_rate
        optimizer.zero_grad()
        for data in loader:
            if cfg.use_cuda:
                torch.cuda.synchronize()
            if cfg.caption_model == 'tree_model_md_att':
                temp = [data['word_idx'], data['father_idx'], data['masks'], data['fc_feats'], data['att_feats']]
                temp = [_.cuda() for _ in temp]
                word_idx, father_idx, masks, fc_feats, att_feats = temp
            elif cfg.caption_model == 'tree_model_md' or cfg.caption_model == 'tree_model_md_sob' \
                    or cfg.caption_model == 'tree_model_md_in' or cfg.caption_model == 'drnn':
                temp = [data['word_idx'], data['father_idx'], data['masks'], data['fc_feats']]
                temp = [_.cuda() for _ in temp]
                word_idx, father_idx, masks, fc_feats = temp
                # words = [[vocab.idx2word[word_idx[batch_index][i].item()] for i in range(40)]
                #          for batch_index in range(2)]
            else:
                raise Exception("Caption model not supported: {}".format(cfg.caption_model))

            optimizer.zero_grad()
            # if cfg.caption_model == 'tree_model_md_att':
            #     logprobs = model(word_idx, father_idx, fc_feats, att_feats)
            #     loss = crit(logprobs, word_idx, masks)
            if cfg.caption_model == 'tree_model_md' or cfg.caption_model == 'tree_model_md_sob' \
                    or cfg.caption_model == 'tree_model_md_in' or cfg.caption_model == 'drnn' \
                    or cfg.caption_model == 'tree_model_md_att':
                word_idx, father_idx, mask, seqLogprobs = model._sample(fc_feats, att_feats, max_seq_length=40)
                gen_result = utils.decode_sequence(vocab, word_idx, father_idx, mask)
                ratio = utils.seq2ratio(word_idx, father_idx, mask)
                reward = get_self_critical_reward(model, fc_feats, att_feats, data, gen_result,
                                                  vocab, cocoid2caps, word_idx.size(1), cfg)
                loss = rl_crit(seqLogprobs, mask, torch.from_numpy(reward).float().cuda(), ratio)
            else:
                raise Exception("Caption model not supported: {}".format(cfg.caption_model))

            loss.backward()
            utils.clip_gradient(optimizer, cfg.grad_clip)
            optimizer.step()
            train_loss = loss.item()
            if cfg.use_cuda:
                torch.cuda.synchronize()

            if iteration % cfg.losses_log_every == 0:
                end = time.time()
                logger.scalar_summary('train_loss', train_loss, iteration)
                logger.scalar_summary('learning_rate', cfg.learning_rate, iteration)
                loss_history[iteration] = train_loss
                lr_history[iteration] = cfg.current_lr
                print("iter {} (epoch {}), learning_rate: {:.6f}, train_loss: {:.6f}, current_cider: {:.3f}, best_cider: {:.3f}, time/log = {:.3f}"
                      .format(iteration, epoch, cfg.current_lr, train_loss, current_score, best_val_score, end - start))
                start = time.time()

            if (iteration + 1) % cfg.save_checkpoint_every == 0:
                eval_kwargs = {'eval_split': 'val', 'eval_time': False}
                eval_kwargs.update(vars(cfg))
                # lang_stats = eval_utils.eval_split(model, vocab, eval_kwargs)
                lang_stats = eval_seqtree.eval_split(model, vocab, eval_kwargs)
                if cfg.use_cuda:
                    model = model.cuda()
                for k, v in lang_stats.items():
                    logger.scalar_summary(k, v, iteration)
                val_result_history[iteration] = {'lang_stats': lang_stats}
                current_score = lang_stats['CIDEr']
                best_flag = False
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                    num_period_best = 1
                else:
                    num_period_best += 1
                if best_flag:
                    infos['iter'] = iteration
                    infos['epoch'] = epoch
                    infos['val_result_history'] = val_result_history
                    infos['loss_history'] = loss_history
                    infos['lr_history'] = lr_history
                    checkpoint_path = os.path.join(logdir, 'model_' + cfg.id + '_best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    optimizer_path = os.path.join(logdir, 'optimizer_' + cfg.id + '_best.pth')
                    torch.save(optimizer.state_dict(), optimizer_path)
                    print("model saved to {}".format(logdir))
                    with open(os.path.join(logdir, 'infos_' + cfg.id + '_best.pkl'), 'wb') as f:
                        pickle.dump(infos, f)
                if num_period_best >= cfg.num_eval_no_improve:
                    print('no improvement, exit({})'.format(best_val_score))
                    sys.exit()
            iteration += 1
        epoch += 1
        if epoch >= cfg.max_epoches != -1:
            break
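# The stepped decay used above reduces the learning rate by a constant factor every
# learning_rate_decay_every epochs once learning_rate_decay_start is passed,
# i.e. lr = learning_rate * decay_rate ** ((epoch - decay_start) // decay_every).
# The self-contained sketch below reproduces that rule with assumed config values.
base_lr = 5e-4                   # stand-in for cfg.learning_rate
decay_start, decay_every = 0, 3  # stand-ins for cfg.learning_rate_decay_start / _every
decay_rate = 0.8                 # stand-in for cfg.learning_rate_decay_rate

for epoch in range(10):
    if epoch > decay_start >= 0:
        frac = (epoch - decay_start) // decay_every
        current_lr = base_lr * decay_rate ** frac
    else:
        current_lr = base_lr
    print(epoch, round(current_lr, 6))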
class Trainer: """ Trainer class with hyperparams, log, train function etc. """ def __init__(self, opt): lopt = opt.logger topt = opt.trainer mopt = opt.model gopt = opt.model.gen copt = opt.model.crit goopt = opt.optim.gen coopt = opt.optim.crit #CUDA configuration if opt.device == 'cuda' and torch.cuda.is_available(): os.environ['CUDA_VISIBLE_DEVICES'] = opt.deviceId torch.backends.cudnn.benchmark = True else: opt.device = 'cpu' self.device = torch.device(opt.device) #logger self.logger_ = Logger(self, gopt.latentSize, topt.resumeTraining, opt.tick, opt.loops, lopt.logPath, lopt.logStep, lopt.saveImageEvery, lopt.saveModelEvery, lopt.logLevel, self.device) self.logger = self.logger_.logger #Logging configuration parameters if opt.device == 'cuda': num_gpus = len(opt.deviceId.split(',')) self.logger.info("Using {} GPUs.".format(num_gpus)) self.logger.info("Training on {}.\n".format( torch.cuda.get_device_name(0))) #data loader dlopt = opt.dataLoader self.dataLoader = DataLoader(dlopt.dataPath, dlopt.resolution, dlopt.noChannels, dlopt.batchSize, dlopt.numWorkers) self.resolution, self.nCh = self.dataLoader.resolution, self.dataLoader.nCh # training opt assert opt.tick > 0, self.logger.error( f'The number of ticks should be a positive integer, got {opt.tick} instead' ) self.tick = float(opt.tick) assert opt.loops > 0, self.logger.error( f'The number of ticks should be a positive integer, got {opt.loops} instead' ) self.loops = int(opt.loops) self.imShown = 0 self.batchShown = self.imShown // self.dataLoader.batchSize assert topt.lossFunc in ['NSL', 'WD'], self.logger.error( f'The specified loss model is not supported. Please choose between "NSL" or "WD"' ) self.lossFunc = topt.lossFunc self.criterion = NonSaturatingLoss if self.lossFunc == 'NSL' else WassersteinLoss self.applyLossScaling = bool(topt.applyLossScaling) self.paterm = topt.paterm self.lambg = float(topt.lambg) self.gLazyReg = max(topt.gLazyReg, 1) self.styleMixingProb = float(topt.styleMixingProb) self.meanPathLength = 0. self.plDecay = topt.meanPathLengthDecay self.pathRegWeight = topt.pathLengthRWeight assert topt.nCritPerGen > 0, self.logger.error( f'Trainer ERROR: The number of critic training loops per generator loop should be an integer >= 1 (got {topt.nCritPerGen})' ) self.nCritPerGen = int(topt.nCritPerGen) self.lambR2 = float(topt.lambR2) if topt.lambR2 else 0 #lambda R2 self.obj = float(topt.obj) if topt.obj else 1 #objective value (1-GP) self.lambR1 = float(topt.lambR1) if topt.lambR2 else 0 #lambda R1 self.epsilon = float( topt.epsilon) if topt.epsilon else 0 #epsilon (drift loss) self.cLazyReg = max(topt.cLazyReg, 1) self.kUnroll = int(topt.unrollCritic) if topt.unrollCritic else 0 assert self.kUnroll >= 0, self.logger.error( f'Trainer ERROR: The unroll parameter is less than zero ({self.kUnroll})' ) #Common model parameters common = { 'fmapMax': mopt.fmapMax, 'fmapMin': mopt.fmapMin, 'fmapDecay': mopt.fmapDecay, 'fmapBase': mopt.fmapBase, 'activation': mopt.activation, 'upsample': mopt.sampleMode, 'downsample': mopt.sampleMode } #Generator model parameters self.gen = Generator(**common, **gopt).to(self.device) self.latentSize = self.gen.mapping.latentSize self.logger.info( f'Generator constructed. Number of parameters {sum([np.prod([*p.size()]) for p in self.gen.parameters()])}' ) #Critic model parameters self.crit = Critic(**mopt, **copt).to(self.device) self.logger.info( f'Critic constructed. 
Number of parameters {sum([np.prod([*p.size()]) for p in self.crit.parameters()])}' ) #Generator optimizer parameters glr, beta1, beta2, epsilon, lrDecay, lrDecayEvery, lrWDecay = list( goopt.values()) assert lrDecay >= 0 and lrDecay <= 1, self.logger.error( 'Trainer ERROR: The decay constant for the learning rate of the generator must be a constant between [0, 1]' ) assert lrWDecay >= 0 and lrWDecay <= 1, self.logger.error( 'Trainer ERROR: The weight decay constant for the generator must be a constant between [0, 1]' ) self.gOptimizer = Adam(filter(lambda p: p.requires_grad, self.gen.parameters()), lr=glr, betas=(beta1, beta2), weight_decay=lrWDecay, eps=epsilon) if lrDecayEvery and lrDecay: self.glrScheduler = lr_scheduler.StepLR(self.gOptimizer, step_size=lrDecayEvery * self.tick, gamma=lrDecay) else: self.glrScheduler = None self.logger.info(f'Generator optimizer constructed') #Critic optimizer parameters clr, beta1, beta2, epsilon, lrDecay, lrDecayEvery, lrWDecay = list( coopt.values()) assert lrDecay >= 0 and lrDecay <= 1, self.logger.error( 'Trainer ERROR: The decay constant for the learning rate of the critic must be a constant between [0, 1]' ) assert lrWDecay >= 0 and lrWDecay <= 1, self.logger.error( 'Trainer ERROR: The weight decay constant for the critic must be a constant between [0, 1]' ) self.cOptimizer = Adam(filter(lambda p: p.requires_grad, self.crit.parameters()), lr=clr, betas=(beta1, beta2), weight_decay=lrWDecay, eps=epsilon) if lrDecayEvery and lrDecay: self.clrScheduler = lr_scheduler.StepLR(self.cOptimizer, step_size=lrDecayEvery * self.tick, gamma=lrDecay) else: self.clrScheduler = None self.logger.info(f'Critic optimizer constructed') self.preWtsFile = opt.preWtsFile self.resumeTraining = bool(topt.resumeTraining) self.loadPretrainedWts(resumeTraining=self.resumeTraining) self.logger.info( f'The trainer has been instantiated.... Starting step: {self.imShown}. Resolution: {self.resolution}' ) self.logArchitecture(clr, glr) def logArchitecture(self, clr, glr): """ Print the hyperparameters and the architecture and save them in the log directory under the architecture.txt file """ cstFcn = f'Cost function model: {self.lossFunc}\n' hyperParams = ( f'HYPERPARAMETERS - res = {self.resolution}|bs = {self.dataLoader.batchSize}|cLR = {clr}|gLR = {glr}|lambdaR2 = {self.lambR2}|' f'obj = {self.obj}|lambdaR1 = {self.lambR1}|epsilon = {self.epsilon}|{self.loops} loops, showing {self.tick} images per loop' f'|Using pulling away regularization? 
{"Yes" if self.paterm else "No"}' ) architecture = '\n' + str(self.crit) + '\n\n' + str(self.gen) + '\n\n' self.logger.info(cstFcn + hyperParams) f = os.path.join(self.logger_.logPath, self.logger_.archFile) self.logger.debug(architecture) utils.writeFile(f, cstFcn + hyperParams + architecture, 'w') def loadPretrainedWts(self, resumeTraining=False): """ Search for weight file in the experiment directory, and loads it if found """ dir = self.preWtsFile if os.path.isfile(dir): try: stateDict = torch.load( dir, map_location=lambda storage, loc: storage) self.crit.load_state_dict(stateDict['crit']) self.gen.load_state_dict( stateDict['gen'], strict=False ) #Since the cached noise buffers are initialized at None self.logger.debug(f'Loaded pre-trained weights from {dir}') if resumeTraining: self.imShown = stateDict['imShown'] self.loops = stateDict['loops'] self.tick = stateDict['tick'] self.logger_.genLoss = stateDict['genLoss'] self.logger_.criticLoss = stateDict['criticLoss'] self.logger_.criticLossReals = stateDict['criticLossReals'] self.logger_.criticLossFakes = stateDict['criticLossFakes'] self.logger_.logCounter = stateDict['logCounter'] self.logger_.ncAppended = stateDict['ncAppended'] self.logger_.ngAppended = stateDict['ngAppended'] self.logger_.snapCounter = stateDict['snapCounter'] self.logger_.imgCounter = stateDict['imgCounter'] self.cOptimizer.load_state_dict(stateDict['cOptimizer']) self.gOptimizer.load_state_dict(stateDict['gOptimizer']) self.clrScheduler.load_state_dict( stateDict['clrScheduler']) self.glrScheduler.load_state_dict( stateDict['glrScheduler']) self.batchShown = stateDict['batchShown'] self.meanPathLength = stateDict['meanPathLength'] self.logger.debug(f'And the optimizers states as well') return True except Exception as e: self.logger.error( f'ERROR: The weights in {dir} could not be loaded\n {str(e)}\n Proceding from zero...' ) return False else: self.logger.error( f'ERROR: The file {dir} does not exist. Proceding from zero...' 
) return False def getReals(self, n=None): """ Returns n real images """ return self.dataLoader.get(n).to(device=self.device) def getFakes(self, n=None, z=None): """ Returns n fake images and their latent vectors """ if n is None: n = self.dataLoader.batchSize if z is None: z = utils.getNoise(bs=n, latentSize=self.latentSize, device=self.device) if self.styleMixingProb and random() < self.styleMixingProb: zmix = utils.getNoise(bs=n, latentSize=self.latentSize, device=self.device) zmix = (zmix - zmix.mean(dim=1, keepdim=True)) / ( zmix.std(dim=1, keepdim=True) + 1e-8) output = self.gen(z, zmix=zmix) else: output = self.gen(z) else: output = self.gen(z) if isinstance(output, list): return [*output, z] else: return [output, z] def getBatchReals(self): """ Returns a batch of real images """ return self.dataLoader.get_batch().to(device=self.device) def getBatchFakes(self): """ Returns a batch of fake images and the latent vector which generated it """ return self.getFakes() def R2GradientPenalization(self, reals, fakes): alpha = torch.rand(reals.size(0), 1, 1, 1, device=reals.device) interpols = (alpha * reals + (1 - alpha) * fakes).detach().requires_grad_(True) cOut = self.crit(interpols).sum() if self.applyLossScaling: cOut = applyLossScaling(cOut) ddx = autograd.grad(outputs=cOut, inputs=interpols, grad_outputs=torch.ones_like(cOut, device=self.device), create_graph=True, retain_graph=True, only_inputs=True)[0] ddx = ddx.view(ddx.size(0), -1) if self.applyLossScaling: ddx = undoLossScaling(ddx) return ( (ddx.norm(dim=1) - self.obj).pow(2)).mean() / (self.obj + 1e-8)**2 def R1GradientPenalization(self, reals): reals.requires_grad_(True) cOut = self.crit(reals).sum() if self.applyLossScaling: cOut = applyLossScaling(cOut) ddx = autograd.grad(outputs=cOut, inputs=reals, grad_outputs=torch.ones_like(cOut, device=self.device), create_graph=True, retain_graph=True, only_inputs=True)[0] ddx = ddx.view(ddx.size(0), -1) if self.applyLossScaling: ddx = undoLossScaling(ddx) return 0.5 * (ddx.pow(2).sum(dim=1)).mean() def GradientPathRegularization(self, fakes, latents): noise = torch.randn_like(fakes) / math.sqrt( fakes.size(2) * fakes.size(3)) ddx = autograd.grad(outputs=(fakes * noise).sum(), inputs=latents, create_graph=True)[0] pathLengths = ddx.norm(dim=1) if self.meanPathLength == 0: self.meanPathLength = pathLengths.mean() else: self.meanPathLength = self.meanPathLength + self.plDecay * ( pathLengths.mean() - self.meanPathLength) self.meanPathLength = self.meanPathLength.detach() return (pathLengths - self.meanPathLength).pow(2).mean() def trainCritic(self): """ Train the critic for one step and store outputs in logger """ utils.switchTrainable(self.crit, True) utils.switchTrainable(self.gen, False) # real real = self.dataLoader.get_batch().to(self.device) cRealOut = self.crit(x=real) # fake fake, *_ = self.getBatchFakes() cFakeOut = self.crit(x=fake.detach()) lossReals = self.criterion(cRealOut, truth=1) lossFakes = self.criterion(cFakeOut, truth=-1) loss = lossReals + lossFakes if self.batchShown % self.cLazyReg == self.cLazyReg - 1: if self.lambR2: loss += self.cLazyReg * self.lambR2 * self.R2GradientPenalization( real, fake) if self.epsilon: loss += self.epsilon * (cRealOut**2).mean() if self.lambR1: loss += self.lambR1 * self.R1GradientPenalization(real) self.cOptimizer.zero_grad() loss.backward() self.cOptimizer.step() if self.clrScheduler is not None: self.clrScheduler.step() #Reduce learning rate self.logger_.appendCLoss(loss, lossReals, lossFakes) def trainGenerator(self): """ Train 
Generator for 1 step and store outputs in logger """ utils.switchTrainable(self.gen, True) utils.switchTrainable(self.crit, False) fake, *latents = self.getBatchFakes() cFakeOut = self.crit(fake) loss = self.criterion(cFakeOut, truth=1) if self.batchShown % self.gLazyReg == self.gLazyReg - 1: if self.pathRegWeight > 0: dlatent = latents[0] loss += self.GradientPathRegularization( fake, dlatent) * self.gLazyReg * self.pathRegWeight if self.lambg > 0 and self.paterm: latent = latents[-1] pat = self.gen.paTerm(latent) * self.lambg * self.gLazyReg loss += pat self.gOptimizer.zero_grad() loss.backward() self.gOptimizer.step() if self.glrScheduler is not None: self.glrScheduler.step() #Reduce learning rate self.logger_.appendGLoss(loss) return fake.size(0) def train(self): """ Main train loop """ self.logger.info('Starting training...') self.logger_.startLogging() #Start the logger # loop over images while self.imShown < self.tick * self.loops: if self.kUnroll: for i in range(self.nCritPerGen): self.trainCritic() if i == 0: self.cBackup = copy.deepcopy(self.crit) else: for i in range(self.nCritPerGen): self.trainCritic() shown = self.trainGenerator( ) #Use the generator training batches to count for the images shown, not the critic if self.kUnroll: self.crit.load(self.cBackup) self.imShown = self.imShown + int(shown) self.batchShown = self.batchShown + 1 if self.batchShown > max(self.gLazyReg, self.cLazyReg): self.batchShown = 0 self.logger_.saveSnapshot( f'{self.resolution}x{self.resolution}_final_{self.latentSize}')
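For reference, a compact standalone sketch of the zero-centered R1 penalty that R1GradientPenalization computes, together with the lazy-regularization scaling idea used in trainCritic. Here `critic` is any module mapping a batch of images to one score per sample; loss scaling is omitted, so this is an illustration rather than a drop-in replacement:

import torch
from torch import autograd

def r1_penalty(critic, reals):
    # Zero-centered gradient penalty on real samples: 0.5 * E[ ||grad_x critic(x)||^2 ]
    reals = reals.detach().requires_grad_(True)
    scores = critic(reals).sum()
    (grad,) = autograd.grad(outputs=scores, inputs=reals, create_graph=True)
    return 0.5 * grad.flatten(1).pow(2).sum(dim=1).mean()

# Lazy regularization: when a penalty term is only added every N batches
# (as the R2 term is above, every cLazyReg batches), it is multiplied by N
# so that its average contribution to the gradient stays unchanged.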
parser.add_argument( '--config', type=str, default= '/home/rudra/Downloads/rudra/relationship_modeling/o2p2/physics_engine/configs/pre-planning.yml', help='Path to config file') opts = parser.parse_args() params = get_config(opts.config) pp = pprint.PrettyPrinter(indent=2) pp.pprint(params) # Define models and dataloaders train_loader, val_loader = initial_final_dataloader(params) model = O2P2Model(params) if params['use_cuda']: model = model.cuda() exp_results_path = params['project_root'] + '/results/' + params[ 'exp_name'] + '_' + timestamp + '/' exp_logs_path = params['project_root'] + '/logs/' + params[ 'exp_name'] + '_' + timestamp + '/' mkdir_p(exp_logs_path) mkdir_p(exp_results_path) logger = Logger(exp_logs_path) trainer = O2P2Trainer(params, model, train_loader, val_loader, logger, exp_results_path, exp_logs_path) trainer.train()
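The driver above leans on two small helpers, get_config and mkdir_p; a plausible minimal sketch of both, assuming the file behind --config is plain YAML (the project's actual helpers may differ):

import errno
import os
import yaml

def get_config(path):
    # Parse the experiment YAML into a plain dict of parameters.
    with open(path, "r") as handle:
        return yaml.safe_load(handle)

def mkdir_p(path):
    # Create the directory (and any missing parents), ignoring "already exists".
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise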
class FusionParser(object): """Get and parse results from previously run programs (fusion prediction, hla typing, expression estimation)""" # Initialization of parameters # urla: todo: pylint convention - too many arguments # are all vars required to be class vars? def __init__(self, scratch_path, fusion_output_path, sample_id, tool_num_cutoff, fusiontool_list, sample_log): """Parameter initiation and work folder creation.""" self.scratch_path = scratch_path self.fusion_output_path = fusion_output_path self.sample_id = sample_id self.tool_num_cutoff = int(tool_num_cutoff) # urla: if we want to be more generic and allow different annotations, identification of the chr names # (eg "chr1" vs "1" and "chrM" vs "MT") should be performed in advance self.chr_list = ("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "MT") self.tools = fusiontool_list.split(",") self.logger = Logger(sample_log) ##### ### fusion tool parser # # fusioncatcher - results file is "summary_candidate_fusions.txt" def get_fusioncatcher_results(self): """Load and parse results from fusioncatcher""" fusioncatcher_predict_summary = os.path.join( self.scratch_path, "fusion", "fusioncatcher", "summary_candidate_fusions.txt") fusioncatcher_predict_detail = os.path.join( self.scratch_path, "fusion", "fusioncatcher", "final-list_candidate-fusion-genes.txt") reciprocal_fusions = [] with open(fusioncatcher_predict_summary) as predict_summary: for line in predict_summary: if line.strip().startswith("*"): # urla: todo: pylint warning - anomalous backslash # I don't really understand the problem and the RE is simple and working very fine fusion_gene = re.search(r'\*\s([\S]*)', line).group(1) if "reciprocal" in line: reciprocal_fusions.append( fusion_gene.replace("--", "_").upper()) fusion_map = {} with open(fusioncatcher_predict_detail) as prediction: next(prediction) # skip header line for line in prediction: elements = line.rstrip().split("\t") # Currently relevant fields (marked *) in the output file are: # * elements[0] ~ Gene_1_symbol(5end_fusion_partner) # * elements[1] ~ Gene_2_symbol(3end_fusion_partner) # elements[2] ~ Fusion_description # elements[3] ~ Counts_of_common_mapping_reads # * elements[4] ~ Spanning_pairs # * elements[5] ~ Spanning_unique_reads # elements[6] ~ Longest_anchor_found # elements[7] ~ Fusion_finding_method # * elements[8] ~ Fusion_point_for_gene_1(5end_fusion_partner) # * elements[9] ~ Fusion_point_for_gene_2(3end_fusion_partner) # elements[10] ~ Gene_1_id(5end_fusion_partner) # elements[11] ~ Gene_2_id(3end_fusion_partner) # elements[12] ~ Exon_1_id(5end_fusion_partner) # elements[13] ~ Exon_2_id(3end_fusion_partner) # elements[14] ~ Fusion_sequence # elements[15] ~ Predicted_effect fusion_gene = (elements[0] + "_" + elements[1]).upper() # if the fusion gene is reciprocal, the fusion id is reversed? <- what for?? if fusion_gene in reciprocal_fusions: fusion_gene = (elements[1] + "_" + elements[0]).upper() # for key in self.sub_dict: # fusion_gene = fusion_gene.replace(key, self.sub_dict[key]) # urla: why not catching "Counts_of_common_mapping_reads" which indicate how similar the fusion partners are? <- FP's; should be 0 for max specificity #common_map_num = elements[3] # urla: according to online manual, junction reads are in elements[5] and all supporting (junction and spanning?) in elements[4]?! 
# urla_c: I changed it here # skip all prediction not on standard chromosomes if elements[8].split(":")[0] not in self.chr_list or elements[ 9].split(":")[0] not in self.chr_list: continue fgid = fusion_gene.split("_")[0] + "_" + elements[ 8] + "_" + fusion_gene.split("_")[1] + "_" + elements[9] fusion_map[fgid] = [ fusion_gene, # fusion_gene elements[8], # up_gene_bp elements[9], # dn_gene_bp elements[ 5], # junc_reads_num - urla: todo: verify that this is correct elements[ 4], # span_reads_num - urla: todo: verify that this is correct self.sample_id, "Fusioncatcher" ] return fusion_map # starfusion - results file is "star-fusion.fusion_candidates.final (not any more, now: star-fusion.fusion_predictions.abridged.tsv)" # in order to be compatible with old and new starfusion versions, the new file name is checked first, if not available, the old one is used. # urla - note: this is rather a hack then a proper solution, but proper version handling would actually require to dig through all versions # and look for changes in file names and output columns and maybe additional stuff... this would properly require a tremendous # amount of time and is therefore out of scope of this project def get_starfusion_results(self): """Load and parse results from star-fusion""" starfusion_predict = os.path.join( self.scratch_path, "fusion", "starfusion", "star-fusion.fusion_predictions.abridged.tsv") if not os.path.isfile(starfusion_predict): starfusion_predict = os.path.join( self.scratch_path, "fusion", "starfusion", "star-fusion.fusion_candidates.final") fusion_map = {} with open(starfusion_predict, "r") as prediction: next(prediction) # skip header line for line in prediction: elements = line.rstrip().split("\t") # Currently relevant fields (marked *) in the output file are: # * elements[0] ~ FusionName # * elements[1] ~ JunctionReadCount # * elements[2] ~ SpanningFragCount # elements[3] ~ SpliceType # elements[4] ~ LeftGene # * elements[5] ~ LeftBreakpoint # elements[6] ~ RightGene # * elements[7] ~ RightBreakpoint # elements[8] ~ LargeAnchorSupport # elements[9] ~ FFPM # elements[10] ~ LeftBreakDinuc # elements[11] ~ LeftBreakEntropy # elements[12] ~ RightBreakDinuc # elements[13] ~ RightBreakEntropy # elements[14] ~ annots fusion_gene = elements[0].replace("--", "_").upper() # check whether fusion gene is not on primary chr if elements[5].split(":")[0] not in self.chr_list or elements[ 7].split(":")[0] not in self.chr_list: continue fgid = fusion_gene.split("_")[0] + "_" + elements[ 5] + "_" + fusion_gene.split("_")[1] + "_" + elements[7] #self.logger.debug("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(fgid, fusion_gene, elements[5], elements[7], elements[1], elements[2], self.sample_id)) fusion_map[fgid] = [ fusion_gene, # fusion_gene elements[5], # up_gene_bp elements[7], # dn_gene_bp elements[1], # junc_reads_num elements[2], # span_reads_num self.sample_id, "Starfusion" ] return fusion_map # mapsplice2 - results file is "fusions_well_annotated.txt" def get_mapsplice_results(self): """Load and parse results from mapsplice""" mapsplice_predict = os.path.join(self.scratch_path, "fusion", "mapsplice", "fusions_well_annotated.txt") fusion_map = {} with open(mapsplice_predict) as prediction: # next(prediction) # mapsplice final result table, doesn't have a header! 
for line in prediction: elements = line.rstrip().split("\t") # Currently relevant fields (marked *) in the output file are: # * elements[0] ~ chrom: the two chromosomes involved in fusion junction # * elements[1] ~ doner_end: The end position of doner site of splicing on chromosome # * elements[2] ~ acceptor_start: The start position of acceptor site of splicing on chromosome # elements[3] ~ id: The id of fusion junction # * elements[4] ~ coverage: number of reads aligned to the fusion junction # * elements[5] ~ strand: strand of the reads mapped to the two chromosomes # elements[6] ~ rgb: An RGB value of the form R,G,B # elements[7] ~ block_count: The number of blocks in the BED line # elements[8] ~ block_size: A comma-separated list of the block sizes. # elements[9] ~ block_distance: A comma-separated list of block distance. # elements[10] ~ entropy: entropy of the fusion junction. # elements[11] ~ flank_case: non-zero for canonical and semi-canonical junctions (ATAC 1;GTAT 2;CTGC 3;GCAG 4;GTAG 5;CTAC 6;others 0) # elements[12] ~ flank_string: the two basepairs after doner site combined the two basepairs before acceptor site # elements[13] ~ min_mismatch: Minimal mismatch of read mapped to the fusion junction # elements[14] ~ max_mismatch: Maximal mismatch of read mapped to the fusion junction # elements[15] ~ ave_mismatch: Average mismatch of all reads mapped to the junction # elements[16] ~ max_min_suffix: if doner site is shorter than acceptor site, and if the doner site is longer than current maximal doner site length, then update current maximal doner site length # elements[17] ~ max_min_prefix: if acceptor site is shorter than doner site, and if the doner site is longer than current maximal acceptor site length, then update current maximal acceptor site length # elements[18] ~ min_anchor_difference: Minimal difference between doner site and acceptor site # * elements[19] ~ unique_read_count: Number of uniquely mapped reads mapped to the fusion # elements[20] ~ multi_read_count: Number of multiple mapped reads mapped to the fusion # elements[21] ~ paired_read_count: Number of reads mapped to fusion and can be paired with their mates near the fusion # elements[22] ~ left_paired_read_count: Number of paired reads that the read itself is mapped to the left of its mate on genome # elements[23] ~ right_paired_read_count: Number of paired reads that the read itself is mapped to the right of its mate on genome # elements[24] ~ multiple_paired_read_count: Number of multiple mapped reads mapped to the fusion and are paired with their mates # elements[25] ~ unique_paired_read_count: Number of uniquely mapped reads mapped to the fusion and are paired with their mates # elements[26] ~ single_read_count: Number of reads mapped to the fusion but can't be paired with their mates # elements[27] ~ encompassing_read pair_count: Number of reads pairs surround the fusion(but not cross the fusion) # elements[28] ~ doner_start: The start of doner site of splicing on chromosome # elements[29] ~ acceptor_end: The end of acceptor site of splicing on chromosome # elements[30] ~ doner_iosforms: The isoform(transcript) structure on the doner site. each isoform structure is separated by '|'. The format of each isoform structure is the "start_of_the_isoform,CIGAR_string_of_structure". E.g. 59445681,180M12006N66M8046N47M|59445681,180M20118N47M| Two isoforms start at 59445681 # elements[31] ~ acceptor_isoforms: The isoform(transcript) structure on the acceptor site. 
# elements[32] ~ doner uniformity score (obsolete): The p-value of T-test against the hypothesis that the start of spanning read pairs and encompassing read pairs distribute uniformly on the doner site # elements[33] ~ acceptor uniformity score (obsolete): The p-value of Kolmogorov-Smirnov test against the hypothesis that the end of spanning read pairs and encompassing read pairs distribute uniformly on the acceptor site # elements[34] ~ doner uniformity KS-test score (obsolete): The score of Kolmogorov-Smirnov test against the hypothesis that the start of spanning read pairs and encompassing read pairs distribute uniformly on the doner site # elements[35] ~ acceptor uniformity KS-test score (obsolete): The score of Kolmogorov-Smirnov test against the hypothesis that the end of spanning read pairs and encompassing read pairs distribute uniformly on the acceptor site # elements[36] ~ minimal_doner_isoform_length: Minimal length of isoform structure on the doner site # elements[37] ~ maximal_doner_isoform_length: Maximal length of isoform structure on the doner site # elements[38] ~ minimal_acceptor_isoform_length: Minimal length of isoform structure on the acceptor site # elements[39] ~ maximal_acceptor_isoform_length: Maximal length of isoform structure on the acceptor site # elements[40] ~ paired_reads_entropy: entropy of different read pairs # elements[41] ~ mismatch_per_bp: Average mismatch per base. # elements[42] ~ anchor_score: Anchor score. # elements[43] ~ max_doner_fragment: Maximum doner fragment size. # elements[44] ~ max_acceptor_fragment: Maximum acceptor fragment size. # elements[45] ~ max_cur_fragment: Maximum total fragment length of doner and acceptor. # elements[46] ~ min_cur_fragment: Minimum total fragment length of doner and acceptor # elements[47] ~ ave_cur_fragment: Average total fragment length of doner and acceptor. # elements[48] ~ doner_encompass_unique: Number of uniquely mapped reads surround the donor site of the fusion # elements[49] ~ doner_encompass_multiple: Number of multiple mapped reads surround the donor site of the fusion # elements[50] ~ acceptor_encompass_unique: Number of uniquely mapped reads surround the acceptor site of the fusion # elements[51] ~ acceptor_encompass_multiple: Number of multiple mapped reads surround the acceptor site of the fusion # elements[52] ~ doner_match_to_normal: If the fusion doner site is matched to a normal splice junction. 1 is matched, 0 is not matched # elements[53] ~ acceptor_match_to_normal: If the fusion doner site is matched to a normal splice junction. 1 is matched, 0 is not matched # elements[54] ~ doner_seq: The 25bp sequence at doner site matched to the fusion reads, doner end base included. # if doner strand is +, it is chrom1[donerEnd-24:donerEnd] # if doner strand is -, it is revcomp(chrom1[donerEnd:donerEnd+24])) # elements[55] ~ acceptor_seq: The 25bp sequence at acceptor site matched to the fusion reads, acceptor start base included. # if acceptor strand is +, it is chrom2[acceptorStart:acceptorStart+24] # if acceptor strand is -, it is revcomp(chrom2[acceptorStart-24:acceptorStart]) # Note: due to our internal use purposes, the acceptor_seq is always reverse complemented again in MapSplice fusion junction file, which makes it effectively as the following. 
You can do a reverse completement to acceptor_seq to make the sequence exactly the same as it is mentioned above: # if acceptor strand is +, it is revcom(chrom2[acceptorStart:acceptorStart+24]) # if acceptor strand is -, it is chrom2[acceptorStart-24:acceptorStart] # elements[56] ~ match_gene_strand (only if --gene-gtf specified): If the fusion strand matched with the annotated gene strand. 1 is matched, 0 is not matched # elements[57] ~ annotated_type (only if --gene-gtf specified): The source of fusion # from_fusion: The fusion is from fusion alignments # from_normal: The fusion is from normal alignments, which is normal junction cross two genes(read through fusions) # elements[58] ~ fusion_type (only if --gene-gtf specified): The type of fusion based on the annotated gene # fusion: The start and end of the fusion is annotated to two distinct genes # normal: The start and end of the fusion is annotated to same gene(Circular RNAs) # intergenic: Either the start or end has no gene annotated # overlapped? # elements[59] ~ gene_strand (only if --gene-gtf specified): The annotated genes strands, if there are # * elements[60] ~ annotated_gene_donor (only if --gene-gtf specified): The name of the gene annotated to the doner site of the fusion # * elements[61] ~ annotated_gene_acceptor (only if --gene-gtf specified): The name of the gene annotated to the acceptor site of the fusion fusion_gene = (elements[60].split(",")[0] + "_" + elements[61].split(",")[0]).upper() # element[0] = chr num, [1/2] = breakpoints, [5] = strands up_gene_id = elements[0].split( "~")[0] + ":" + elements[1] + ":" + elements[5][0] dn_gene_id = elements[0].split( "~")[1] + ":" + elements[2] + ":" + elements[5][1] if up_gene_id.split( ":")[0] not in self.chr_list or dn_gene_id.split( ":")[0] not in self.chr_list: continue fgid = fusion_gene.split( "_")[0] + "_" + up_gene_id + "_" + fusion_gene.split( "_")[1] + "_" + dn_gene_id fusion_map[fgid] = [ fusion_gene, # fusion_gene up_gene_id, # up_gene_bp dn_gene_id, # dn_gene_bp elements[4], # junc_reads_num elements[27], # span_reads_num self.sample_id, "Mapsplice" ] return fusion_map # starchip - results file is "starchip.summary" def get_starchip_results(self): """Load and parse results from starchip""" starchip_predict = os.path.join(self.scratch_path, "fusion", "starchip", "starchip.summary") fusion_map = {} with open(starchip_predict, "r") as prediction: next(prediction) # skip header line for line in prediction: elements = line.rstrip().split("\t") # Currently relevant fields (marked *) in the output file are: # * elements[0] ~ Partner1 # * elements[1] ~ Partner2 # * elements[2] ~ SpanningReads # * elements[3] ~ SplitReads # elements[4] ~ AvgAS # * elements[5] ~ NearGene1 # elements[6] ~ Distance1 # * elements[7] ~ NearGene2 # elements[8] ~ Distance2 # elements[9] ~ ConsensusSeq fusion_gene = "{0}_{1}".format(elements[5], elements[7]).upper() # check whether fusion gene is not on primary chr if elements[0].split(":")[0] not in self.chr_list or elements[ 1].split(":")[0] not in self.chr_list: continue fgid = elements[5] + "_" + elements[0] + "_" + elements[ 7] + "_" + elements[1] fusion_map[fgid] = [ fusion_gene, # fusion_gene elements[0], # up_gene_bp elements[1], # dn_gene_bp elements[3], # junc_reads_num elements[2], # span_reads_num self.sample_id, "Starchip" ] return fusion_map # infusion - results file is "fusions.detailed.txt" def get_infusion_results(self): """Load and parse results from starchip""" infusion_predict = os.path.join(self.scratch_path, "fusion", 
"infusion", "fusions.detailed.txt") fusion_map = {} with open(infusion_predict, "r") as prediction: next(prediction) # skip header line for line in prediction: elements = line.rstrip().split("\t") # Currently relevant fields (marked *) in the output file are: # elements[0] ~ id # elements[1] ~ ref1 # elements[2] ~ break_pos1 # elements[3] ~ region1 # elements[4] ~ ref2 # elements[5] ~ break_pos2 # elements[6] ~ region2 # elements[7] ~ num_split # elements[8] ~ num_paired # elements[9] ~ num_split_with_pair # elements[10] ~ num_split_rescued # elements[11] ~ num_uniq_starts # elements[12] ~ pap_rate # elements[13] ~ mean_split_pos # elements[14] ~ split_pos_std # elements[15] ~ homogeneity # elements[16] ~ coverage_context # elements[17] ~ ssp # elements[18] ~ fusion_class # elements[19] ~ break_on_exon # elements[20] ~ feature_1 # elements[21] ~ gene_1 # elements[22] ~ transcript_1 # elements[23] ~ gene_1_strand # elements[24] ~ biotype_1 # elements[25] ~ expression_1 # elements[26] ~ feature_2 # elements[27] ~ gene_2 # elements[28] ~ transcript_2 # elements[29] ~ gene_2_strand # elements[30] ~ biotype_2 # elements[31] ~ expression_2 # elements[32] ~ splice_motif # elements[33] ~ filters fusion_gene = "{0}_{1}".format( self.get_fusion_gene_id_infusion(elements[21], elements[22]), self.get_fusion_gene_id_infusion(elements[27], elements[28])).upper() # element[1/4] = chr num, [2/5] = breakpoints, [23/29] = strands up_gene_id = elements[1] + ":" + elements[2] + ":" + elements[ 23] dn_gene_id = elements[4] + ":" + elements[5] + ":" + elements[ 29] # check whether fusion gene is not on primary chr if elements[1] not in self.chr_list or elements[ 4] not in self.chr_list: continue fgid = fusion_gene.split( "_")[0] + "_" + up_gene_id + "_" + fusion_gene.split( "_")[1] + "_" + dn_gene_id fusion_map[fgid] = [ fusion_gene, # fusion_gene up_gene_id, # up_gene_bp dn_gene_id, # dn_gene_bp elements[7], # junc_reads_num elements[8], # span_reads_num self.sample_id, "Infusion" ] return fusion_map @staticmethod def get_fusion_gene_id_infusion(gene_id, transcript_list_field): """Helper method for infusion data parsing. Returns the most frequently listed fusion id in a list of fusion ids""" # if only 1 gene id is returned as fusion partner, return this if not ";" in gene_id: return gene_id.replace("_", "-") # if 2 or more gene ids are given, take the most frequent from the transcript list gid_dict = {} for gid in transcript_list_field.split(";"): id_test = gid.split(":")[0][:-4] if not id_test in gid_dict: gid_dict[id_test] = 0 gid_dict[id_test] += 1 best_hit = max(gid_dict.iteritems(), key=operator.itemgetter(1))[0] return best_hit.replace("_", "-") # soapfuse - results file is "*.final.Fusion.specific.for.genes" in "final_fusion_genes" # urla - note: it seems like soapfuse is prefixing chromsome ids with "chr" even if eg ensembl data is used which does not have this # I'm replacing this here because this is our recommended dataset... 
def get_soapfuse_results(self): """Load and parse results from soapfuse""" soapfuse_predict = "" folder_to_scan = os.path.join(self.scratch_path, "fusion", "soapfuse", "final_fusion_genes") for filename in os.listdir(folder_to_scan): folder_path = os.path.join(folder_to_scan, filename) if os.path.isdir(folder_path): for res in os.listdir(folder_path): if res.endswith(".final.Fusion.specific.for.genes"): soapfuse_predict = os.path.join(folder_path, res) if not soapfuse_predict: soapfuse_predict = os.path.join( self.scratch_path, "fusion", "soapfuse", "final_fusion_genes", self.sample_id, self.sample_id + ".final.Fusion.specific.for.genes") fusion_map = {} with open(soapfuse_predict) as prediction: next(prediction) # skip header line for line in prediction: elements = line.rstrip().split("\t") # Currently relevant fields (marked *) in the output file are: # * elements[0] ~ up_gene # * elements[1] ~ up_chr # * elements[2] ~ up_strand # * elements[3] ~ up_Genome_pos # elements[4] ~ up_loc # * elements[5] ~ dw_gene # * elements[6] ~ dw_chr # * elements[7] ~ dw_strand # * elements[8] ~ dw_Genome_pos # elements[9] ~ dw_loc # * elements[10] ~ Span_reads_num # * elements[11] ~ Junc_reads_num # elements[12] ~ Fusion_Type # elements[13] ~ down_fusion_part_frame-shift_or_not fusion_gene = (elements[0] + "_" + elements[5]).upper() # element[1/6] = chr num, [3/8] = breakpoints, [2/7] = strands up_gene_id = elements[1] + ":" + elements[3] + ":" + elements[2] dn_gene_id = elements[6] + ":" + elements[8] + ":" + elements[7] # check whether fusion gene is not on primary chr if elements[1] not in self.chr_list or elements[ 6] not in self.chr_list: continue fgid = fusion_gene.split( "_")[0] + "_" + up_gene_id + "_" + fusion_gene.split( "_")[1] + "_" + dn_gene_id fusion_map[fgid] = [ fusion_gene, # fusion_gene up_gene_id, # up_gene_bp dn_gene_id, # dn_gene_bp elements[11], # junc_reads_num elements[10], # span_reads_num self.sample_id, "Soapfuse" ] return fusion_map # pizzly - results file is "kallizzy.json.txt" def get_pizzly_results(self): """Load and parse results from pizzly""" pizzly_predict = os.path.join(self.scratch_path, "fusion", "pizzly", "kallizzy.json.txt") fusion_map = {} with open(pizzly_predict, "r") as prediction: next(prediction) # skip header line for line in prediction: elements = line.rstrip().split("\t") # Currently relevant fields (marked *) in the output file are: # * elements[0] ~ geneA.name # elements[1] ~ geneA.id # * elements[2] ~ geneB.name # elements[3] ~ geneB.id # * elements[4] ~ paircount # * elements[5] ~ splitcount # elements[6] ~ transcripts.list # Pizzly is a overpredictor with a high FP ratio in its current version # Therefore, only events supported by at least (paircount + splitcount >=3) are considered if int(elements[4]) + int(elements[5]) < 3: continue fusion_gene = "{0}_{1}".format(elements[0], elements[2]).upper() # check whether fusion gene is not on primary chr - not possible for pizzly as exact breakpoint cannot directly be determined from pizzly output # if elements[0].split(":")[0] not in self.chr_list or elements[1].split(":")[0] not in self.chr_list: # continue fgid = elements[0] + "_1:100:+_" + elements[2] + "_2:100:+" fusion_map[fgid] = [ fusion_gene, # fusion_gene "1:100:+", # up_gene_bp "2:100:+", # dn_gene_bp elements[5], # junc_reads_num elements[4], # span_reads_num self.sample_id, "Pizzly" ] return fusion_map ##### ### parser of fusion tool parser outputs # # urla: at least writing to tool_state_path is sort of obsolete as it is never used downstream 
of here def concatenate_fusion_results(self, tool_state_path, fusion_output_path): """Return tuple of (dict of results dicts, dict of errors) and writes error/pass to summary file""" with open(tool_state_path, "a") as outf: fusion_result_dict = {} # stores the complete fusion map per tool results_with_errors_dict = { } # stores booleans indicating whether an exception was raised during tool execution self.logger.info("Processing " + self.sample_id) sample_string = self.sample_id for tool in self.tools: fusion_result_dict[tool], results_with_errors_dict[ tool] = self.get_tool_results(fusion_output_path, tool) if results_with_errors_dict[tool]: sample_string += ";0" else: sample_string += ";1" sample_string = sample_string.rstrip(";") outf.write(sample_string + "\n") return (fusion_result_dict, results_with_errors_dict) def get_tool_results(self, output_folder_path, tool): """Return tuple of (dict of results from individual fusion tools, error type)""" pred_res_dict = {} # dictionary of prediction results self.logger.info("Parsing results for " + tool) try: if tool == "Fusioncatcher": pred_res_dict = self.get_fusioncatcher_results() elif tool == "Starfusion": pred_res_dict = self.get_starfusion_results() elif tool == "Mapsplice": pred_res_dict = self.get_mapsplice_results() elif tool == "Starchip": pred_res_dict = self.get_starchip_results() elif tool == "Infusion": pred_res_dict = self.get_infusion_results() elif tool == "Soapfuse": pred_res_dict = self.get_soapfuse_results() elif tool == "Pizzly": pred_res_dict = self.get_pizzly_results() # pylint exceptions: # the caught exception is not further specified as several different exceptions may be raised during processing # the type of exception is, however, unimportant for further processing, because any exception must be manually reviewed except Exception as ukn_err: # pylint: disable=W0703 self.logger.error( "Couldn't fetch results from {0}, please check data in {1}. Error message: {2}" .format(tool, output_folder_path, ukn_err)) return (pred_res_dict, True) tool_res_file = os.path.join(output_folder_path, tool + "_res.csv") with open(tool_res_file, "w") as tool_outf: tool_outf.write( "fgid;fusion_gene;breakpoint1;breakpoint2;junc_reads;span_reads;sample_id;tool\n" ) for key in pred_res_dict: tool_outf.write(key + ";" + ";".join(pred_res_dict[key]) + "\n") return (pred_res_dict, False) # pylint: enable=W0703 # urla: naming is terribly complicated to understand and follow, changed every single var name! # todo: the method seems to be very complicated for a relatively simple task => is there a simpler solution? # @param dict_of_fusion_results: This is a dictionary of dictionaries. # For each fusion prediction tool, a dictionary is created with keys=fuid # and value=fusion_info; these are themselves organised in a dict # with keys=fusion_tool and values=fusion_tool_dict def lookup_fusions_in_prediction(self, dict_of_fusion_dicts): """Return a dict mapping each unique fusion id that was predicted by at least tool_num_cutoff tools to a per-tool list of booleans indicating which tools reported it""" # @param dict_of_fusionid_lists: This is a dictionary of lists. # For each fusion tool (=keys of the dict) it contains a # list (=value of the dict) the keys from the respective # fusion tool dict dict_of_fusionid_lists = {} # for every tool in the results dict for fusion_tool in dict_of_fusion_dicts: # if no fusions were predicted by a tool, set an empty list # urla: is this actually required? it would anyway become an empty # list in the for loop, wouldn't it? 
if len(dict_of_fusion_dicts[fusion_tool]) == 0: # this is a "number of elements" check => pylint: disable=C1801 dict_of_fusionid_lists[fusion_tool] = [] # for each fusion id (=keys of the respective fusion dict), # append them to a list in the new dict # urla: todo: the exception should only be raised, if the list is # not existing (remove generalization) # todo2: why not initializing an empty list for all in advance? # i.e.: putting "dict_of_fusionid_lists[fusion_tool] = []" # at the start of the for loop (would also eliminate the if len(...)) for fusion_id in dict_of_fusion_dicts[fusion_tool]: try: dict_of_fusionid_lists[fusion_tool].append(fusion_id) except KeyError: print( "Error when trying to append to list of tool {0}. Trying to create new list with {1} at start" .format(fusion_tool, fusion_id)) dict_of_fusionid_lists[fusion_tool] = [fusion_id] # create a list of unique fusion ids list_of_all_fusion_ids = [] for tool in dict_of_fusionid_lists: list_of_all_fusion_ids += dict_of_fusionid_lists[tool] list_of_unique_fusion_ids = list(set(list_of_all_fusion_ids)) # @param dict_of_found_uniq_fusions: A dictionary of lists, where keys # are the unique fusion ids and values a list of booleans indicating # whether or not the fusion was found in a fusion prediction tool dict_of_found_uniq_fusions = {} for uniq_fusion_id in list_of_unique_fusion_ids: # split the fusion id into [0]=gene1, [1]=breakpoint of gene1, # [2]=gene2, [3]=breakpoint of gene2 uniq_fusion_id_split = uniq_fusion_id.split("_") list_of_found_fusion_booleans = [] # for each fusion tool for fusion_tool in dict_of_fusion_dicts: found_fusion = False # for each fusion id in the list of fusions per tool for fusion_id in dict_of_fusionid_lists[fusion_tool]: fusion_id_split = fusion_id.split("_") # check if gene1 and gene2 of the unique fusion which is being tested # is present in the current fusion (independent of the orientation) # urla: this will lead to false positive results: # eg: fusion with gene1 "AB1" and gene2 "AB2" will match to fusion "AB11"-"AB22" #if uniq_fusion_id_split[0] in fusion_id_split and uniq_fusion_id_split[2] in fusion_id_split: # urla: possible solution: if ((uniq_fusion_id_split[0] == fusion_id_split[0] and uniq_fusion_id_split[2] == fusion_id_split[2]) or (uniq_fusion_id_split[0] == fusion_id_split[2] and uniq_fusion_id_split[2] == fusion_id_split[0])): found_fusion = True break # we don't need to look further if it was found at least once list_of_found_fusion_booleans.append(found_fusion) if sum(list_of_found_fusion_booleans) >= self.tool_num_cutoff: dict_of_found_uniq_fusions[ uniq_fusion_id] = list_of_found_fusion_booleans return dict_of_found_uniq_fusions def run(self): """Concatenate the per-tool results, apply the tool count cutoff and write the Detected_Fusions table""" tool_state_path = os.path.join(self.fusion_output_path, "tool_state.csv") with open(tool_state_path, "w") as tool_state: tool_state.write("Sample ID") for tool in self.tools: tool_state.write(", {}".format(tool)) tool_state.write("\n") detected_fusions_file = os.path.join(self.fusion_output_path, "Detected_Fusions.csv") with open(detected_fusions_file, "w") as fus_file: # write header fus_file.write( "FGID;Fusion_Gene;Breakpoint1;Breakpoint2;Junction_Reads;Spanning_Reads;Sample;Tool\n" ) count_fusions = 0 self.logger.debug("Generating Detected Fusions table") fusion_result_dict, results_with_errors_dict = self.concatenate_fusion_results( tool_state_path, self.fusion_output_path) print(len(fusion_result_dict)) if sum(results_with_errors_dict.values()) == len( fusion_result_dict): self.logger.error( "Fusion parsing failed 
completely. Revision required. Aborting." ) sys.exit(1) elif sum(results_with_errors_dict.values()) != 0: self.logger.error( "Results incomplete. Please make sure that all tools have run completely on the dataset." ) dict_of_boolean_list_of_found_uniq_fusions = self.lookup_fusions_in_prediction( fusion_result_dict) # this is snake case, although too long... pylint: disable=C0103 # for each unique fusion for uniq_fusion_id in dict_of_boolean_list_of_found_uniq_fusions: # for each fusion tool for fusion_tool_num_in_list, fusion_tool in enumerate( fusion_result_dict, 0): # if the unique fusion was found in a fusion tool if dict_of_boolean_list_of_found_uniq_fusions[ uniq_fusion_id][fusion_tool_num_in_list]: # for each fusion id of fusion tool X for fusion_id in fusion_result_dict[fusion_tool]: # if the fusion id of the tool matches the unique fusion id, write everything to file # urla: it is probably better to iterate over all fusions, instead of using "get" on the dict # because a fusion gene can be called more than once with slightly different breakpoints if fusion_id == uniq_fusion_id: count_fusions += 1 fus_file.write( uniq_fusion_id + ";" + ";".join(fusion_result_dict[fusion_tool] [fusion_id]) + "\n") self.logger.info("Wrote {0} detected fusion genes to {1}.".format( count_fusions, detected_fusions_file))
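Every get_*_results method above populates fusion_map with the same record layout, keyed by an FGID built from the two gene symbols and their breakpoints. A small illustration of that convention, with made-up gene names and coordinates:

def build_fgid(up_gene, up_bp, dn_gene, dn_bp):
    # FGID convention shared by all parsers: 5' gene, 5' breakpoint, 3' gene, 3' breakpoint,
    # with each breakpoint encoded as chr:position:strand.
    return "{0}_{1}_{2}_{3}".format(up_gene, up_bp, dn_gene, dn_bp)

# Example (illustrative values only):
# build_fgid("GENEA", "1:1000:+", "GENEB", "2:2000:-")
# -> 'GENEA_1:1000:+_GENEB_2:2000:-'
# The value stored under this key is always:
# [fusion_gene, up_gene_bp, dn_gene_bp, junc_reads_num, span_reads_num, sample_id, tool_name]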
def eval_patch_shuffle(model, dataset_builder, max_num_devide: int, num_samples: int, batch_size: int, num_workers: int, top_k: int, log_dir: str, log_params: dict = {}, suffix: str = '', shuffle: bool = False, **kwargs): """ Args - model: NN model - dataset_builder: DatasetBuilder class object - max_num_devide: maximum number of divisions per image side - num_samples: number of samples to use. if -1, all samples are used - batch_size: size of batch - num_workers: number of workers - top_k: use top-k accuracy - log_dir: log directory - log_params: params which are logged in the dataframe. these params are useful for the plot legend. - suffix: suffix of log - shuffle: shuffle data """ assert max_num_devide >= 1 assert num_samples >= 1 or num_samples == -1 assert batch_size >= 1 assert num_workers >= 1 assert top_k >= 1 log_path = os.path.join(log_dir, 'patch_shuffle_result' + suffix + '.csv') logger = Logger(path=log_path, mode='test') # log params # logger.log(log_params) acc_dict = {} images_list = [] for num_devide in tqdm.tqdm(range(1, max_num_devide + 1)): log_dict = collections.OrderedDict() # build Patch Shuffled dataset patch_shuffle_transform = PatchShuffle(num_devide, num_devide) dataset = dataset_builder(train=False, normalize=True, optional_transform=[patch_shuffle_transform]) if num_samples != -1: num_samples = min(num_samples, len(dataset)) indices = [i for i in range(num_samples)] dataset = torch.utils.data.Subset(dataset, indices) loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers, pin_memory=True) with torch.autograd.no_grad(): num_correct = 0.0 for i, (x, t) in enumerate(loader): model.eval() x = x.to('cuda', non_blocking=True) t = t.to('cuda', non_blocking=True) model.zero_grad() logit = model(x) num_correct += get_num_correct(logit, t, topk=top_k) if i == 0: images_list.append(x[10]) acc = num_correct / float(len(dataset)) key = '{num_devide}'.format(num_devide=num_devide) acc_dict[key] = acc log_dict['num_devide'] = num_devide log_dict['accuracy'] = acc logger.log(log_dict) print(acc_dict) # save data torch.save( acc_dict, os.path.join(log_dir, 'patch_shuffle_acc_dict' + suffix + '.pth')) torchvision.utils.save_image(torch.stack(images_list, dim=0), os.path.join( log_dir, 'example_images' + suffix + '.png'), nrow=max_num_devide) plot(csv_path=log_path, x='num_devide', y='accuracy', hue=None, log_path=os.path.join(log_dir, 'plot.png'), save=True)
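eval_patch_shuffle depends on a PatchShuffle transform that cuts each image into a grid and permutes the patches; an illustrative version of that idea for a CHW tensor (the actual PatchShuffle class is not shown here and may differ in detail):

import torch

def patch_shuffle(img, num_h, num_w):
    # Split a CHW image into a num_h x num_w grid of patches and randomly permute them.
    # Assumes H is divisible by num_h and W is divisible by num_w.
    c, h, w = img.shape
    ph, pw = h // num_h, w // num_w
    patches = img.unfold(1, ph, ph).unfold(2, pw, pw)      # (C, num_h, num_w, ph, pw)
    patches = patches.reshape(c, num_h * num_w, ph, pw)
    patches = patches[:, torch.randperm(num_h * num_w)]    # shuffle patch order
    patches = patches.reshape(c, num_h, num_w, ph, pw)
    return patches.permute(0, 1, 3, 2, 4).reshape(c, h, w)

# e.g. shuffled = patch_shuffle(torch.rand(3, 32, 32), num_h=4, num_w=4)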