def collectRptScout(DIR, tool):
    """Collect statistics for every RPT_SCOUT run under ``DIR`` into ``stats_map``.

    For each key of ``data_map`` and each ``f`` in ``f_list`` a record is
    created with ``create_stats_hash`` and filled with whatever is available:

    * counts and rates returned by ``perform_stats.perform_stats``,
    * resource usage of the tool job and of the RM job, read from the stored
      PBS job objects in ``DIR/../job_log``,
    * coverage values parsed from the ``pra_analysis2`` output.

    The record is stored as ``stats_map[(tool, org, None, f)]``.

    Args:
        DIR: Directory holding the per-run result directories.
        tool: Tool name; indexes ``tool_prefix`` and is part of the
            ``stats_map`` key.

    Returns:
        None. All results are written to the module-level ``stats_map``.
    """
    for org in data_map:
        for f in f_list:
            RS_job_file = DIR + "/../job_log/{prefix}.{org}.s0.f{f}".format(
                prefix=tool_prefix[tool], org=org, f=f)
            RM_job_file = DIR + "/../job_log/rm.{prefix}.{org}.s0.f{f}".format(
                prefix=tool_prefix[tool], org=org, f=f)
            # Result directories are named with the upper-cased run identifier.
            RS_dir = DIR + "/" + "{org}.s0.f{f}".format(org=org, f=f).upper()
            RM_file = RS_dir + "/" + "{org}.fa.out".format(org=org)
            blast_file = RS_dir + "/" + "{org}.s0.f{f}.RS.blast.6.txt.bz2".format(org=org, f=f)
            pra_output = "{DIR}/{org}.s0.f{f}.pra.txt".format(DIR=RS_dir, org=org, f=f)
            real_repeats = data_map[org] + ".out"

            H = create_stats_hash(tool, org, None, int(f))

            # Get stats from RM run.
            try:
                Counts, Stats, _sets = perform_stats.perform_stats(real_repeats, RM_file, None)
                H["tp"], H["fp"], H["fn"], H["tn"] = Counts
                H["tpr"], H["tnr"], H["ppv"], H["npv"], H["fpr"], H["fdr"] = Stats
            except Exception:
                # Deliberately best effort: if the analysis fails, H keeps
                # whatever values it already holds.
                pass

            # Get resource usage from the RPT_SCOUT run, then from the RM run.
            job_resources = (
                (RS_job_file, ("ToolCpuTime", "ToolWallTime", "ToolMem", "ToolVMem")),
                (RM_job_file, ("RMCpuTime", "RMWallTime", "RMMem", "RMVMem")),
            )
            for job_file, (cpu_key, wall_key, mem_key, vmem_key) in job_resources:
                if not os.path.exists(job_file):
                    continue
                # Close the handle as soon as the job object has been read.
                with open(job_file, "rb") as job_in:
                    p = redhawk.loadPBS(job_in)[0]
                try:
                    if p.efile_exists():
                        H[cpu_key], H[wall_key], H[mem_key], H[vmem_key] = p.getResources()
                except Exception:
                    # Missing resource data is not fatal for the collection.
                    pass
                # Write the job object back; the context manager guarantees
                # the file is flushed and closed.
                with open(job_file, "wb") as job_out:
                    redhawk.storePBS([p], job_out)

            if os.path.exists(blast_file):
                # NOTE(review): paths are interpolated unquoted into a shell
                # pipeline; paths containing shell metacharacters would break it.
                cmd = "bzcat {blast_output} | ./pra_analysis2 {output}".format(
                    blast_output=blast_file, output=pra_output)
                subprocess.call(cmd, shell=True)
                query_cover, target_cover, _used = parse_pra_output.parse_pra_output(
                    pra_output, "exclude.txt")
                H["ConCoverage"], H["QuCoverage"] = query_cover, target_cover

            stats_map[(tool, org, None, f)] = H
    return None
def collectRaider(DIR, tool):
    """Collect statistics for every RAIDER run under ``DIR`` into ``stats_map``.

    For each key of ``data_map``, each key of ``seed_map`` and each ``f`` in
    ``f_list`` a record is created with ``create_stats_hash`` and filled with:

    * seed length (``"l"``), number of ``"1"`` characters (``"w"``) and their
      ratio (``"w/l"``) for the seed returned by ``convert_seed``,
    * counts and rates returned by ``perform_stats.perform_stats``,
    * resource usage of the tool job and of the RM job, read from the stored
      PBS job objects in ``DIR/../job_log``,
    * coverage values parsed from an existing ``pra_analysis2`` output.

    The record is stored as ``stats_map[(tool, org, seed_num, f)]``.

    Args:
        DIR: Directory holding the per-run result directories.
        tool: Tool name; indexes ``tool_prefix`` and is part of the
            ``stats_map`` key.

    Returns:
        None. All results are written to the module-level ``stats_map``.
    """
    for org in data_map:
        for seed_num in seed_map:
            for f in f_list:
                print("File: " + org + " " + str(seed_num) + " " + str(f) + "\n")

                RAIDER_job_file = DIR + "/../job_log/{prefix}.{org}.s{seed_num}.f{f}".format(
                    prefix=tool_prefix[tool], org=org, seed_num=seed_num, f=f)
                RM_job_file = DIR + "/../job_log/rm.{prefix}.{org}.s{seed_num}.f{f}".format(
                    prefix=tool_prefix[tool], org=org, seed_num=seed_num, f=f)
                # Result directories are named with the upper-cased run identifier.
                RM_dir = DIR + "/" + "{org}.s{seed}.f{f}".format(org=org, seed=seed_num, f=f).upper()
                RM_file = RM_dir + "/" + "{org}.fa.out".format(org=org)
                blast_file = RM_dir + "/" + "{org}.s{seed}.f{f}.blast.6.txt.bz2".format(
                    org=org, seed=seed_num, f=f)
                pra_output = RM_dir + "/" + "{org}.s{seed}.f{f}.pra.txt".format(
                    org=org, seed=seed_num, f=f)
                real_repeats = data_map[org] + ".out"

                H = create_stats_hash(tool, org, int(seed_num), int(f))

                # Seed descriptors: length, weight (count of "1") and their ratio.
                seed = convert_seed(seed_map[seed_num])
                seed_len = len(seed)
                seed_weight = seed.count("1")
                H["l"] = seed_len
                H["w"] = seed_weight
                H["w/l"] = seed_weight / float(seed_len)

                # Get stats from RM run.
                try:
                    Counts, Stats, _sets = perform_stats.perform_stats(real_repeats, RM_file, None)
                    H["tp"], H["fp"], H["fn"], H["tn"] = Counts
                    H["tpr"], H["tnr"], H["ppv"], H["npv"], H["fpr"], H["fdr"] = Stats
                except Exception:
                    # Deliberately best effort: if the analysis fails, H keeps
                    # whatever values it already holds.
                    pass

                # Get resource usage from the RAIDER run, then from the RM run.
                job_resources = (
                    (RAIDER_job_file, ("ToolCpuTime", "ToolWallTime", "ToolMem", "ToolVMem")),
                    (RM_job_file, ("RMCpuTime", "RMWallTime", "RMMem", "RMVMem")),
                )
                for job_file, (cpu_key, wall_key, mem_key, vmem_key) in job_resources:
                    if not os.path.exists(job_file):
                        continue
                    # Close the handle as soon as the job object has been read.
                    with open(job_file, "rb") as job_in:
                        p = redhawk.loadPBS(job_in)[0]
                    try:
                        if p.efile_exists():
                            H[cpu_key], H[wall_key], H[mem_key], H[vmem_key] = p.getResources()
                    except Exception:
                        # Missing resource data is not fatal for the collection.
                        pass
                    # Write the job object back; the context manager guarantees
                    # the file is flushed and closed.
                    with open(job_file, "wb") as job_out:
                        redhawk.storePBS([p], job_out)

                if os.path.exists(blast_file):
                    if not os.path.exists(pra_output):
                        cmd = "bzcat {blast_output} | ./pra_analysis2 {output}".format(
                            blast_output=blast_file, output=pra_output)
                        # NOTE(review): the command is only printed here; the
                        # subprocess.call(cmd, shell=True) invocation is
                        # disabled, so this function does not create
                        # pra_output itself. Confirm this is intended.
                        print("cmd: " + cmd)
                    query_cover, target_cover, _used = parse_pra_output.parse_pra_output(
                        pra_output, "exclude.txt")
                    H["ConCoverage"], H["QuCoverage"] = query_cover, target_cover

                stats_map[(tool, org, seed_num, f)] = H
    return None
def collectRaider(DIR, tool):
    """Collect statistics for every RAIDER run under ``DIR`` into ``stats_map``.

    For each key of ``data_map``, each key of ``seed_map`` and each ``f`` in
    ``f_list`` a record is created with ``create_stats_hash`` and filled with:

    * seed length (``'l'``), number of ``"1"`` characters (``'w'``) and their
      ratio (``'w/l'``) for the seed returned by ``convert_seed``,
    * counts from ``rm_analysis.collect_stats`` and the rates derived from
      them,
    * resource usage of the tool job and of the RM job, read from the stored
      PBS job objects in ``DIR/../job_log``,
    * coverage values parsed from the ``pra_analysis2`` output, which is
      generated only when it does not exist yet.

    The record is stored as ``stats_map[(tool, org, seed_num, f)]``.

    Args:
        DIR: Directory holding the per-run result directories.
        tool: Tool name; indexes ``tool_prefix`` and is part of the
            ``stats_map`` key.

    Returns:
        None. All results are written to the module-level ``stats_map``.
    """
    for org in data_map:
        for seed_num in seed_map:
            for f in f_list:
                print("File: " + org + " " + str(seed_num) + " " + str(f))

                RAIDER_job_file = DIR + "/../job_log/{prefix}.{org}.s{seed_num}.f{f}".format(
                    prefix=tool_prefix[tool], org=org, seed_num=seed_num, f=f)
                RM_job_file = DIR + "/../job_log/rm.{prefix}.{org}.s{seed_num}.f{f}".format(
                    prefix=tool_prefix[tool], org=org, seed_num=seed_num, f=f)
                # Result directories are named with the upper-cased run identifier.
                RM_dir = DIR + "/" + "{org}.s{seed}.f{f}".format(org=org, seed=seed_num, f=f).upper()
                RM_file = RM_dir + "/" + "{org}.fa.out".format(org=org)
                blast_file = RM_dir + "/" + "{org}.s{seed}.f{f}.blast.6.txt.bz2".format(
                    org=org, seed=seed_num, f=f)
                pra_output = RM_dir + "/" + "{org}.s{seed}.f{f}.pra.txt".format(
                    org=org, seed=seed_num, f=f)
                real_repeats = data_map[org] + ".out"

                H = create_stats_hash(tool, org, int(seed_num), int(f))

                # Seed descriptors: length, weight (count of "1") and their ratio.
                seed = convert_seed(seed_map[seed_num])
                seed_len = len(seed)
                seed_weight = seed.count("1")
                H['l'] = seed_len
                H['w'] = seed_weight
                H['w/l'] = seed_weight / float(seed_len)

                # Get stats from RM run.
                try:
                    negatives, fp, _fp_d, positives, tp, _fam_hash = rm_analysis.collect_stats(
                        real_repeats, RM_file, fp_dist)
                    H['tp'] = tp
                    H['fp'] = fp
                    # DEBUG (original author's note): NEED TO DOUBLE-CHECK THIS!!!
                    # NOTE(review): if the counts are ints and this runs under
                    # Python 2, the `/` divisions below truncate -- confirm.
                    H['tn'] = negatives - H['fp']
                    H['fn'] = positives - H['tp']
                    H['tpr'] = H['tp'] / positives
                    H['tnr'] = H['tn'] / negatives
                    H['ppv'] = H['tp'] / (H['tp'] + H['fp'])
                    H['npv'] = H['tn'] / (H['tn'] + H['fn'])
                    H['fpr'] = H['fp'] / negatives
                    H['fnr'] = 1 - H['tpr']
                    # NOTE(review): 'dfr' looks like a transposition of 'fdr'
                    # (1 - ppv); key kept unchanged until consumers are checked.
                    H['dfr'] = 1 - H['ppv']
                except Exception:
                    # Deliberately best effort: a failing analysis or a zero
                    # denominator leaves the remaining fields as they were.
                    pass

                # Get resource usage from the RAIDER run, then from the RM run.
                job_resources = (
                    (RAIDER_job_file, ('ToolCpuTime', 'ToolWallTime', 'ToolMem', 'ToolVMem')),
                    (RM_job_file, ('RMCpuTime', 'RMWallTime', 'RMMem', 'RMVMem')),
                )
                for job_file, (cpu_key, wall_key, mem_key, vmem_key) in job_resources:
                    if not os.path.exists(job_file):
                        continue
                    # Close the handle as soon as the job object has been read.
                    with open(job_file, "rb") as job_in:
                        p = redhawk.loadPBS(job_in)[0]
                    try:
                        if p.efile_exists():
                            H[cpu_key], H[wall_key], H[mem_key], H[vmem_key] = p.getResources()
                    except Exception:
                        # Missing resource data is not fatal for the collection.
                        pass
                    # Write the job object back; the context manager guarantees
                    # the file is flushed and closed.
                    with open(job_file, "wb") as job_out:
                        redhawk.storePBS([p], job_out)

                if os.path.exists(blast_file):
                    # Reuse an existing analysis file; only run the pipeline
                    # when it is missing.
                    if not os.path.exists(pra_output):
                        # NOTE(review): paths are interpolated unquoted into a
                        # shell pipeline.
                        cmd = "bzcat {blast_output} | ./pra_analysis2 {output}".format(
                            blast_output=blast_file, output=pra_output)
                        subprocess.call(cmd, shell=True)
                    query_cover, target_cover, _used = parse_pra_output.parse_pra_output(
                        pra_output, "exclude.txt")
                    H['ConCoverage'], H['QuCoverage'] = query_cover, target_cover

                stats_map[(tool, org, seed_num, f)] = H
    return None
def collectRptScout(DIR, tool):
    """Collect statistics for every RPT_SCOUT run under ``DIR`` into ``stats_map``.

    For each key of ``data_map`` and each ``f`` in ``f_list`` a record is
    created with ``create_stats_hash`` and filled with whatever is available:

    * counts from ``rm_analysis.collect_stats`` and the rates derived from
      them,
    * resource usage of the tool job and of the RM job, read from the stored
      PBS job objects in ``DIR/../job_log``,
    * coverage values parsed from the ``pra_analysis2`` output.

    The record is stored as ``stats_map[(tool, org, None, f)]``.

    Args:
        DIR: Directory holding the per-run result directories.
        tool: Tool name; indexes ``tool_prefix`` and is part of the
            ``stats_map`` key.

    Returns:
        None. All results are written to the module-level ``stats_map``.
    """
    for org in data_map:
        for f in f_list:
            RS_job_file = DIR + "/../job_log/{prefix}.{org}.s0.f{f}".format(
                prefix=tool_prefix[tool], org=org, f=f)
            RM_job_file = DIR + "/../job_log/rm.{prefix}.{org}.s0.f{f}".format(
                prefix=tool_prefix[tool], org=org, f=f)
            # Result directories are named with the upper-cased run identifier.
            RS_dir = DIR + "/" + "{org}.s0.f{f}".format(org=org, f=f).upper()
            RM_file = RS_dir + "/" + "{org}.fa.out".format(org=org)
            blast_file = RS_dir + "/" + "{org}.s0.f{f}.RS.blast.6.txt.bz2".format(org=org, f=f)
            pra_output = "{DIR}/{org}.s0.f{f}.pra.txt".format(DIR=RS_dir, org=org, f=f)
            real_repeats = data_map[org] + ".out"

            H = create_stats_hash(tool, org, None, int(f))

            # Get stats from RM run.
            try:
                negatives, fp, _fp_d, positives, tp, _fam_hash = rm_analysis.collect_stats(
                    real_repeats, RM_file, fp_dist)
                H['tp'] = tp
                H['fp'] = fp
                # DEBUG (original author's note): NEED TO DOUBLE-CHECK THIS!!!
                # NOTE(review): if the counts are ints and this runs under
                # Python 2, the `/` divisions below truncate -- confirm.
                H['tn'] = negatives - H['fp']
                H['fn'] = positives - H['tp']
                H['tpr'] = H['tp'] / positives
                H['tnr'] = H['tn'] / negatives
                H['ppv'] = H['tp'] / (H['tp'] + H['fp'])
                H['npv'] = H['tn'] / (H['tn'] + H['fn'])
                H['fpr'] = H['fp'] / negatives
                H['fnr'] = 1 - H['tpr']
                # NOTE(review): 'dfr' looks like a transposition of 'fdr'
                # (1 - ppv); key kept unchanged until consumers are checked.
                H['dfr'] = 1 - H['ppv']
            except Exception:
                # Deliberately best effort: a failing analysis or a zero
                # denominator leaves the remaining fields as they were.
                pass

            # Get resource usage from the RPT_SCOUT run, then from the RM run.
            job_resources = (
                (RS_job_file, ('ToolCpuTime', 'ToolWallTime', 'ToolMem', 'ToolVMem')),
                (RM_job_file, ('RMCpuTime', 'RMWallTime', 'RMMem', 'RMVMem')),
            )
            for job_file, (cpu_key, wall_key, mem_key, vmem_key) in job_resources:
                if not os.path.exists(job_file):
                    continue
                # Close the handle as soon as the job object has been read.
                with open(job_file, "rb") as job_in:
                    p = redhawk.loadPBS(job_in)[0]
                try:
                    if p.efile_exists():
                        H[cpu_key], H[wall_key], H[mem_key], H[vmem_key] = p.getResources()
                except Exception:
                    # Missing resource data is not fatal for the collection.
                    pass
                # Write the job object back; the context manager guarantees
                # the file is flushed and closed.
                with open(job_file, "wb") as job_out:
                    redhawk.storePBS([p], job_out)

            if os.path.exists(blast_file):
                # NOTE(review): paths are interpolated unquoted into a shell
                # pipeline; paths containing shell metacharacters would break it.
                cmd = "bzcat {blast_output} | ./pra_analysis2 {output}".format(
                    blast_output=blast_file, output=pra_output)
                subprocess.call(cmd, shell=True)
                query_cover, target_cover, _used = parse_pra_output.parse_pra_output(
                    pra_output, "exclude.txt")
                H['ConCoverage'], H['QuCoverage'] = query_cover, target_cover

            stats_map[(tool, org, None, f)] = H
    return None
def collectRptScout(DIR, tool):
    """Gather RPT_SCOUT run statistics found under ``DIR`` and store them in ``stats_map``.

    Iterates over the keys of ``data_map`` and the values of ``f_list``. For
    each combination it creates a record via ``create_stats_hash`` and then
    tries to add:

    * ``tp``/``fp``/``tn``/``fn`` and the derived rates, based on the values
      returned by ``rm_analysis.collect_stats``,
    * ``Tool*`` and ``RM*`` resource fields taken from the PBS job objects
      stored in ``DIR/../job_log``,
    * ``ConCoverage``/``QuCoverage`` parsed from the ``pra_analysis2`` output.

    Each record ends up in ``stats_map[(tool, org, None, f)]``.

    Args:
        DIR: Directory holding the per-run result directories.
        tool: Tool name; indexes ``tool_prefix`` and is part of the
            ``stats_map`` key.

    Returns:
        None. All results are written to the module-level ``stats_map``.
    """
    for org in data_map:
        for f in f_list:
            RS_job_file = DIR + "/../job_log/{prefix}.{org}.s0.f{f}".format(
                prefix=tool_prefix[tool], org=org, f=f)
            RM_job_file = DIR + "/../job_log/rm.{prefix}.{org}.s0.f{f}".format(
                prefix=tool_prefix[tool], org=org, f=f)
            # Result directories are named with the upper-cased run identifier.
            RS_dir = DIR + "/" + "{org}.s0.f{f}".format(org=org, f=f).upper()
            RM_file = RS_dir + "/" + "{org}.fa.out".format(org=org)
            blast_file = RS_dir + "/" + "{org}.s0.f{f}.RS.blast.6.txt.bz2".format(
                org=org, f=f)
            pra_output = "{DIR}/{org}.s0.f{f}.pra.txt".format(DIR=RS_dir, org=org, f=f)
            real_repeats = data_map[org] + ".out"

            H = create_stats_hash(tool, org, None, int(f))

            # Get stats from RM run.
            try:
                negatives, fp, _fp_d, positives, tp, _fam_hash = rm_analysis.collect_stats(
                    real_repeats, RM_file, fp_dist)
                H['tp'] = tp
                H['fp'] = fp
                # DEBUG (original author's note): NEED TO DOUBLE-CHECK THIS!!!
                # NOTE(review): if the counts are ints and this runs under
                # Python 2, the `/` divisions below truncate -- confirm.
                H['tn'] = negatives - H['fp']
                H['fn'] = positives - H['tp']
                H['tpr'] = H['tp'] / positives
                H['tnr'] = H['tn'] / negatives
                H['ppv'] = H['tp'] / (H['tp'] + H['fp'])
                H['npv'] = H['tn'] / (H['tn'] + H['fn'])
                H['fpr'] = H['fp'] / negatives
                H['fnr'] = 1 - H['tpr']
                # NOTE(review): 'dfr' looks like a transposition of 'fdr'
                # (1 - ppv); key kept unchanged until consumers are checked.
                H['dfr'] = 1 - H['ppv']
            except Exception:
                # Deliberately best effort: a failing analysis or a zero
                # denominator leaves the remaining fields as they were.
                pass

            # Get resource usage from the RPT_SCOUT run, then from the RM run.
            job_resources = (
                (RS_job_file, ('ToolCpuTime', 'ToolWallTime', 'ToolMem', 'ToolVMem')),
                (RM_job_file, ('RMCpuTime', 'RMWallTime', 'RMMem', 'RMVMem')),
            )
            for job_file, (cpu_key, wall_key, mem_key, vmem_key) in job_resources:
                if not os.path.exists(job_file):
                    continue
                # Close the handle as soon as the job object has been read.
                with open(job_file, "rb") as job_in:
                    p = redhawk.loadPBS(job_in)[0]
                try:
                    if p.efile_exists():
                        H[cpu_key], H[wall_key], H[mem_key], H[vmem_key] = p.getResources()
                except Exception:
                    # Missing resource data is not fatal for the collection.
                    pass
                # Write the job object back; the context manager guarantees
                # the file is flushed and closed.
                with open(job_file, "wb") as job_out:
                    redhawk.storePBS([p], job_out)

            if os.path.exists(blast_file):
                # NOTE(review): paths are interpolated unquoted into a shell
                # pipeline; paths containing shell metacharacters would break it.
                cmd = "bzcat {blast_output} | ./pra_analysis2 {output}".format(
                    blast_output=blast_file, output=pra_output)
                subprocess.call(cmd, shell=True)
                query_cover, target_cover, _used = parse_pra_output.parse_pra_output(
                    pra_output, "exclude.txt")
                H['ConCoverage'], H['QuCoverage'] = query_cover, target_cover

            stats_map[(tool, org, None, f)] = H
    return None
def collectNaive(DIR, tool):
    """Collect statistics for every NAIVE run under ``DIR`` into ``stats_map``.

    For each key of ``data_map``, each key of ``seed_map`` and each ``f`` in
    ``f_list`` a record is created with ``create_stats_hash`` and filled with:

    * seed length (``'l'``), number of ``"1"`` characters (``'w'``) and their
      ratio (``'w/l'``) for the seed returned by ``convert_seed``,
    * counts from ``rm_analysis.collect_stats`` and the rates derived from
      them,
    * resource usage of the tool job and of the RM job, read from the stored
      PBS job objects in ``DIR/../job_log``,
    * coverage values parsed from the ``pra_analysis2`` output, which is
      generated only when it does not exist yet.

    The record is stored as ``stats_map[(tool, org, seed_num, f)]``.

    Args:
        DIR: Directory holding the per-run result directories.
        tool: Tool name; must equal ``'naive'``. Indexes ``tool_prefix`` and
            is part of the ``stats_map`` key.

    Returns:
        None. All results are written to the module-level ``stats_map``.

    Raises:
        AssertionError: If ``tool`` is not ``'naive'``.
    """
    # NAIVE
    assert (tool == 'naive')
    for org in data_map:
        for seed_num in seed_map:
            for f in f_list:
                print("File: " + org + " " + str(seed_num) + " " + str(f))

                NAIVE_job_file = DIR + "/../job_log/{prefix}.{org}.s{seed_num}.f{f}".format(
                    prefix=tool_prefix[tool], org=org, seed_num=seed_num, f=f)
                RM_job_file = DIR + "/../job_log/rm.{prefix}.{org}.s{seed_num}.f{f}".format(
                    prefix=tool_prefix[tool], org=org, seed_num=seed_num, f=f)
                # Result directories are named with the upper-cased run identifier.
                RM_dir = DIR + "/" + "{org}.s{seed}.f{f}".format(
                    org=org, seed=seed_num, f=f).upper()
                RM_file = RM_dir + "/" + "{org}.fa.out".format(org=org)
                blast_file = RM_dir + "/" + "{org}.s{seed}.f{f}.blast.6.txt.bz2".format(
                    org=org, seed=seed_num, f=f)
                pra_output = RM_dir + "/" + "{org}.s{seed}.f{f}.pra.txt".format(
                    org=org, seed=seed_num, f=f)
                real_repeats = data_map[org] + ".out"

                H = create_stats_hash(tool, org, int(seed_num), int(f))

                # Seed descriptors: length, weight (count of "1") and their ratio.
                seed = convert_seed(seed_map[seed_num])
                seed_len = len(seed)
                seed_weight = seed.count("1")
                H['l'] = seed_len
                H['w'] = seed_weight
                H['w/l'] = seed_weight / float(seed_len)

                # Get stats from RM run.
                try:
                    negatives, fp, _fp_d, positives, tp, _fam_hash = rm_analysis.collect_stats(
                        real_repeats, RM_file, fp_dist)
                    H['tp'] = tp
                    H['fp'] = fp
                    # DEBUG (original author's note): NEED TO DOUBLE-CHECK THIS!!!
                    # NOTE(review): if the counts are ints and this runs under
                    # Python 2, the `/` divisions below truncate -- confirm.
                    H['tn'] = negatives - H['fp']
                    H['fn'] = positives - H['tp']
                    H['tpr'] = H['tp'] / positives
                    H['tnr'] = H['tn'] / negatives
                    H['ppv'] = H['tp'] / (H['tp'] + H['fp'])
                    H['npv'] = H['tn'] / (H['tn'] + H['fn'])
                    H['fpr'] = H['fp'] / negatives
                    H['fnr'] = 1 - H['tpr']
                    # NOTE(review): 'dfr' looks like a transposition of 'fdr'
                    # (1 - ppv); key kept unchanged until consumers are checked.
                    H['dfr'] = 1 - H['ppv']
                except Exception:
                    # Deliberately best effort: a failing analysis or a zero
                    # denominator leaves the remaining fields as they were.
                    pass

                # Get resource usage from the NAIVE run, then from the RM run.
                job_resources = (
                    (NAIVE_job_file, ('ToolCpuTime', 'ToolWallTime', 'ToolMem', 'ToolVMem')),
                    (RM_job_file, ('RMCpuTime', 'RMWallTime', 'RMMem', 'RMVMem')),
                )
                for job_file, (cpu_key, wall_key, mem_key, vmem_key) in job_resources:
                    if not os.path.exists(job_file):
                        continue
                    # Close the handle as soon as the job object has been read.
                    with open(job_file, "rb") as job_in:
                        p = redhawk.loadPBS(job_in)[0]
                    try:
                        if p.efile_exists():
                            H[cpu_key], H[wall_key], H[mem_key], H[vmem_key] = p.getResources()
                    except Exception:
                        # Missing resource data is not fatal for the collection.
                        pass
                    # Write the job object back; the context manager guarantees
                    # the file is flushed and closed.
                    with open(job_file, "wb") as job_out:
                        redhawk.storePBS([p], job_out)

                if os.path.exists(blast_file):
                    # Reuse an existing analysis file; only run the pipeline
                    # when it is missing.
                    if not os.path.exists(pra_output):
                        # NOTE(review): paths are interpolated unquoted into a
                        # shell pipeline.
                        cmd = "bzcat {blast_output} | ./pra_analysis2 {output}".format(
                            blast_output=blast_file, output=pra_output)
                        subprocess.call(cmd, shell=True)
                    query_cover, target_cover, _used = parse_pra_output.parse_pra_output(
                        pra_output, "exclude.txt")
                    H['ConCoverage'], H['QuCoverage'] = query_cover, target_cover

                stats_map[(tool, org, seed_num, f)] = H
    return None