def create_file(self):
    """Label every MPBS by overlap with an extended summit window and write a BED file.

    Summit regions are read from ``self.tfbs_summit_fname``. For each one, the
    last whitespace-separated field of ``region.data`` is added to
    ``region.initial`` to obtain the summit position, and the region is
    replaced by a window of ``self.peak_ext // 2`` bases on either side of
    that position (the left edge is clamped at 0).

    MPBSs are read from ``self.mpbs_fname``. The part of each MPBS name before
    the first ``":"`` is kept and suffixed with ``":Y"`` when the MPBS is
    returned by the intersection with the summit windows, or with ``":N"``
    when it is returned by the whole-region subtraction. The sorted result is
    written to ``<self.output_location>/<self.mpbs_name>.bed``.
    """
    # Integer half-width. True division ("/") yields a float on Python 3,
    # which would turn the region coordinates into floats.
    half_ext = int(self.peak_ext // 2)

    # Expanding summits
    tfbs_summit_regions = GenomicRegionSet("TFBS Summit Regions")
    tfbs_summit_regions.read_bed(self.tfbs_summit_fname)
    for region in iter(tfbs_summit_regions):
        summit = int(region.data.split()[-1]) + region.initial
        region.initial = max(summit - half_ext, 0)
        region.final = summit + half_ext

    # Calculating intersections
    mpbs_regions = GenomicRegionSet("MPBS Regions")
    mpbs_regions.read_bed(self.mpbs_fname)

    tfbs_summit_regions.sort()
    mpbs_regions.sort()

    # Both partitions are computed before any name is rewritten.
    with_overlap_regions = mpbs_regions.intersect(tfbs_summit_regions, mode=OverlapType.ORIGINAL)
    without_overlap_regions = mpbs_regions.subtract(tfbs_summit_regions, whole_region=True)
    tfbs_regions = GenomicRegionSet("TFBS Regions")

    for labelled_set, suffix in ((with_overlap_regions, ":Y"), (without_overlap_regions, ":N")):
        for region in iter(labelled_set):
            # Drop any previous label after the first ":" and attach the new one.
            region.name = region.name.split(":")[0] + suffix
            tfbs_regions.add(region)

    tfbs_regions.sort()

    tfbs_fname = os.path.join(self.output_location, "{}.bed".format(self.mpbs_name))
    tfbs_regions.write_bed(tfbs_fname)
def fisher_table(motif_name, regions, mpbs, gene_set=False, mpbs_set=False):
    """
    Count how many of ``regions`` do / do not overlap the binding sites of one motif.

    Keyword arguments:
    motif_name -- string looked up (as a substring) in the name of every entry of mpbs.sequences
    regions -- region set that is intersected with / subtracted by the motif's binding sites
    mpbs -- region set whose ``sequences`` hold the binding sites of all motifs
    gene_set -- when true, also collect the gene names attached to the overlapping regions
    mpbs_set -- when true, also return the motif's binding sites that overlap ``regions``

    Return:
    a -- number of entries returned by intersecting ``regions`` with the motif's binding sites
         (0 when the motif has no binding site)
    b -- number of entries returned by the whole-region subtraction
         (len(regions) when the motif has no binding site)
    gene_set -- GeneSet named after the motif, or None when ``gene_set`` is false
    mpbs_set -- GenomicRegionSet of overlapping binding sites, or None when ``mpbs_set`` is false
    """

    # Fetching motif: keep only the binding sites whose name mentions the motif
    mpbs_motif = GenomicRegionSet(name="mpbs_motif")
    for site in mpbs.sequences:
        if motif_name in site.name:
            mpbs_motif.add(site)

    # No binding site for this motif: nothing overlaps, results are empty
    if len(mpbs_motif) == 0:
        gene_set_res = GeneSet(motif_name) if gene_set else None
        mpbs_set_res = GenomicRegionSet(mpbs_motif.name) if mpbs_set else None
        return 0, len(regions), gene_set_res, mpbs_set_res

    # regions which are overlapping with mpbs_motif
    intersect_original = regions.intersect(mpbs_motif, mode=OverlapType.ORIGINAL, rm_duplicates=True)
    # regions which are not overlapping with regions from mpbs_motif
    subtract_overlap = regions.subtract(mpbs_motif, whole_region=True)

    # Fetching genes
    gene_set_res = None
    if gene_set:
        gene_set_res = GeneSet(motif_name)
        for hit in intersect_original.sequences:
            if not hit.name:
                continue
            # Names are ":"-separated; a leading "." on a token is dropped
            gene_set_res.genes.extend(
                [token if token[0] != "." else token[1:] for token in hit.name.split(":")]
            )
        # Keep only unique genes
        gene_set_res.genes = list(set(gene_set_res.genes))

    # Fetching mpbs
    mpbs_set_res = None
    if mpbs_set:
        mpbs_set_res = mpbs_motif.intersect(regions, mode=OverlapType.ORIGINAL, rm_duplicates=True)

    return len(intersect_original), len(subtract_overlap), gene_set_res, mpbs_set_res
def create_file(self):
    """Label every MPBS by overlap with an extended summit window and write a BED file.

    Summit regions are read from ``self.tfbs_summit_fname``. For each one, the
    last whitespace-separated field of ``region.data`` is added to
    ``region.initial`` to obtain the summit position, and the region is
    replaced by a window of ``self.peak_ext // 2`` bases on either side of
    that position (the left edge is clamped at 0).

    MPBSs are read from ``self.mpbs_fname``. The part of each MPBS name before
    the first ``":"`` is kept and suffixed with ``":Y"`` when the MPBS is
    returned by the intersection with the summit windows, or with ``":N"``
    when it is returned by the whole-region subtraction. The sorted result is
    written to ``<self.output_location>/<self.mpbs_name>.bed``.
    """
    # Integer half-width. True division ("/") yields a float on Python 3,
    # which would turn the region coordinates into floats.
    half_ext = int(self.peak_ext // 2)

    # Expanding summits
    tfbs_summit_regions = GenomicRegionSet("TFBS Summit Regions")
    tfbs_summit_regions.read_bed(self.tfbs_summit_fname)
    for region in iter(tfbs_summit_regions):
        summit = int(region.data.split()[-1]) + region.initial
        region.initial = max(summit - half_ext, 0)
        region.final = summit + half_ext

    # Calculating intersections
    mpbs_regions = GenomicRegionSet("MPBS Regions")
    mpbs_regions.read_bed(self.mpbs_fname)

    tfbs_summit_regions.sort()
    mpbs_regions.sort()

    # Both partitions are computed before any name is rewritten.
    with_overlap_regions = mpbs_regions.intersect(tfbs_summit_regions, mode=OverlapType.ORIGINAL)
    without_overlap_regions = mpbs_regions.subtract(tfbs_summit_regions, whole_region=True)
    tfbs_regions = GenomicRegionSet("TFBS Regions")

    for labelled_set, suffix in ((with_overlap_regions, ":Y"), (without_overlap_regions, ":N")):
        for region in iter(labelled_set):
            # Drop any previous label after the first ":" and attach the new one.
            region.name = region.name.split(":")[0] + suffix
            tfbs_regions.add(region)

    tfbs_regions.sort()

    tfbs_fname = os.path.join(self.output_location, "{}.bed".format(self.mpbs_name))
    tfbs_regions.write_bed(tfbs_fname)
def chip_evaluate(args):
    """Score footprint predictions with ROC / precision-recall statistics.

    ``args.footprint_file``, ``args.footprint_name`` and ``args.footprint_type``
    are comma-separated strings; entries at the same position describe one
    prediction method. Only the types ``"SEG"`` and ``"SC"`` are handled:

    * ``"SEG"`` -- the MPBSs read from ``args.tfbs_file`` that are returned by
      the intersection with the footprints get their score (first
      tab-separated field of ``region.data``) raised by ``max_score``; the
      MPBSs returned by the whole-region subtraction are added unchanged.
      The combined set is score-sorted and evaluated.
    * ``"SC"`` -- the footprint regions themselves are score-sorted and
      evaluated.

    A tab-separated ``<output_prefix>_stats.txt`` is written to
    ``args.output_location``. ROC and PR curves are plotted when
    ``args.print_roc_curve`` / ``args.print_pr_curve`` are set, and the curve
    points are passed to ``output_points``.
    """
    # Per-method results, keyed by the method's position in the input lists.
    fpr, tpr = {}, {}
    roc_auc_1, roc_auc_10, roc_auc_50, roc_auc_100 = {}, {}, {}, {}
    recall, precision = {}, {}
    prc_auc_1, prc_auc_10, prc_auc_50, prc_auc_100 = {}, {}, {}, {}

    def _score_curves(idx, scored_regions):
        # Sort by score, then store both curves for method number `idx`.
        scored_regions.sort_score()
        (fpr[idx], tpr[idx], roc_auc_1[idx], roc_auc_10[idx],
         roc_auc_50[idx], roc_auc_100[idx]) = roc_curve(scored_regions)
        (recall[idx], precision[idx], prc_auc_1[idx], prc_auc_10[idx],
         prc_auc_50[idx], prc_auc_100[idx]) = precision_recall_curve(scored_regions)

    footprint_file = args.footprint_file.split(",")
    footprint_name = args.footprint_name.split(",")
    footprint_type = args.footprint_type.split(",")

    max_score = 0
    if "SEG" in footprint_type:
        mpbs_regions = GenomicRegionSet("TFBS")
        mpbs_regions.read(args.tfbs_file)

        # One more than the largest MPBS score (never below 1)
        for site in iter(mpbs_regions):
            max_score = max(max_score, int(site.data.split("\t")[0]))
        max_score += 1

    for idx, fname in enumerate(footprint_file):
        footprints_regions = GenomicRegionSet("Footprints Prediction")
        footprints_regions.read(fname)
        footprints_regions.sort()

        kind = footprint_type[idx]
        if kind == "SEG":
            # Increasing the score of MPBS entry once if any overlaps found in the predicted footprints.
            boosted_regions = GenomicRegionSet("Increased Regions")
            overlapping = mpbs_regions.intersect(footprints_regions, mode=OverlapType.ORIGINAL)
            for site in iter(overlapping):
                site.data = str(int(site.data.split("\t")[0]) + max_score)
                boosted_regions.add(site)

            # Keep the score of remained MPBS entry unchanged
            untouched = mpbs_regions.subtract(footprints_regions, whole_region=True)
            for site in iter(untouched):
                boosted_regions.add(site)

            _score_curves(idx, boosted_regions)
        elif kind == "SC":
            _score_curves(idx, footprints_regions)

    # Output the statistics results into text
    stats_fname = os.path.join(args.output_location, "{}_stats.txt".format(args.output_prefix))
    stats_header = ["METHOD", "AUC_100", "AUC_50", "AUC_10", "AUC_1",
                    "AUPR_100", "AUPR_50", "AUPR_10", "AUPR_1"]
    with open(stats_fname, "w") as stats_file:
        stats_file.write("\t".join(stats_header) + "\n")
        for idx, method in enumerate(footprint_name):
            row = [
                method,
                str(roc_auc_100[idx]), str(roc_auc_50[idx]),
                str(roc_auc_10[idx]), str(roc_auc_1[idx]),
                str(prc_auc_100[idx]), str(prc_auc_50[idx]),
                str(prc_auc_10[idx]), str(prc_auc_1[idx]),
            ]
            stats_file.write("\t".join(row) + "\n")

    # Output the curves
    if args.print_roc_curve:
        plot_curve(footprint_name, args.output_location, fpr, tpr, roc_auc_100,
                   "False Positive Rate", "True Positive Rate", args.output_prefix, "ROC")
    if args.print_pr_curve:
        plot_curve(footprint_name, args.output_location, recall, precision, prc_auc_100,
                   "Recall", "Precision", args.output_prefix, "PRC")

    output_points(footprint_name, args.output_location, args.output_prefix, fpr, tpr, recall, precision)
def chip_evaluate(self):
    """
    This evaluation methodology uses motif-predicted binding sites (MPBSs) together with TF ChIP-seq data
    to evaluate the footprint predictions.

    Entries at the same position of self.footprint_file, self.footprint_type and self.footprint_name
    describe one prediction method. Only the types "SEG" and "SC" are handled:

    - "SEG": MPBSs from self.tfbs_file returned by the intersection with the footprints get their score
      (region.data) raised by max_score; MPBSs returned by the whole-region subtraction are added
      unchanged. The combined set is score-sorted and evaluated.
    - "SC": the footprint regions themselves are score-sorted and evaluated.

    A tab-separated statistics file is written to self.output_location + self.tf_name + "_stats.txt";
    ROC / PR curves are plotted when self.print_roc_curve / self.print_pr_curve are set, and the curve
    points are handed to self.output_points.

    return:
    """

    # Per-method results, keyed by the method's position in the input lists.
    fpr, tpr = {}, {}
    roc_auc, roc_auc_1, roc_auc_2 = {}, {}, {}
    recall, precision, prc_auc = {}, {}, {}

    def _score_curves(idx, scored_regions):
        # Sort by score, then store both curves for method number `idx`.
        scored_regions.sort_score()
        (fpr[idx], tpr[idx], roc_auc[idx],
         roc_auc_1[idx], roc_auc_2[idx]) = self.roc_curve(scored_regions)
        recall[idx], precision[idx], prc_auc[idx] = self.precision_recall_curve(scored_regions)

    if "SEG" in self.footprint_type:
        mpbs_regions = GenomicRegionSet("TFBS")
        mpbs_regions.read_bed(self.tfbs_file)
        mpbs_regions.sort()

        # One more than the largest MPBS score
        max_score = -99999999
        for site in iter(mpbs_regions):
            max_score = max(max_score, int(site.data))
        max_score += 1

    for idx, fname in enumerate(self.footprint_file):
        footprints_regions = GenomicRegionSet("Footprints Prediction")
        footprints_regions.read_bed(fname)

        # Sort footprint prediction bed files
        footprints_regions.sort()

        kind = self.footprint_type[idx]
        if kind == "SEG":
            # Increasing the score of MPBS entry once if any overlaps found in the predicted footprints.
            boosted_regions = GenomicRegionSet("Increased Regions")
            overlapping = mpbs_regions.intersect(footprints_regions, mode=OverlapType.ORIGINAL)
            for site in iter(overlapping):
                site.data = str(int(site.data) + max_score)
                boosted_regions.add(site)

            # Keep the score of remained MPBS entry unchanged
            untouched = mpbs_regions.subtract(footprints_regions, whole_region=True)
            for site in iter(untouched):
                boosted_regions.add(site)

            _score_curves(idx, boosted_regions)
        elif kind == "SC":
            _score_curves(idx, footprints_regions)

    # Output the statistics results into text
    stats_fname = self.output_location + self.tf_name + "_stats.txt"
    stats_header = ["METHOD", "AUC_100", "AUC_10", "AUC_1", "AUPR"]
    with open(stats_fname, "w") as stats_file:
        stats_file.write("\t".join(stats_header) + "\n")
        for idx, method in enumerate(self.footprint_name):
            row = [method, str(roc_auc[idx]), str(roc_auc_1[idx]),
                   str(roc_auc_2[idx]), str(prc_auc[idx])]
            stats_file.write("\t".join(row) + "\n")

    # Output the curves
    if self.print_roc_curve:
        self.plot_curve(fpr, tpr, roc_auc, "False Positive Rate", "True Positive Rate",
                        self.tf_name, "ROC")
    if self.print_pr_curve:
        self.plot_curve(recall, precision, prc_auc, "Recall", "Precision",
                        self.tf_name, "PRC")

    self.output_points(self.tf_name, fpr, tpr, recall, precision)
def chip_evaluate(args):
    """Score footprint predictions with ROC / precision-recall statistics.

    ``args.footprint_file``, ``args.footprint_name`` and ``args.footprint_type``
    are comma-separated strings; entries at the same position describe one
    prediction method. Only the types ``"SEG"`` and ``"SC"`` are handled:

    * ``"SEG"`` -- the MPBSs read from ``args.tfbs_file`` that are returned by
      the intersection with the footprints get their score (first
      tab-separated field of ``region.data``) raised by ``max_score``; the
      MPBSs returned by the whole-region subtraction are added unchanged.
      The combined set is score-sorted and evaluated. The number of
      intersecting MPBSs is recorded in ``max_points``.
    * ``"SC"`` -- the footprint regions themselves are score-sorted and
      evaluated; their count is recorded in ``max_points``.

    A tab-separated ``<output_prefix>_stats.txt`` is written to
    ``args.output_location``. ROC and PR curves are plotted (receiving
    ``max_points``) when ``args.print_roc_curve`` / ``args.print_pr_curve``
    are set, and the curve points are passed to ``output_points``.
    """
    # Per-method results, keyed by the method's position in the input lists.
    fpr, tpr = {}, {}
    roc_auc_1, roc_auc_10, roc_auc_50, roc_auc_100 = {}, {}, {}, {}
    recall, precision = {}, {}
    prc_auc_1, prc_auc_10, prc_auc_50, prc_auc_100 = {}, {}, {}, {}

    def _score_curves(idx, scored_regions):
        # Sort by score, then store both curves for method number `idx`.
        scored_regions.sort_score()
        (fpr[idx], tpr[idx], roc_auc_1[idx], roc_auc_10[idx],
         roc_auc_50[idx], roc_auc_100[idx]) = roc_curve(scored_regions)
        (recall[idx], precision[idx], prc_auc_1[idx], prc_auc_10[idx],
         prc_auc_50[idx], prc_auc_100[idx]) = precision_recall_curve(scored_regions)

    footprint_file = args.footprint_file.split(",")
    footprint_name = args.footprint_name.split(",")
    footprint_type = args.footprint_type.split(",")

    max_score = 0
    if "SEG" in footprint_type:
        mpbs_regions = GenomicRegionSet("TFBS")
        mpbs_regions.read(args.tfbs_file)

        # One more than the largest MPBS score (never below 1)
        for site in iter(mpbs_regions):
            max_score = max(max_score, int(site.data.split("\t")[0]))
        max_score += 1

    # One entry per evaluated method, forwarded to plot_curve
    max_points = []
    for idx, fname in enumerate(footprint_file):
        footprints_regions = GenomicRegionSet("Footprints Prediction")
        footprints_regions.read(fname)
        footprints_regions.sort()

        kind = footprint_type[idx]
        if kind == "SEG":
            # Increasing the score of MPBS entry once if any overlaps found in the predicted footprints.
            boosted_regions = GenomicRegionSet("Increased Regions")
            overlapping = mpbs_regions.intersect(footprints_regions, mode=OverlapType.ORIGINAL)
            for site in iter(overlapping):
                site.data = str(int(site.data.split("\t")[0]) + max_score)
                boosted_regions.add(site)

            # Keep the score of remained MPBS entry unchanged
            untouched = mpbs_regions.subtract(footprints_regions, whole_region=True)
            for site in iter(untouched):
                boosted_regions.add(site)

            _score_curves(idx, boosted_regions)
            max_points.append(len(overlapping))
        elif kind == "SC":
            _score_curves(idx, footprints_regions)
            max_points.append(len(footprints_regions))

    # Output the statistics results into text
    stats_fname = os.path.join(args.output_location, "{}_stats.txt".format(args.output_prefix))
    stats_header = ["METHOD", "AUC_100", "AUC_50", "AUC_10", "AUC_1",
                    "AUPR_100", "AUPR_50", "AUPR_10", "AUPR_1"]
    with open(stats_fname, "w") as stats_file:
        stats_file.write("\t".join(stats_header) + "\n")
        for idx, method in enumerate(footprint_name):
            row = [
                method,
                str(roc_auc_100[idx]), str(roc_auc_50[idx]),
                str(roc_auc_10[idx]), str(roc_auc_1[idx]),
                str(prc_auc_100[idx]), str(prc_auc_50[idx]),
                str(prc_auc_10[idx]), str(prc_auc_1[idx]),
            ]
            stats_file.write("\t".join(row) + "\n")

    # Output the curves
    if args.print_roc_curve:
        plot_curve(footprint_name, args.output_location, fpr, tpr, roc_auc_100,
                   "False Positive Rate", "True Positive Rate", args.output_prefix, "ROC",
                   max_points=max_points)
    if args.print_pr_curve:
        plot_curve(footprint_name, args.output_location, recall, precision, prc_auc_100,
                   "Recall", "Precision", args.output_prefix, "PRC",
                   max_points=max_points)

    output_points(footprint_name, args.output_location, args.output_prefix, fpr, tpr, recall, precision)
def chip_evaluate(self):
    """
    This evaluation methodology uses motif-predicted binding sites (MPBSs) together with TF ChIP-seq data
    to evaluate the footprint predictions.

    Entries at the same position of self.footprint_file, self.footprint_type and self.footprint_name
    describe one prediction method. Only the types "SEG" and "SC" are handled:

    - "SEG": MPBSs from self.tfbs_file returned by the intersection with the footprints get their score
      (region.data) raised by max_score; MPBSs returned by the whole-region subtraction are added
      unchanged. The combined set is score-sorted and evaluated.
    - "SC": the footprint regions themselves are score-sorted and evaluated.

    A tab-separated statistics file is written to self.output_location + self.tf_name + "_stats.txt";
    ROC / PR curves are plotted when self.print_roc_curve / self.print_pr_curve are set, and the curve
    points are handed to self.output_points.

    return:
    """

    # Per-method results, keyed by the method's position in the input lists.
    fpr, tpr = {}, {}
    roc_auc, roc_auc_1, roc_auc_2 = {}, {}, {}
    recall, precision, prc_auc = {}, {}, {}

    def _score_curves(idx, scored_regions):
        # Sort by score, then store both curves for method number `idx`.
        scored_regions.sort_score()
        (fpr[idx], tpr[idx], roc_auc[idx],
         roc_auc_1[idx], roc_auc_2[idx]) = self.roc_curve(scored_regions)
        recall[idx], precision[idx], prc_auc[idx] = self.precision_recall_curve(scored_regions)

    if "SEG" in self.footprint_type:
        mpbs_regions = GenomicRegionSet("TFBS")
        mpbs_regions.read_bed(self.tfbs_file)
        mpbs_regions.sort()

        # One more than the largest MPBS score
        max_score = -99999999
        for site in iter(mpbs_regions):
            max_score = max(max_score, int(site.data))
        max_score += 1

    for idx, fname in enumerate(self.footprint_file):
        footprints_regions = GenomicRegionSet("Footprints Prediction")
        footprints_regions.read_bed(fname)

        # Sort footprint prediction bed files
        footprints_regions.sort()

        kind = self.footprint_type[idx]
        if kind == "SEG":
            # Increasing the score of MPBS entry once if any overlaps found in the predicted footprints.
            boosted_regions = GenomicRegionSet("Increased Regions")
            overlapping = mpbs_regions.intersect(footprints_regions, mode=OverlapType.ORIGINAL)
            for site in iter(overlapping):
                site.data = str(int(site.data) + max_score)
                boosted_regions.add(site)

            # Keep the score of remained MPBS entry unchanged
            untouched = mpbs_regions.subtract(footprints_regions, whole_region=True)
            for site in iter(untouched):
                boosted_regions.add(site)

            _score_curves(idx, boosted_regions)
        elif kind == "SC":
            _score_curves(idx, footprints_regions)

    # Output the statistics results into text
    stats_fname = self.output_location + self.tf_name + "_stats.txt"
    stats_header = ["METHOD", "AUC_100", "AUC_10", "AUC_1", "AUPR"]
    with open(stats_fname, "w") as stats_file:
        stats_file.write("\t".join(stats_header) + "\n")
        for idx, method in enumerate(self.footprint_name):
            row = [method, str(roc_auc[idx]), str(roc_auc_1[idx]),
                   str(roc_auc_2[idx]), str(prc_auc[idx])]
            stats_file.write("\t".join(row) + "\n")

    # Output the curves
    if self.print_roc_curve:
        self.plot_curve(fpr, tpr, roc_auc, "False Positive Rate", "True Positive Rate",
                        self.tf_name, "ROC")
    if self.print_pr_curve:
        self.plot_curve(recall, precision, prc_auc, "Recall", "Precision",
                        self.tf_name, "PRC")

    self.output_points(self.tf_name, fpr, tpr, recall, precision)
line[6], line[2], str(min(donor)), str(max(acceptor)), "255,0,0", "2", ",".join([ str(abs(donor[1] - donor[0])), str(abs(acceptor[1] - acceptor[0])) ]), "0,"+str(abs(min(donor)-min(acceptor))) ]), file=g) else: pass #print(line) #sys.exit() print("tcons:\t" + args.t) tcons = GenomicRegionSet("tcons") tcons.read_bed(args.t) circrna = GenomicRegionSet("circRNA") circrna.read_bed(args.o) circ_inTCON = circrna.intersect(y=tcons, mode = OverlapType.COMP_INCL) circ_TCONs = tcons.intersect(y=circ_inTCON, mode = OverlapType.ORIGINAL) #print(len(circ_TCONs)) circ_TCONs.write_bed(args.c) # 0 1 2 3 4 5 6 7 8 # chr1 39449029 + chr1 39448068 + 0 0 0 # 9 10 11 12 13 # 97ZZTR1:411:C4VC3ACXX:5:1102:16097:34171 39448994 35M15S 39448069 35S15M868p50M ############### FASTA slicing ####################################### elif args.mode == "sliceFASTA": print(os.path.basename(args.i) + " -start "+str(args.p)+" -end "+str(args.p+args.l)) from rgt.SequenceSet import SequenceSet seq = SequenceSet(name=args.i, seq_type="RNA")
def fisher_table(motif_name, regions, mpbs, gene_set=False, mpbs_set=False):
    """
    Count how many of ``regions`` do / do not overlap the binding sites of one motif.

    Keyword arguments:
    motif_name -- string looked up (as a substring) in the name of every entry of mpbs.sequences
    regions -- region set that is intersected with / subtracted by the motif's binding sites
    mpbs -- region set whose ``sequences`` hold the binding sites of all motifs
    gene_set -- when true, also collect the gene names attached to the overlapping regions
    mpbs_set -- when true, also return the motif's binding sites that overlap ``regions``

    Return:
    a -- number of entries returned by intersecting ``regions`` with the motif's binding sites
         (0 when the motif has no binding site)
    b -- number of entries returned by the whole-region subtraction
         (len(regions) when the motif has no binding site)
    gene_set -- GeneSet named after the motif, or None when ``gene_set`` is false
    mpbs_set -- GenomicRegionSet of overlapping binding sites, or None when ``mpbs_set`` is false
    """

    # Fetching motif: keep only the binding sites whose name mentions the motif
    mpbs_motif = GenomicRegionSet(name="mpbs_motif")
    for site in mpbs.sequences:
        if motif_name in site.name:
            mpbs_motif.add(site)

    # No binding site for this motif: nothing overlaps, results are empty
    if len(mpbs_motif) == 0:
        gene_set_res = GeneSet(motif_name) if gene_set else None
        mpbs_set_res = GenomicRegionSet(mpbs_motif.name) if mpbs_set else None
        return 0, len(regions), gene_set_res, mpbs_set_res

    # regions which are overlapping with mpbs_motif
    intersect_original = regions.intersect(mpbs_motif, mode=OverlapType.ORIGINAL, rm_duplicates=True)
    # regions which are not overlapping with regions from mpbs_motif
    subtract_overlap = regions.subtract(mpbs_motif, whole_region=True)

    # Fetching genes
    gene_set_res = None
    if gene_set:
        gene_set_res = GeneSet(motif_name)
        for hit in intersect_original.sequences:
            if not hit.name:
                continue
            # Names are ":"-separated; a leading "." on a token is dropped
            gene_set_res.genes.extend(
                [token if token[0] != "." else token[1:] for token in hit.name.split(":")]
            )
        # Keep only unique genes
        gene_set_res.genes = list(set(gene_set_res.genes))

    # Fetching mpbs
    mpbs_set_res = None
    if mpbs_set:
        mpbs_set_res = mpbs_motif.intersect(regions, mode=OverlapType.ORIGINAL, rm_duplicates=True)

    return len(intersect_original), len(subtract_overlap), gene_set_res, mpbs_set_res