def create_file(self):
    """
    Label each MPBS as overlapping (":Y") or not overlapping (":N") an extended TFBS summit
    and write the labelled set to "<self.output_location>/<self.mpbs_name>.bed".

    Reads self.tfbs_summit_fname, self.mpbs_fname, self.peak_ext, self.output_location and
    self.mpbs_name. Returns nothing; the only result is the BED file.
    """
    # Expanding summits
    tfbs_summit_regions = GenomicRegionSet("TFBS Summit Regions")
    tfbs_summit_regions.read_bed(self.tfbs_summit_fname)

    # Floor division: "/" yields a float on Python 3, which would turn the window
    # boundaries below into non-integral coordinates.
    half_ext = self.peak_ext // 2
    for region in tfbs_summit_regions:
        # The last whitespace-separated field of the data column is used as the
        # summit offset relative to the region start.
        summit = int(region.data.split()[-1]) + region.initial
        region.initial = max(summit - half_ext, 0)  # never start before position 0
        region.final = summit + half_ext

    # Calculating intersections
    mpbs_regions = GenomicRegionSet("MPBS Regions")
    mpbs_regions.read_bed(self.mpbs_fname)

    tfbs_summit_regions.sort()
    mpbs_regions.sort()

    with_overlap_regions = mpbs_regions.intersect(tfbs_summit_regions, mode=OverlapType.ORIGINAL)
    without_overlap_regions = mpbs_regions.subtract(tfbs_summit_regions, whole_region=True)

    tfbs_regions = GenomicRegionSet("TFBS Regions")

    # Anything after the first ":" in the name is replaced by the overlap label.
    for region in with_overlap_regions:
        region.name = region.name.split(":")[0] + ":Y"
        tfbs_regions.add(region)

    for region in without_overlap_regions:
        region.name = region.name.split(":")[0] + ":N"
        tfbs_regions.add(region)

    tfbs_regions.sort()

    tfbs_fname = os.path.join(self.output_location, "{}.bed".format(self.mpbs_name))
    tfbs_regions.write_bed(tfbs_fname)
def subtract(self, x):
    """
    Remove the regions of another GenomicVariantSet from this one, in place.

    The subtraction itself is delegated to GenomicRegionSet.subtract with
    whole_region=False; its result is passed through self._reconstruct_info and
    stored in self.sequences. Nothing is returned.

    *Keyword arguments:*

        - x -- instance of GenomicVariantSet which is subtracted
    """
    remaining = GenomicRegionSet.subtract(self, x, whole_region=False)
    self.sequences = self._reconstruct_info(remaining)
def test_subtract_exact(self):
    """Subtracting target from background with exact=True must match the stored reference set."""
    data_dir = os.path.dirname(__file__)

    def load(label, bed_name):
        # Build a named region set from a fixture file that sits next to this test module.
        region_set = GenomicRegionSet(label)
        region_set.read(os.path.join(data_dir, bed_name))
        return region_set

    reference = load("reference", "test_result.bed")
    background = load("background", "test_background.bed")
    target = load("target", "test_target.bed")

    background_tmp = background.subtract(target, exact=True)
    reference.sort()

    # Same number of regions, and each pair compares as equal through __cmp__.
    self.assertEqual(len(background_tmp.sequences), len(reference.sequences))
    for got, expected in zip(background_tmp.sequences, reference.sequences):
        self.assertEqual(got.__cmp__(expected), 0)
def test_subtract_exact(self):
    """Exact subtraction of target from background must equal the reference regions."""
    here = os.path.dirname(__file__)
    fixtures = (
        ("reference", "test_result.bed"),
        ("background", "test_background.bed"),
        ("target", "test_target.bed"),
    )

    # Create and read the three sets in the listed order.
    loaded = []
    for label, bed_name in fixtures:
        region_set = GenomicRegionSet(label)
        region_set.read(os.path.join(here, bed_name))
        loaded.append(region_set)
    reference, background, target = loaded

    background_tmp = background.subtract(target, exact=True)
    reference.sort()

    result_regions = background_tmp.sequences
    expected_regions = reference.sequences
    self.assertEqual(len(result_regions), len(expected_regions))
    for region, region_ref in zip(result_regions, expected_regions):
        self.assertEqual(region, region_ref)
def create_file(self):
    """
    Write a BED file in which every MPBS name carries an overlap label.

    Summit regions from self.tfbs_summit_fname are widened to self.peak_ext // 2
    positions on each side of the summit (left boundary clamped at 0). MPBSs from
    self.mpbs_fname that intersect a widened summit are labelled ":Y"; those left by
    the whole-region subtraction are labelled ":N". The sorted result is written to
    "<self.output_location>/<self.mpbs_name>.bed". Nothing is returned.
    """
    # Expanding summits
    tfbs_summit_regions = GenomicRegionSet("TFBS Summit Regions")
    tfbs_summit_regions.read_bed(self.tfbs_summit_fname)

    # "//" instead of "/": true division on Python 3 would produce float (and, for an
    # odd extension, non-integral) coordinates.
    half_ext = self.peak_ext // 2
    for region in tfbs_summit_regions:
        # Summit offset = last whitespace-separated field of the data column.
        summit = int(region.data.split()[-1]) + region.initial
        region.initial = max(summit - half_ext, 0)
        region.final = summit + half_ext

    # Calculating intersections
    mpbs_regions = GenomicRegionSet("MPBS Regions")
    mpbs_regions.read_bed(self.mpbs_fname)

    tfbs_summit_regions.sort()
    mpbs_regions.sort()

    with_overlap_regions = mpbs_regions.intersect(
        tfbs_summit_regions, mode=OverlapType.ORIGINAL)
    without_overlap_regions = mpbs_regions.subtract(
        tfbs_summit_regions, whole_region=True)

    # Overlapping sites are added first, then the remaining ones; any previous
    # ":suffix" on the name is dropped before the new label is attached.
    tfbs_regions = GenomicRegionSet("TFBS Regions")
    labelled_subsets = (
        (with_overlap_regions, ":Y"),
        (without_overlap_regions, ":N"),
    )
    for subset, label in labelled_subsets:
        for region in subset:
            region.name = region.name.split(":")[0] + label
            tfbs_regions.add(region)

    tfbs_regions.sort()

    tfbs_fname = os.path.join(self.output_location, "{}.bed".format(self.mpbs_name))
    tfbs_regions.write_bed(tfbs_fname)
def chip_evaluate(args):
    """
    Score footprint predictions and write ROC / precision-recall statistics.

    *Keyword arguments:*

        - args -- object exposing footprint_file, footprint_name and footprint_type
          (comma-separated strings), tfbs_file, output_location, output_prefix,
          print_roc_curve and print_pr_curve.

    The statistics table is written to "<output_location>/<output_prefix>_stats.txt".
    Curves are plotted only when the matching flag is set; the curve points are passed
    to output_points.
    """
    # Per-method results, keyed by the method's position in the input lists
    fpr, tpr = {}, {}
    roc_auc_1, roc_auc_10, roc_auc_50, roc_auc_100 = {}, {}, {}, {}
    recall, precision = {}, {}
    prc_auc_1, prc_auc_10, prc_auc_50, prc_auc_100 = {}, {}, {}, {}

    footprint_file = args.footprint_file.split(",")
    footprint_name = args.footprint_name.split(",")
    footprint_type = args.footprint_type.split(",")

    max_score = 0
    if "SEG" in footprint_type:
        mpbs_regions = GenomicRegionSet("TFBS")
        mpbs_regions.read(args.tfbs_file)

        # Largest MPBS score (first tab-separated data field), plus one
        for region in mpbs_regions:
            max_score = max(max_score, int(region.data.split("\t")[0]))
        max_score += 1

    for i, prediction_fname in enumerate(footprint_file):
        footprints_regions = GenomicRegionSet("Footprints Prediction")
        footprints_regions.read(prediction_fname)
        footprints_regions.sort()

        if footprint_type[i] == "SEG":
            # MPBSs hit by a predicted footprint get their score raised by max_score.
            # NOTE(review): if intersect(mode=ORIGINAL) returns the very objects held by
            # mpbs_regions, this raise would carry over to later SEG inputs -- confirm.
            ranked_regions = GenomicRegionSet("Increased Regions")
            hit_regions = mpbs_regions.intersect(footprints_regions, mode=OverlapType.ORIGINAL)
            for region in hit_regions:
                region.data = str(int(region.data.split("\t")[0]) + max_score)
                ranked_regions.add(region)

            # MPBSs without any footprint keep their score
            missed_regions = mpbs_regions.subtract(footprints_regions, whole_region=True)
            for region in missed_regions:
                ranked_regions.add(region)
        elif footprint_type[i] == "SC":
            ranked_regions = footprints_regions
        else:
            # Unrecognised types produce no results for this entry.
            continue

        ranked_regions.sort_score()

        (fpr[i], tpr[i],
         roc_auc_1[i], roc_auc_10[i], roc_auc_50[i], roc_auc_100[i]) = roc_curve(ranked_regions)
        (recall[i], precision[i],
         prc_auc_1[i], prc_auc_10[i], prc_auc_50[i], prc_auc_100[i]) = \
            precision_recall_curve(ranked_regions)

    # Output the statistics results into text
    stats_fname = os.path.join(args.output_location, "{}_stats.txt".format(args.output_prefix))
    stats_header = ["METHOD", "AUC_100", "AUC_50", "AUC_10", "AUC_1",
                    "AUPR_100", "AUPR_50", "AUPR_10", "AUPR_1"]
    with open(stats_fname, "w") as stats_file:
        stats_file.write("\t".join(stats_header) + "\n")
        for i in range(len(footprint_name)):
            row = [footprint_name[i],
                   str(roc_auc_100[i]), str(roc_auc_50[i]), str(roc_auc_10[i]), str(roc_auc_1[i]),
                   str(prc_auc_100[i]), str(prc_auc_50[i]), str(prc_auc_10[i]), str(prc_auc_1[i])]
            stats_file.write("\t".join(row) + "\n")

    # Output the curves
    if args.print_roc_curve:
        plot_curve(footprint_name, args.output_location, fpr, tpr, roc_auc_100,
                   "False Positive Rate", "True Positive Rate", args.output_prefix, "ROC")
    if args.print_pr_curve:
        plot_curve(footprint_name, args.output_location, recall, precision, prc_auc_100,
                   "Recall", "Precision", args.output_prefix, "PRC")

    output_points(footprint_name, args.output_location, args.output_prefix,
                  fpr, tpr, recall, precision)
def chip_evaluate(self):
    """
    Evaluate the footprint predictions with motif-predicted binding sites (MPBSs)
    combined with TF ChIP-seq data.

    Reads self.footprint_file, self.footprint_name, self.footprint_type, self.tfbs_file,
    self.output_location, self.tf_name, self.print_roc_curve and self.print_pr_curve.
    Writes the statistics table to self.output_location + self.tf_name + "_stats.txt",
    plots the curves that are switched on, and passes the curve points to
    self.output_points.

    return:
    """
    # Per-method results, keyed by the method's position in the input lists
    fpr, tpr = {}, {}
    roc_auc, roc_auc_1, roc_auc_2 = {}, {}, {}
    recall, precision, prc_auc = {}, {}, {}

    if "SEG" in self.footprint_type:
        mpbs_regions = GenomicRegionSet("TFBS")
        mpbs_regions.read_bed(self.tfbs_file)
        mpbs_regions.sort()

        # Largest MPBS score, plus one
        max_score = -99999999
        for region in mpbs_regions:
            max_score = max(max_score, int(region.data))
        max_score += 1

    for i, prediction_fname in enumerate(self.footprint_file):
        footprints_regions = GenomicRegionSet("Footprints Prediction")
        footprints_regions.read_bed(prediction_fname)

        # Sort footprint prediction bed files
        footprints_regions.sort()

        if self.footprint_type[i] == "SEG":
            # MPBSs hit by a predicted footprint get their score raised by max_score.
            # NOTE(review): if intersect(mode=ORIGINAL) returns the very objects held by
            # mpbs_regions, this raise would carry over to later SEG inputs -- confirm.
            ranked_regions = GenomicRegionSet("Increased Regions")
            hit_regions = mpbs_regions.intersect(footprints_regions, mode=OverlapType.ORIGINAL)
            for region in hit_regions:
                region.data = str(int(region.data) + max_score)
                ranked_regions.add(region)

            # MPBSs without any footprint keep their score
            missed_regions = mpbs_regions.subtract(footprints_regions, whole_region=True)
            for region in missed_regions:
                ranked_regions.add(region)
        elif self.footprint_type[i] == "SC":
            ranked_regions = footprints_regions
        else:
            # Unrecognised types produce no results for this entry.
            continue

        ranked_regions.sort_score()

        fpr[i], tpr[i], roc_auc[i], roc_auc_1[i], roc_auc_2[i] = self.roc_curve(ranked_regions)
        recall[i], precision[i], prc_auc[i] = self.precision_recall_curve(ranked_regions)

    # Output the statistics results into text
    stats_fname = self.output_location + self.tf_name + "_stats.txt"
    stats_header = ["METHOD", "AUC_100", "AUC_10", "AUC_1", "AUPR"]
    with open(stats_fname, "w") as stats_file:
        stats_file.write("\t".join(stats_header) + "\n")
        for i in range(len(self.footprint_name)):
            row = [self.footprint_name[i],
                   str(roc_auc[i]), str(roc_auc_1[i]), str(roc_auc_2[i]), str(prc_auc[i])]
            stats_file.write("\t".join(row) + "\n")

    # Output the curves
    if self.print_roc_curve:
        self.plot_curve(fpr, tpr, roc_auc,
                        "False Positive Rate", "True Positive Rate", self.tf_name, "ROC")
    if self.print_pr_curve:
        self.plot_curve(recall, precision, prc_auc,
                        "Recall", "Precision", self.tf_name, "PRC")

    self.output_points(self.tf_name, fpr, tpr, recall, precision)
def chip_evaluate(args):
    """
    Score footprint predictions, write ROC / precision-recall statistics and plot curves.

    *Keyword arguments:*

        - args -- object exposing footprint_file, footprint_name and footprint_type
          (comma-separated strings), tfbs_file, output_location, output_prefix,
          print_roc_curve and print_pr_curve.

    The statistics table is written to "<output_location>/<output_prefix>_stats.txt".
    For every evaluated method a point count is collected and passed to plot_curve as
    max_points: the number of intersecting MPBSs for "SEG" inputs, the number of
    footprints for "SC" inputs.
    """
    # Per-method results, keyed by the method's position in the input lists
    fpr, tpr = {}, {}
    roc_auc_1, roc_auc_10, roc_auc_50, roc_auc_100 = {}, {}, {}, {}
    recall, precision = {}, {}
    prc_auc_1, prc_auc_10, prc_auc_50, prc_auc_100 = {}, {}, {}, {}

    footprint_file = args.footprint_file.split(",")
    footprint_name = args.footprint_name.split(",")
    footprint_type = args.footprint_type.split(",")

    max_score = 0
    if "SEG" in footprint_type:
        mpbs_regions = GenomicRegionSet("TFBS")
        mpbs_regions.read(args.tfbs_file)

        # Largest MPBS score (first tab-separated data field), plus one
        for region in mpbs_regions:
            max_score = max(max_score, int(region.data.split("\t")[0]))
        max_score += 1

    max_points = []
    for i, prediction_fname in enumerate(footprint_file):
        footprints_regions = GenomicRegionSet("Footprints Prediction")
        footprints_regions.read(prediction_fname)
        footprints_regions.sort()

        if footprint_type[i] == "SEG":
            # MPBSs hit by a predicted footprint get their score raised by max_score.
            # NOTE(review): if intersect(mode=ORIGINAL) returns the very objects held by
            # mpbs_regions, this raise would carry over to later SEG inputs -- confirm.
            ranked_regions = GenomicRegionSet("Increased Regions")
            hit_regions = mpbs_regions.intersect(footprints_regions, mode=OverlapType.ORIGINAL)
            for region in hit_regions:
                region.data = str(int(region.data.split("\t")[0]) + max_score)
                ranked_regions.add(region)

            # MPBSs without any footprint keep their score
            missed_regions = mpbs_regions.subtract(footprints_regions, whole_region=True)
            for region in missed_regions:
                ranked_regions.add(region)

            counted_regions = hit_regions
        elif footprint_type[i] == "SC":
            ranked_regions = footprints_regions
            counted_regions = footprints_regions
        else:
            # Unrecognised types produce no results and no point count for this entry.
            continue

        ranked_regions.sort_score()

        (fpr[i], tpr[i],
         roc_auc_1[i], roc_auc_10[i], roc_auc_50[i], roc_auc_100[i]) = roc_curve(ranked_regions)
        (recall[i], precision[i],
         prc_auc_1[i], prc_auc_10[i], prc_auc_50[i], prc_auc_100[i]) = \
            precision_recall_curve(ranked_regions)
        max_points.append(len(counted_regions))

    # Output the statistics results into text
    stats_fname = os.path.join(args.output_location, "{}_stats.txt".format(args.output_prefix))
    stats_header = ["METHOD", "AUC_100", "AUC_50", "AUC_10", "AUC_1",
                    "AUPR_100", "AUPR_50", "AUPR_10", "AUPR_1"]
    with open(stats_fname, "w") as stats_file:
        stats_file.write("\t".join(stats_header) + "\n")
        for i in range(len(footprint_name)):
            row = [footprint_name[i],
                   str(roc_auc_100[i]), str(roc_auc_50[i]), str(roc_auc_10[i]), str(roc_auc_1[i]),
                   str(prc_auc_100[i]), str(prc_auc_50[i]), str(prc_auc_10[i]), str(prc_auc_1[i])]
            stats_file.write("\t".join(row) + "\n")

    # Output the curves
    if args.print_roc_curve:
        plot_curve(footprint_name, args.output_location, fpr, tpr, roc_auc_100,
                   "False Positive Rate", "True Positive Rate", args.output_prefix, "ROC",
                   max_points=max_points)
    if args.print_pr_curve:
        plot_curve(footprint_name, args.output_location, recall, precision, prc_auc_100,
                   "Recall", "Precision", args.output_prefix, "PRC",
                   max_points=max_points)

    output_points(footprint_name, args.output_location, args.output_prefix,
                  fpr, tpr, recall, precision)
def chip_evaluate(self):
    """
    Use motif-predicted binding sites (MPBSs) together with TF ChIP-seq data to
    evaluate the footprint predictions.

    For each entry of self.footprint_file, ROC and precision-recall results are computed
    with self.roc_curve and self.precision_recall_curve. The table is written to
    self.output_location + self.tf_name + "_stats.txt", curves are plotted when
    self.print_roc_curve / self.print_pr_curve are set, and the curve points are passed
    to self.output_points.

    return:
    """
    # Evaluate Statistics
    fpr = {}
    tpr = {}
    roc_auc = {}
    roc_auc_1 = {}
    roc_auc_2 = {}
    recall = {}
    precision = {}
    prc_auc = {}

    seg_requested = "SEG" in self.footprint_type
    if seg_requested:
        mpbs_regions = GenomicRegionSet("TFBS")
        mpbs_regions.read_bed(self.tfbs_file)
        mpbs_regions.sort()

        # One more than the highest MPBS score
        max_score = -99999999
        for mpbs in mpbs_regions:
            mpbs_score = int(mpbs.data)
            max_score = mpbs_score if mpbs_score > max_score else max_score
        max_score += 1

    n_inputs = len(self.footprint_file)
    for i in range(n_inputs):
        footprints_regions = GenomicRegionSet("Footprints Prediction")
        footprints_regions.read_bed(self.footprint_file[i])

        # Sort footprint prediction bed files
        footprints_regions.sort()

        if self.footprint_type[i] == "SEG":
            # MPBSs overlapping a predicted footprint are moved up by max_score.
            # NOTE(review): if intersect(mode=ORIGINAL) returns the very objects held by
            # mpbs_regions, this shift would carry over to later SEG inputs -- confirm.
            evaluated = GenomicRegionSet("Increased Regions")
            overlapping = mpbs_regions.intersect(footprints_regions,
                                                 mode=OverlapType.ORIGINAL)
            for mpbs in overlapping:
                mpbs.data = str(int(mpbs.data) + max_score)
                evaluated.add(mpbs)

            # The remaining MPBSs are added with their score unchanged
            non_overlapping = mpbs_regions.subtract(footprints_regions,
                                                    whole_region=True)
            for mpbs in non_overlapping:
                evaluated.add(mpbs)
        elif self.footprint_type[i] == "SC":
            evaluated = footprints_regions
        else:
            # Unrecognised types produce no results for this entry.
            continue

        evaluated.sort_score()

        roc_result = self.roc_curve(evaluated)
        fpr[i], tpr[i], roc_auc[i], roc_auc_1[i], roc_auc_2[i] = roc_result
        prc_result = self.precision_recall_curve(evaluated)
        recall[i], precision[i], prc_auc[i] = prc_result

    # Output the statistics results into text
    stats_fname = self.output_location + self.tf_name + "_stats.txt"
    stats_header = ["METHOD", "AUC_100", "AUC_10", "AUC_1", "AUPR"]
    with open(stats_fname, "w") as stats_file:
        stats_file.write("\t".join(stats_header) + "\n")
        for i in range(len(self.footprint_name)):
            fields = [self.footprint_name[i], str(roc_auc[i]), str(roc_auc_1[i]),
                      str(roc_auc_2[i]), str(prc_auc[i])]
            stats_file.write("\t".join(fields) + "\n")

    # Output the curves
    if self.print_roc_curve:
        self.plot_curve(fpr, tpr, roc_auc,
                        "False Positive Rate", "True Positive Rate",
                        self.tf_name, "ROC")
    if self.print_pr_curve:
        self.plot_curve(recall, precision, prc_auc,
                        "Recall", "Precision",
                        self.tf_name, "PRC")

    self.output_points(self.tf_name, fpr, tpr, recall, precision)
print("output:\t" + args.o) if not args.t: print("Please define the file for target regions.") sys.exit(1) else: print("target:\t" + args.t) # with open(args.target) as f: t = GenomicRegionSet("targets") t.read_bed(args.t) # with open(args.i) as fi, open(args.o, "w") as fo: input_regions = GenomicRegionSet("input") input_regions.read_bed(args.i) output_regions = input_regions.subtract(t,whole_region=True) output_regions.write_bed(args.o) print("complete.") ############### BED add columns ############################################# elif args.mode == "bed_add_columns": print("input:\t" + args.i) print("reference:\t" + args.ref) print("output:\t" + args.o) if not args.ref: print("Please define the file for reference.") sys.exit(1) with open(args.ref) as f: genes = {}
def subtract(self, x):
    """
    Subtract x from this set in place.

    GenomicRegionSet.subtract is called with whole_region=False; its result is passed
    through self._reconstruct_info and stored in self.sequences. Nothing is returned.

    *Keyword arguments:*

        - x -- the set to subtract, forwarded unchanged to GenomicRegionSet.subtract
    """
    remaining = GenomicRegionSet.subtract(self, x, whole_region=False)
    self.sequences = self._reconstruct_info(remaining)
def subtract(self, x):
    """
    In-place subtraction: replace self.sequences with what is left after removing x.

    *Keyword arguments:*

        - x -- the set to subtract, forwarded unchanged to GenomicRegionSet.subtract
    """
    # The base-class subtraction runs with whole_region=False.
    difference = GenomicRegionSet.subtract(self, x, whole_region=False)
    # Its result is passed through _reconstruct_info before being stored.
    self.sequences = self._reconstruct_info(difference)