def evidence_run(args):
    """Label motif-predicted binding sites (MPBSs) with ChIP-seq evidence.

    Every region read from ``args.chip_file`` is replaced by a window of
    ``args.peak_ext`` bp around its summit. Each MPBS read from
    ``args.mpbs_file`` is renamed to ``<name>:Y`` when
    ``chip_summit_regions.include`` accepts it and to ``<name>:N`` otherwise
    (``<name>`` being the part of the original name before the first ":").
    The labelled regions are sorted and written to
    ``<args.output_location>/<args.output_prefix>.bed``.

    Keyword arguments:
    args -- Namespace providing chip_file, mpbs_file, peak_ext, output_location and output_prefix.
    """
    # Floor division keeps the coordinates integral; "/" is true division on
    # Python 3 and would turn every window boundary into a float.
    half_ext = args.peak_ext // 2

    # Expanding summits
    chip_summit_regions = GenomicRegionSet("TFBS Summit Regions")
    chip_summit_regions.read(args.chip_file)
    for region in chip_summit_regions:
        # The last whitespace-separated field of `data` is added to the region
        # start to obtain the summit position.
        summit = int(region.data.split()[-1]) + region.initial
        region.initial = max(summit - half_ext, 0)
        region.final = summit + half_ext

    # Calculating intersections
    mpbs_regions = GenomicRegionSet("MPBS Regions")
    mpbs_regions.read(args.mpbs_file)

    chip_summit_regions.sort()
    mpbs_regions.sort()

    tfbs_regions = GenomicRegionSet("TFBS Regions")
    for mpbs_region in mpbs_regions:
        label = "Y" if chip_summit_regions.include(mpbs_region) else "N"
        mpbs_region.name = mpbs_region.name.split(":")[0] + ":" + label
        tfbs_regions.add(mpbs_region)
    tfbs_regions.sort()

    tfbs_fname = os.path.join(args.output_location, "{}.bed".format(args.output_prefix))
    tfbs_regions.write(tfbs_fname)
def create_file(self):
    """Write a BED file of MPBSs labelled by overlap with ChIP-seq summit windows.

    Regions read from ``self.tfbs_summit_fname`` are replaced by windows of
    ``self.peak_ext`` bp around their summit. MPBSs read from
    ``self.mpbs_fname`` that are returned by the ORIGINAL-mode intersection
    with those windows are renamed ``<name>:Y``; MPBSs returned by the
    whole-region subtraction are renamed ``<name>:N``. The result is sorted
    and written to ``<self.output_location>/<self.mpbs_name>.bed``.
    """
    # Floor division keeps the coordinates integral; "/" is true division on
    # Python 3 and would turn every window boundary into a float.
    half_ext = self.peak_ext // 2

    # Expanding summits
    tfbs_summit_regions = GenomicRegionSet("TFBS Summit Regions")
    tfbs_summit_regions.read_bed(self.tfbs_summit_fname)
    for region in tfbs_summit_regions:
        # The last whitespace-separated field of `data` is added to the region
        # start to obtain the summit position.
        summit = int(region.data.split()[-1]) + region.initial
        region.initial = max(summit - half_ext, 0)
        region.final = summit + half_ext

    # Calculating intersections
    mpbs_regions = GenomicRegionSet("MPBS Regions")
    mpbs_regions.read_bed(self.mpbs_fname)

    tfbs_summit_regions.sort()
    mpbs_regions.sort()

    with_overlap_regions = mpbs_regions.intersect(tfbs_summit_regions, mode=OverlapType.ORIGINAL)
    without_overlap_regions = mpbs_regions.subtract(tfbs_summit_regions, whole_region=True)

    tfbs_regions = GenomicRegionSet("TFBS Regions")
    for region in with_overlap_regions:
        region.name = region.name.split(":")[0] + ":Y"
        tfbs_regions.add(region)
    for region in without_overlap_regions:
        region.name = region.name.split(":")[0] + ":N"
        tfbs_regions.add(region)
    tfbs_regions.sort()

    tfbs_fname = os.path.join(self.output_location, "{}.bed".format(self.mpbs_name))
    tfbs_regions.write_bed(tfbs_fname)
def fisher_table(motif_name, regions, mpbs, gene_set=False, mpbs_set=False):
    """Count the regions that do and do not overlap the binding sites of one motif.

    Keyword arguments:
    motif_name -- Motif identifier; a binding site is selected when this string occurs in its name.
    regions -- GenomicRegionSet whose regions are counted.
    mpbs -- GenomicRegionSet holding the motif-predicted binding sites.
    gene_set -- When true, also collect the gene names carried by the overlapping regions (default False).
    mpbs_set -- When true, also return the selected binding sites that overlap `regions` (default False).

    Return:
    a -- Number of regions returned by the ORIGINAL-mode intersection with the selected binding sites.
    b -- Number of regions returned by the whole-region subtraction of the selected binding sites.
    gene_set -- GeneSet named after the motif holding the unique gene names, or None when not requested.
    mpbs_set -- GenomicRegionSet of the selected binding sites overlapping `regions`, or None when not requested.
    """
    # Keep only the binding sites whose name mentions the motif
    motif_sites = GenomicRegionSet(name="mpbs_motif")
    for site in mpbs.sequences:
        if motif_name in site.name:
            motif_sites.add(site)

    # No binding site for this motif: nothing overlaps, everything is "without"
    if len(motif_sites) == 0:
        empty_genes = GeneSet(motif_name) if gene_set else None
        empty_sites = GenomicRegionSet(motif_sites.name) if mpbs_set else None
        return 0, len(regions), empty_genes, empty_sites

    # Regions overlapping / not overlapping the motif's binding sites
    overlapping = regions.intersect(motif_sites, mode=OverlapType.ORIGINAL, rm_duplicates=True)
    non_overlapping = regions.subtract(motif_sites, whole_region=True)

    genes_found = None
    if gene_set:
        genes_found = GeneSet(motif_name)
        for hit in overlapping.sequences:
            if not hit.name:
                continue
            # Names are ":"-separated gene lists; a leading "." is stripped
            genes_found.genes.extend(
                [token[1:] if token[0] == "." else token for token in hit.name.split(":")]
            )
        # Keep only unique genes
        genes_found.genes = list(set(genes_found.genes))

    sites_found = None
    if mpbs_set:
        sites_found = motif_sites.intersect(regions, mode=OverlapType.ORIGINAL, rm_duplicates=True)

    return len(overlapping), len(non_overlapping), genes_found, sites_found
def intersect(gnrsA, gnrsB, overlap_type):
    """Intersect two region sets through the C routines.

    Keyword arguments:
    gnrsA -- First region set; supplies the chromosome names and the name of the result.
    gnrsB -- Second region set.
    overlap_type -- 0 calls intersect_overlap_c, 1 calls intersect_original_c and
                    2 calls intersect_completely_included_c. For any other value
                    no routine is called and the result is empty.

    Return:
    A GenomicRegionSet named like gnrsA. Its regions carry only chromosome,
    start and end as reported by the C routine.
    """
    seqs_a = gnrsA.sequences
    seqs_b = gnrsB.sequences
    n_a = len(gnrsA)
    n_b = len(gnrsB)
    n_out = min(n_a, n_b)

    def as_c_strings(values, count):
        return (c_char_p * count)(*values)

    def as_c_ints(values, count):
        return (c_int * count)(*values)

    def out_buffer():
        # Pointer to a zero-initialised int array; it is handed to C by address
        return POINTER(c_int)((c_int * n_out)())

    # Convert the inputs to ctypes arrays
    chrom_names_a = [region.chrom for region in seqs_a]
    c_chroms_a = as_c_strings(chrom_names_a, n_a)
    c_chroms_b = as_c_strings([region.chrom for region in seqs_b], n_b)
    c_starts_a = as_c_ints([region.initial for region in seqs_a], n_a)
    c_starts_b = as_c_ints([region.initial for region in seqs_b], n_b)
    c_ends_a = as_c_ints([region.final for region in seqs_a], n_a)
    c_ends_b = as_c_ints([region.final for region in seqs_b], n_b)

    # Output parameters filled in by the C routine
    out_indices = out_buffer()
    out_starts = out_buffer()
    out_ends = out_buffer()
    out_size = c_int()

    # Pick the C routine; the names are looked up only for the chosen mode
    if overlap_type == 0:
        c_routine = intersect_overlap_c
    elif overlap_type == 1:
        c_routine = intersect_original_c
    elif overlap_type == 2:
        c_routine = intersect_completely_included_c
    else:
        c_routine = None

    if c_routine is not None:
        c_routine(c_chroms_a, c_starts_a, c_ends_a, n_a,
                  c_chroms_b, c_starts_b, c_ends_b, n_b,
                  pointer(out_indices), pointer(out_starts), pointer(out_ends), byref(out_size))

    # Rebuild Python regions; each index selects the chromosome name from gnrsA
    result = GenomicRegionSet(gnrsA.name)
    for k in range(out_size.value):
        result.add(GenomicRegion(chrom_names_a[out_indices[k]], out_starts[k], out_ends[k]))
    return result
def initialize(name, dims, genome_path, regions, stepsize, binsize, bamfiles, exts,
               inputs, exts_inputs, factors_inputs, chrom_sizes, verbose, no_gc_content,
               tracker, debug, norm_regions, scaling_factors_ip, save_wig, housekeeping_genes):
    """Initialize the MultiCoverageSet.

    The regions to analyse are taken from the tab-separated file `regions`
    (columns: chromosome, start, end) when it is given; otherwise every
    chromosome listed in `chrom_sizes` (columns: chromosome, length) is used
    from position 0 to its length. Blank lines in either file are skipped.

    When `housekeeping_genes` is set, `scaling_factors_ip` is replaced by the
    first value returned by `norm_gene_level`. Non-empty scaling factors and
    the computed extension sizes are logged through `tracker`.

    Return:
    The MultiCoverageSet built from the collected regions and parameters.
    """
    regionset = GenomicRegionSet(name)
    chrom_sizes_dict = {}

    # if regions option is set, take the values, otherwise the whole set of
    # chromosomes as region to search for DPs
    if regions is not None:
        print("Call DPs on specified regions.", file=sys.stderr)
        with open(regions) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # a blank (e.g. trailing) line would raise IndexError below
                    continue
                fields = line.split('\t')
                c, s, e = fields[0], int(fields[1]), int(fields[2])
                regionset.add(GenomicRegion(chrom=c, initial=s, final=e))
                # the end of the last region read for a chromosome wins
                chrom_sizes_dict[c] = e
    else:
        print("Call DPs on whole genome.", file=sys.stderr)
        with open(chrom_sizes) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                fields = line.split('\t')
                chrom, end = fields[0], int(fields[1])
                regionset.add(GenomicRegion(chrom=chrom, initial=0, final=end))
                chrom_sizes_dict[chrom] = end

    if norm_regions:
        norm_regionset = GenomicRegionSet('norm_regions')
        norm_regionset.read_bed(norm_regions)
    else:
        norm_regionset = None

    if housekeeping_genes:
        scaling_factors_ip, _ = norm_gene_level(bamfiles, housekeeping_genes, name, verbose=True)

    if scaling_factors_ip:
        # Pass a real list: on Python 3 map() returns a one-shot iterator,
        # not the list of strings this call produced on Python 2.
        tracker.write(text=[str(x) for x in scaling_factors_ip], header="Scaling factors")

    regionset.sequences.sort()
    exts, exts_inputs = _compute_extension_sizes(bamfiles, exts, inputs, exts_inputs, verbose)
    tracker.write(text=str(exts).strip('[]'), header="Extension size (rep1, rep2, input1, input2)")

    multi_cov_set = MultiCoverageSet(
        name=name, regions=regionset, dims=dims, genome_path=genome_path,
        binsize=binsize, stepsize=stepsize, rmdup=True,
        path_bamfiles=bamfiles, path_inputs=inputs, exts=exts, exts_inputs=exts_inputs,
        factors_inputs=factors_inputs, chrom_sizes=chrom_sizes, verbose=verbose,
        no_gc_content=no_gc_content, chrom_sizes_dict=chrom_sizes_dict, debug=debug,
        norm_regionset=norm_regionset, scaling_factors_ip=scaling_factors_ip, save_wig=save_wig)
    return multi_cov_set
def merge_delete(ext_size, merge, peak_list, pvalue_list):
    """Collect the peaks longer than `ext_size` into one sorted GenomicRegionSet.

    Keyword arguments:
    ext_size -- Minimum length (exclusive) a peak must exceed to be kept; half of
                it is also the distance used when merging neighbouring peaks.
    merge -- When true, the potentially mergeable '+' and '-' sets are extended,
             merged, shrunk back and post-processed with `merge_data`.
    peak_list -- Sequence of peaks; items 0-6 are chrom, start, end, c1, c2, strand, ratio.
    pvalue_list -- Sequence of p-values, parallel to `peak_list`.

    Return:
    GenomicRegionSet with all kept regions, sorted. Each region's `data` is the
    string form of ``(c1, c2, pvalue, ratio)``. Peaks not longer than `ext_size`
    and peaks whose strand is neither '+' nor '-' are dropped.
    """
    regions_plus = GenomicRegionSet('regions')  # pot. mergeable
    regions_minus = GenomicRegionSet('regions')  # pot. mergeable
    regions_unmergable = GenomicRegionSet('regions')
    # NOTE(review): last_orientation is never reassigned, so every kept peak
    # currently ends up in regions_unmergable. Left as is because the intended
    # update rule cannot be determined from this function - confirm with the author.
    last_orientation = ""

    for i, t in enumerate(peak_list):
        chrom, start, end, c1, c2, strand, ratio = t[0], t[1], t[2], t[3], t[4], t[5], t[6]
        r = GenomicRegion(chrom=chrom, initial=start, final=end, name='',
                          orientation=strand, data=str((c1, c2, pvalue_list[i], ratio)))
        if end - start > ext_size:
            if strand == '+':
                if last_orientation == '+':
                    # was `region_plus` (undefined name -> NameError)
                    regions_plus.add(r)
                else:
                    regions_unmergable.add(r)
            elif strand == '-':
                if last_orientation == '-':
                    # was `region_mins` (undefined name -> NameError)
                    regions_minus.add(r)
                else:
                    regions_unmergable.add(r)

    if merge:
        # Integer half-extension: "/" is true division on Python 3 and would
        # produce float coordinates. Using the same value for growing and
        # shrinking restores the original boundaries of unmerged regions.
        half_ext = ext_size // 2
        for mergeable in (regions_plus, regions_minus):
            mergeable.extend(half_ext, half_ext)
            mergeable.merge()
            mergeable.extend(-half_ext, -half_ext)
            merge_data(mergeable)

    results = GenomicRegionSet('regions')
    for subset in (regions_plus, regions_minus, regions_unmergable):
        for el in subset:
            results.add(el)
    results.sort()

    return results
def merge_delete(ext_size, merge, peak_list, pvalue_list):
    """Collect the peaks longer than `ext_size` into one sorted GenomicRegionSet.

    Keyword arguments:
    ext_size -- Minimum length (exclusive) a peak must exceed to be kept; half of
                it is also the distance used when merging neighbouring peaks.
    merge -- When true, the potentially mergeable '+' and '-' sets are extended,
             merged, shrunk back and post-processed with `merge_data`.
    peak_list -- Sequence of peaks; items 0-6 are chrom, start, end, c1, c2, strand, ratio.
    pvalue_list -- Sequence of p-values, parallel to `peak_list`.

    Return:
    GenomicRegionSet with all kept regions, sorted. Each region's `data` is the
    string form of ``(c1, c2, pvalue, ratio)``. Peaks not longer than `ext_size`
    and peaks whose strand is neither '+' nor '-' are dropped.
    """
    regions_plus = GenomicRegionSet('regions')  # pot. mergeable
    regions_minus = GenomicRegionSet('regions')  # pot. mergeable
    regions_unmergable = GenomicRegionSet('regions')
    # NOTE(review): last_orientation is never reassigned, so every kept peak
    # currently ends up in regions_unmergable. Left as is because the intended
    # update rule cannot be determined from this function - confirm with the author.
    last_orientation = ""

    for i, t in enumerate(peak_list):
        chrom, start, end, c1, c2, strand, ratio = t[0], t[1], t[2], t[3], t[4], t[5], t[6]
        r = GenomicRegion(chrom=chrom, initial=start, final=end, name='',
                          orientation=strand, data=str((c1, c2, pvalue_list[i], ratio)))
        if end - start > ext_size:
            if strand == '+':
                if last_orientation == '+':
                    # was `region_plus` (undefined name -> NameError)
                    regions_plus.add(r)
                else:
                    regions_unmergable.add(r)
            elif strand == '-':
                if last_orientation == '-':
                    # was `region_mins` (undefined name -> NameError)
                    regions_minus.add(r)
                else:
                    regions_unmergable.add(r)

    if merge:
        # Integer half-extension: "/" is true division on Python 3 and would
        # produce float coordinates. Using the same value for growing and
        # shrinking restores the original boundaries of unmerged regions.
        half_ext = ext_size // 2
        for mergeable in (regions_plus, regions_minus):
            mergeable.extend(half_ext, half_ext)
            mergeable.merge()
            mergeable.extend(-half_ext, -half_ext)
            merge_data(mergeable)

    results = GenomicRegionSet('regions')
    for subset in (regions_plus, regions_minus, regions_unmergable):
        for el in subset:
            results.add(el)
    results.sort()

    return results
def get_training_regionset(self):
    """Return the next region of `self.regionset` wrapped in its own set.

    Each call hands out the region at position `self.counter` and advances the
    counter by one.

    Return:
    A GenomicRegionSet holding exactly one region, or None once
    `self.counter` has reached `len(self.chrom_sizes_dict)` or the end of
    `self.regionset`.
    """
    # Check for exhaustion BEFORE indexing: the lookup used to come first and
    # raised IndexError instead of reaching the `return None` branch.
    if self.counter == len(self.chrom_sizes_dict) or self.counter >= len(self.regionset):
        return None

    r = GenomicRegionSet('')
    r.add(self.regionset[self.counter])
    self.counter += 1
    return r

#if regions option is set, take the values, otherwise the whole set of
#chromosomes as region to search for DPs
#     if test:
#         contained_chrom = ['chr1', 'chr2']
#     else:
#         #contained_chrom = get_all_chrom(bamfiles)
#         contained_chrom = ['chr1', 'chr2']
def initialize(name, dims, genome_path, regions, stepsize, binsize, bamfiles, exts,
               inputs, exts_inputs, factors_inputs, chrom_sizes, verbose, no_gc_content,
               tracker, debug, norm_regions, scaling_factors_ip, save_wig):
    """Initialize the MultiCoverageSet.

    The regions to analyse are taken from the tab-separated file `regions`
    (columns: chromosome, start, end) when it is given; otherwise every
    chromosome listed in `chrom_sizes` (columns: chromosome, length) is used
    from position 0 to its length. Blank lines in either file are skipped.
    The computed extension sizes are logged through `tracker`.

    Return:
    The MultiCoverageSet built from the collected regions and parameters.
    """
    regionset = GenomicRegionSet(name)
    chrom_sizes_dict = {}

    # if regions option is set, take the values, otherwise the whole set of
    # chromosomes as region to search for DPs
    if regions is not None:
        print("Call DPs on specified regions.", file=sys.stderr)
        with open(regions) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # a blank (e.g. trailing) line would raise IndexError below
                    continue
                fields = line.split('\t')
                c, s, e = fields[0], int(fields[1]), int(fields[2])
                regionset.add(GenomicRegion(chrom=c, initial=s, final=e))
                # the end of the last region read for a chromosome wins
                chrom_sizes_dict[c] = e
    else:
        print("Call DPs on whole genome.", file=sys.stderr)
        with open(chrom_sizes) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                fields = line.split('\t')
                chrom, end = fields[0], int(fields[1])
                regionset.add(GenomicRegion(chrom=chrom, initial=0, final=end))
                chrom_sizes_dict[chrom] = end

    if norm_regions:
        norm_regionset = GenomicRegionSet('norm_regions')
        norm_regionset.read_bed(norm_regions)
    else:
        norm_regionset = None

    regionset.sequences.sort()
    exts, exts_inputs = _compute_extension_sizes(bamfiles, exts, inputs, exts_inputs, verbose)
    tracker.write(text=str(exts).strip('[]'), header="Extension size (rep1, rep2, input1, input2)")

    multi_cov_set = MultiCoverageSet(
        name=name, regions=regionset, dims=dims, genome_path=genome_path,
        binsize=binsize, stepsize=stepsize, rmdup=True,
        path_bamfiles=bamfiles, path_inputs=inputs, exts=exts, exts_inputs=exts_inputs,
        factors_inputs=factors_inputs, chrom_sizes=chrom_sizes, verbose=verbose,
        no_gc_content=no_gc_content, chrom_sizes_dict=chrom_sizes_dict, debug=debug,
        norm_regionset=norm_regionset, scaling_factors_ip=scaling_factors_ip, save_wig=save_wig)
    return multi_cov_set
def rna_associated_gene(rna_regions, name, organism):
    """Return the genes associated with the locus spanned by an RNA's regions.

    Keyword arguments:
    rna_regions -- Sequence of regions; items 0-3 of each are chrom, start, end, strand.
                   Chromosome and strand are taken from the first region.
    name -- Name of the RNA; associated genes whose name contains it are left out.
    organism -- Organism passed on to the gene association.

    Return:
    The remaining unique gene names joined by ":", or "." when `rna_regions`
    is empty or no gene remains.
    """
    if not rna_regions:
        return "."

    # One region from the smallest start to the largest end of all pieces
    chrom = rna_regions[0][0]
    span_start = min([piece[1] for piece in rna_regions])
    span_end = max([piece[2] for piece in rna_regions])
    strand = rna_regions[0][3]

    locus = GenomicRegionSet("RNA associated genes")
    locus.add(GenomicRegion(chrom=chrom, initial=span_start, final=span_end,
                            name=name, orientation=strand))
    associated = locus.gene_association(organism=organism, promoterLength=1000, show_dis=True)

    # The association result lists the genes ":"-separated in the region name
    candidates = associated[0].name.split(":")
    neighbours = set([gene for gene in candidates if name not in gene])
    if not neighbours:
        return "."
    return ":".join(neighbours)
def create_file(self):
    """Write a BED file of MPBSs labelled by overlap with ChIP-seq summit windows.

    Regions read from ``self.tfbs_summit_fname`` are replaced by windows of
    ``self.peak_ext`` bp around their summit. MPBSs read from
    ``self.mpbs_fname`` that are returned by the ORIGINAL-mode intersection
    with those windows are renamed ``<name>:Y``; MPBSs returned by the
    whole-region subtraction are renamed ``<name>:N``. The result is sorted
    and written to ``<self.output_location>/<self.mpbs_name>.bed``.
    """
    # Floor division keeps the coordinates integral; "/" is true division on
    # Python 3 and would turn every window boundary into a float.
    half_ext = self.peak_ext // 2

    # Expanding summits
    tfbs_summit_regions = GenomicRegionSet("TFBS Summit Regions")
    tfbs_summit_regions.read_bed(self.tfbs_summit_fname)
    for region in tfbs_summit_regions:
        # The last whitespace-separated field of `data` is added to the region
        # start to obtain the summit position.
        summit = int(region.data.split()[-1]) + region.initial
        region.initial = max(summit - half_ext, 0)
        region.final = summit + half_ext

    # Calculating intersections
    mpbs_regions = GenomicRegionSet("MPBS Regions")
    mpbs_regions.read_bed(self.mpbs_fname)

    tfbs_summit_regions.sort()
    mpbs_regions.sort()

    with_overlap_regions = mpbs_regions.intersect(tfbs_summit_regions, mode=OverlapType.ORIGINAL)
    without_overlap_regions = mpbs_regions.subtract(tfbs_summit_regions, whole_region=True)

    tfbs_regions = GenomicRegionSet("TFBS Regions")
    for region in with_overlap_regions:
        region.name = region.name.split(":")[0] + ":Y"
        tfbs_regions.add(region)
    for region in without_overlap_regions:
        region.name = region.name.split(":")[0] + ":N"
        tfbs_regions.add(region)
    tfbs_regions.sort()

    tfbs_fname = os.path.join(self.output_location, "{}.bed".format(self.mpbs_name))
    tfbs_regions.write_bed(tfbs_fname)
def chip_evaluate(args):
    """Score footprint predictions and write statistics, curves and curve points.

    `args.footprint_file`, `args.footprint_name` and `args.footprint_type` are
    comma-separated lists describing the methods to evaluate. A method of type
    "SEG" is scored on the binding sites of `args.tfbs_file`: sites overlapping
    a predicted footprint get their score raised by (maximum site score + 1).
    A method of type "SC" is scored directly on its own regions. Methods of any
    other type are not scored.

    The AUC / AUPR values are written to
    ``<args.output_location>/<args.output_prefix>_stats.txt``; ROC and PR plots
    are produced when `args.print_roc_curve` / `args.print_pr_curve` are set.
    """
    footprint_file = args.footprint_file.split(",")
    footprint_name = args.footprint_name.split(",")
    footprint_type = args.footprint_type.split(",")

    # Per-method results, keyed by the position of the method in the lists
    fpr, tpr = {}, {}
    roc_auc_1, roc_auc_10, roc_auc_50, roc_auc_100 = {}, {}, {}, {}
    recall, precision = {}, {}
    prc_auc_1, prc_auc_10, prc_auc_50, prc_auc_100 = {}, {}, {}, {}

    max_score = 0
    if "SEG" in footprint_type:
        mpbs_regions = GenomicRegionSet("TFBS")
        mpbs_regions.read(args.tfbs_file)

        # Offset that lifts any site above every unmodified score
        for site in iter(mpbs_regions):
            max_score = max(max_score, int(site.data.split("\t")[0]))
        max_score += 1

    max_points = []
    for idx in range(len(footprint_file)):
        footprints = GenomicRegionSet("Footprints Prediction")
        footprints.read(footprint_file[idx])
        footprints.sort()

        method_type = footprint_type[idx]
        if method_type == "SEG":
            # Sites overlapping a footprint get their score raised once ...
            ranked = GenomicRegionSet("Increased Regions")
            counted = mpbs_regions.intersect(footprints, mode=OverlapType.ORIGINAL)
            for site in iter(counted):
                site.data = str(int(site.data.split("\t")[0]) + max_score)
                ranked.add(site)

            # ... all remaining sites keep their score
            for site in iter(mpbs_regions.subtract(footprints, whole_region=True)):
                ranked.add(site)
        elif method_type == "SC":
            ranked = footprints
            counted = footprints
        else:
            continue

        ranked.sort_score()
        (fpr[idx], tpr[idx], roc_auc_1[idx], roc_auc_10[idx],
         roc_auc_50[idx], roc_auc_100[idx]) = roc_curve(ranked)
        (recall[idx], precision[idx], prc_auc_1[idx], prc_auc_10[idx],
         prc_auc_50[idx], prc_auc_100[idx]) = precision_recall_curve(ranked)
        max_points.append(len(counted))

    # Output the statistics results into text
    stats_fname = os.path.join(args.output_location, "{}_stats.txt".format(args.output_prefix))
    stats_header = ["METHOD", "AUC_100", "AUC_50", "AUC_10", "AUC_1", "AUPR_100", "AUPR_50", "AUPR_10", "AUPR_1"]
    with open(stats_fname, "w") as stats_file:
        stats_file.write("\t".join(stats_header) + "\n")
        for idx, method in enumerate(footprint_name):
            row = [method,
                   str(roc_auc_100[idx]), str(roc_auc_50[idx]), str(roc_auc_10[idx]), str(roc_auc_1[idx]),
                   str(prc_auc_100[idx]), str(prc_auc_50[idx]), str(prc_auc_10[idx]), str(prc_auc_1[idx])]
            stats_file.write("\t".join(row) + "\n")

    # Output the curves
    if args.print_roc_curve:
        plot_curve(footprint_name, args.output_location, fpr, tpr, roc_auc_100,
                   "False Positive Rate", "True Positive Rate",
                   args.output_prefix, "ROC", max_points=max_points)
    if args.print_pr_curve:
        plot_curve(footprint_name, args.output_location, recall, precision, prc_auc_100,
                   "Recall", "Precision",
                   args.output_prefix, "PRC", max_points=max_points)

    output_points(footprint_name, args.output_location, args.output_prefix, fpr, tpr, recall, precision)
def __iter__(self):
    """Yield each element of `self.regionset` wrapped in a fresh, unnamed GenomicRegionSet."""
    for region in self.regionset:
        single = GenomicRegionSet('')
        single.add(region)
        yield single
# Report the chosen output path and organism.
print("output:\t" + args.o)
print("organism:\t" + args.organism)

gene = GenomicRegionSet("genes")
### Input BED file
# A ".bed" input is read as gene regions; each region is rewritten IN PLACE
# into its promoter and collected in `promoter`.
if args.i.endswith(".bed"):
    gene.read_bed(args.i)
    promoter = GenomicRegionSet("promoter")
    promoterLength = int(args.l)
    for s in gene:
        if s.orientation == "+":
            # "+" strand: the promoterLength bases before the start, clamped at 0
            s.initial, s.final = max(s.initial-promoterLength, 0), s.initial
        else:
            # any other orientation: the promoterLength bases after the end
            s.initial, s.final = s.final, s.final+promoterLength
        promoter.add(s)
### Input gene list
# Any other input is read as a gene list and the promoters come from the
# annotation of args.organism.
# NOTE(review): args.l is passed here without the int() conversion used in the
# BED branch - confirm get_promoters accepts the raw argument type.
else:
    ann = AnnotationSet(gene_source=args.organism, alias_source=args.organism,
                        filter_havana=False, protein_coding=False, known_only=False)
    de_gene = GeneSet("de genes")
    de_gene.read(args.i)
    print(len(de_gene))
    promoter = ann.get_promoters(promoterLength=args.l, gene_set=de_gene, unmaplist=False)
    #print(len(de_prom))

#print(len(promoter))
# Write the promoters of either branch to the output BED file.
promoter.write_bed(args.o)
ints = [gr.initial for gr in gnrsB.sequences] initialsB = (c_int * len(ints))(*ints) ints = [gr.final for gr in gnrsA.sequences] finalsA = (c_int * len(ints))(*ints) ints = [gr.final for gr in gnrsB.sequences] finalsB = (c_int * len(ints))(*ints) # Call C-function return jaccardC(chromsA, initialsA, finalsA, len(gnrsA), chromsB, initialsB, finalsB, len(gnrsB)) set1 = GenomicRegionSet("A") set1.add(GenomicRegion("chr1", 0, 10)) set1.add(GenomicRegion("chr1", 15, 20)) set1.add(GenomicRegion("chr1", 30, 45)) print(set1.sequences) set2 = GenomicRegionSet("B") set2.add(GenomicRegion("chr1", 0, 5)) set2.add(GenomicRegion("chr1", 10, 25)) set2.add(GenomicRegion("chr1", 35, 45)) print(set2.sequences) jaccard2 = jaccardIndex(set1, set2) print("jaccard2", jaccard2) def intersect(gnrsA, gnrsB, overlap_type): # Convert to ctypes
def initialize(name, genome_path, regions, stepsize, binsize, bam_file_1, bam_file_2, ext_1, ext_2,
               input_1, input_factor_1, ext_input_1, input_2, input_factor_2, ext_input_2, chrom_sizes, verbose,
               norm_strategy, no_gc_content, deadzones, factor_input_1, factor_input_2, debug, tracker):
    """Build the DualCoverageSet for two signal files and their optional inputs.

    The regions to analyse are taken from the tab-separated file `regions`
    (columns: chromosome, start, end) when it is given; otherwise every
    chromosome listed in `chrom_sizes` (columns: chromosome, length) is used
    from position 0 to its length. Blank lines in either file are skipped.

    Read extension sizes that are None are computed with `get_extension_size`.
    When both input files are the same file and the size of the first one was
    just computed, that result is reused for the second one. With `verbose`
    set, the value tables returned for computed sizes are written to
    ``<name>-read-ext-*`` files.

    Return:
    Tuple of the DualCoverageSet and the list ``[ext_1, ext_2]``.
    """
    regionset = GenomicRegionSet(name)
    chrom_sizes_dict = {}

    # if regions option is set, take the values, otherwise the whole set of
    # chromosomes as region to search for DPs
    if regions is not None:
        with open(regions) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # a blank (e.g. trailing) line would raise IndexError below
                    continue
                fields = line.split('\t')
                c, s, e = fields[0], int(fields[1]), int(fields[2])
                regionset.add(GenomicRegion(chrom=c, initial=s, final=e))
                # the end of the last region read for a chromosome wins
                chrom_sizes_dict[c] = e
    else:
        with open(chrom_sizes) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                fields = line.split('\t')
                chrom, chrom_end = fields[0], int(fields[1])
                regionset.add(GenomicRegion(chrom=chrom, initial=0, final=chrom_end))
                chrom_sizes_dict[chrom] = chrom_end

    regionset.sequences.sort()

    # Search window handed to get_extension_size
    start = 0
    end = 600
    ext_stepsize = 5

    # Explicit state replaces the former `'name' in locals()` checks: a values
    # table stays None unless the corresponding size is computed here.
    values_1 = values_2 = values_input_1 = values_input_2 = None
    input_1_computed = False

    # compute extension size
    if None in (ext_1, ext_2, ext_input_1, ext_input_2):
        print("Computing read extension sizes...", file=sys.stderr)
    if ext_1 is None:
        ext_1, values_1 = get_extension_size(bam_file_1, start=start, end=end, stepsize=ext_stepsize)
        print("Read extension for first file: %s" % ext_1, file=sys.stderr)
    if ext_2 is None:
        ext_2, values_2 = get_extension_size(bam_file_2, start=start, end=end, stepsize=ext_stepsize)
        print("Read extension for second file: %s" % ext_2, file=sys.stderr)
    if input_1 is not None and ext_input_1 is None:
        ext_input_1, values_input_1 = get_extension_size(input_1, start=start, end=end, stepsize=ext_stepsize)
        input_1_computed = True
        print("Read extension for first input file: %s" % ext_input_1, file=sys.stderr)
    if input_1 is not None and input_2 is not None and input_1 == input_2 and input_1_computed:
        # Same input file for both conditions: reuse the result just computed
        ext_input_2, values_input_2 = ext_input_1, values_input_1
    elif input_2 is not None and ext_input_2 is None:
        ext_input_2, values_input_2 = get_extension_size(input_2, start=start, end=end, stepsize=ext_stepsize)
        print("Read extension for second input file: %s" % ext_input_2, file=sys.stderr)

    tracker.write(text=str(ext_1) + "," + str(ext_2), header="Extension size IP1, IP2")
    if input_1 is not None and input_2 is not None:
        tracker.write(text=str(ext_input_1) + "," + str(ext_input_2), header="Extension size Control1, Control2")

    if verbose:
        value_tables = (('-read-ext-1', values_1),
                        ('-read-ext-2', values_2),
                        ('-read-ext-input-1', values_input_1),
                        ('-read-ext-input-2', values_input_2))
        for suffix, values in value_tables:
            if values is not None:
                with open(name + suffix, 'w') as f:
                    for v, i in values:
                        print(i, v, sep='\t', file=f)

    cov_cdp_mpp = DualCoverageSet(
        name=name, region=regionset, genome_path=genome_path, binsize=binsize, stepsize=stepsize, rmdup=True,
        file_1=bam_file_1, ext_1=ext_1,
        file_2=bam_file_2, ext_2=ext_2,
        input_1=input_1, ext_input_1=ext_input_1, input_factor_1=input_factor_1,
        input_2=input_2, ext_input_2=ext_input_2, input_factor_2=input_factor_2,
        chrom_sizes=chrom_sizes, verbose=verbose, norm_strategy=norm_strategy,
        no_gc_content=no_gc_content, deadzones=deadzones,
        factor_input_1=factor_input_1, factor_input_2=factor_input_2,
        chrom_sizes_dict=chrom_sizes_dict, debug=debug, tracker=tracker)

    return cov_cdp_mpp, [ext_1, ext_2]
def dbd_regions(exons, sig_region, rna_name, output,out_file=False, temp=None, fasta=True):
    """Generate the BED file of significant DBD regions and FASTA file of the sequences

    Keyword arguments:
    exons -- Sequence of exons; items 0-3 of each are used as chrom, start, end, strand.
             Chromosome and strand are always taken from the first exon.
    sig_region -- Significant binding regions; nothing is done when it is empty.
    rna_name -- RNA name used in the output file names and FASTA headers.
    output -- Output directory, or the BED file path itself when out_file is true.
    out_file -- When true, `output` is the BED path and the FASTA goes to output + ".fa".
    temp -- Directory holding "rna_temp.fa"; only used when out_file is true.
    fasta -- When true, also write the sequences of the regions as FASTA.

    NOTE(review): in the minus-strand branch `exon[1]` may be swapped with
    `dbdstart`, which modifies the caller's exon in place - confirm intended.
    """
    if len(sig_region) == 0:
        return
    #print(self.rna_regions)
    if not exons:
        pass
    else:
        dbd = GenomicRegionSet("DBD")
        dbdmap = {}
        if len(exons) == 1:
            print("## Warning: No information of exons in the given RNA sequence, the DBD position may be problematic. ")
        for rbs in sig_region:
            loop = True

            if exons[0][3] == "-":
                # Minus strand: positions are counted back from the exon end (exon[2])
                while loop:
                    # cf accumulates the lengths of the exons already passed
                    cf = 0
                    for exon in exons:
                        #print(exon)
                        l = abs(exon[2] - exon[1])
                        # tail = accumulated length up to the end of this exon
                        tail = cf + l

                        if cf <= rbs.initial <= tail:
                            # The region starts inside this exon
                            dbdstart = exon[2] - rbs.initial + cf

                            if rbs.final <= tail:
                                #print("1")
                                # ... and also ends inside it: a single region
                                dbdend = exon[2] - rbs.final + cf
                                if dbdstart > dbdend: dbdstart, dbdend = dbdend, dbdstart
                                dbd.add( GenomicRegion(chrom=exons[0][0], initial=dbdstart, final=dbdend,
                                                       orientation=exons[0][3],
                                                       name=str(rbs.initial)+"-"+str(rbs.final) ) )
                                dbdmap[str(rbs)] = dbd[-1].toString() + " strand:-"
                                loop = False
                                break
                            elif rbs.final > tail:
                                # ... but ends beyond it: emit the first part ("_split1");
                                # `subtract` (length of that part) is reused for "_split2"
                                subtract = l + cf - rbs.initial
                                #print("2")
                                #print("Subtract: "+str(subtract))
                                if dbdstart > exon[1]: dbdstart, exon[1] = exon[1], dbdstart
                                dbd.add( GenomicRegion(chrom=exons[0][0], initial=dbdstart, final=exon[1],
                                                       orientation=exons[0][3],
                                                       name=str(rbs.initial)+"-"+str(rbs.initial+subtract)+"_split1" ) )

                        elif rbs.initial < cf and rbs.final <= tail:
                            #print("3")
                            # The region started in an earlier exon and ends in this one:
                            # emit the second part ("_split2")
                            dbdstart = exon[2]
                            dbdend = exon[2] - rbs.final + rbs.initial + subtract
                            if dbdstart > dbdend: dbdstart, dbdend = dbdend, dbdstart
                            dbd.add( GenomicRegion(chrom=exons[0][0], initial=dbdstart, final=dbdend,
                                                   orientation=exons[0][3],
                                                   name=str(cf)+"-"+str(rbs.final)+"_split2" ) )
                            dbdmap[str(rbs)] = dbd[-2].toString() + " & " + dbd[-1].toString() + " strand:-"
                            loop = False
                            break

                        elif rbs.initial > tail:
                            pass

                        cf += l

                    # One pass over the exons is all that is ever made
                    loop = False
            else:
                # Any other strand: positions are counted forward from the exon start (exon[1])
                while loop:
                    # cf accumulates the lengths of the exons already passed
                    cf = 0
                    for exon in exons:
                        #print(exon)
                        l = exon[2] - exon[1]
                        # tail = accumulated length up to the end of this exon
                        tail = cf + l
                        #print("cf:   " + str(cf))
                        #print("tail: " + str(tail) )
                        if cf <= rbs.initial <= tail:
                            # The region starts inside this exon
                            dbdstart = exon[1] + rbs.initial - cf

                            if rbs.final <= tail:
                                #print("1")
                                # ... and also ends inside it: a single region
                                dbdend = exon[1] + rbs.final -cf
                                dbd.add( GenomicRegion(chrom=exons[0][0], initial=dbdstart, final=dbdend,
                                                       orientation=exons[0][3],
                                                       name=str(rbs.initial)+"-"+str(rbs.final) ) )
                                dbdmap[str(rbs)] = dbd[-1].toString() + " strand:+"
                                loop = False
                                break
                            elif rbs.final > tail:
                                # ... but ends beyond it: emit the first part ("_split1");
                                # `subtract` (length of that part) is reused for "_split2"
                                subtract = l + cf - rbs.initial
                                #print("2")
                                #print("Subtract: "+str(subtract))
                                dbd.add( GenomicRegion(chrom=exons[0][0], initial=dbdstart, final=exon[2],
                                                       orientation=exons[0][3],
                                                       name=str(rbs.initial)+"-"+str(rbs.initial+subtract)+"_split1" ) )

                        elif rbs.initial < cf and rbs.final <= tail:
                            #print("3")
                            # The region started in an earlier exon and ends in this one:
                            # emit the second part ("_split2")
                            dbdstart = exon[1]
                            dbdend = exon[1] + rbs.final - rbs.initial - subtract
                            dbd.add( GenomicRegion(chrom=exons[0][0], initial=dbdstart, final=dbdend,
                                                   orientation=exons[0][3],
                                                   name=str(cf)+"-"+str(rbs.final)+"_split2" ) )
                            dbdmap[str(rbs)] = dbd[-2].toString() + " & " + dbd[-1].toString() + " strand:+"
                            loop = False
                            break

                        elif rbs.initial > tail:
                            pass

                        cf += l

                    # One pass over the exons is all that is ever made
                    loop = False

        # BED output: inside the output directory, or at the given path
        if not out_file:
            dbd.write_bed(filename=os.path.join(output, "DBD_"+rna_name+".bed"))
        else:
            # print(dbd)
            # print(dbd.sequences[0])
            dbd.write_bed(filename=output)
    # FASTA
    if fasta:
        #print(dbdmap)
        # "rna_temp.fa" is looked up in the output directory, or in `temp`
        # when an explicit output file was requested
        if not out_file:
            seq = pysam.Fastafile(os.path.join(output,"rna_temp.fa"))
            fasta_f = os.path.join(output, "DBD_"+rna_name+".fa")
        else:
            seq = pysam.Fastafile(os.path.join(temp,"rna_temp.fa"))
            fasta_f = output+".fa"

        # NOTE: from here on `fasta` names the open file, not the flag
        with open(fasta_f, 'w') as fasta:
            for rbs in sig_region:
                print(">"+ rna_name +":"+str(rbs.initial)+"-"+str(rbs.final), file=fasta)
                s = seq.fetch(rbs.chrom, max(0, rbs.initial), rbs.final)
                # wrap the sequence at 80 characters per line
                for ss in [s[i:i + 80] for i in range(0, len(s), 80)]:
                    print(ss, file=fasta)
def chip_evaluate(self):
    """
    Evaluate the footprint predictions with motif-predicted binding sites (MPBSs)
    and TF ChIP-seq data.

    A method of type "SEG" is scored on the binding sites of self.tfbs_file:
    sites overlapping a predicted footprint get their score raised by
    (maximum site score + 1). A method of type "SC" is scored directly on its
    own regions. Methods of any other type are not scored.

    The statistics are written to self.output_location + self.tf_name + "_stats.txt",
    ROC / PR plots are produced when self.print_roc_curve / self.print_pr_curve
    are set, and the curve points are passed to self.output_points.

    return: None
    """
    # Per-method results, keyed by the position of the method in the lists
    fpr, tpr = {}, {}
    roc_auc, roc_auc_1, roc_auc_2 = {}, {}, {}
    recall, precision, prc_auc = {}, {}, {}

    if "SEG" in self.footprint_type:
        mpbs_regions = GenomicRegionSet("TFBS")
        mpbs_regions.read_bed(self.tfbs_file)
        mpbs_regions.sort()

        # Offset that lifts any site above every unmodified score
        max_score = -99999999
        for site in iter(mpbs_regions):
            max_score = max(max_score, int(site.data))
        max_score += 1

    for idx in range(len(self.footprint_file)):
        footprints = GenomicRegionSet("Footprints Prediction")
        footprints.read_bed(self.footprint_file[idx])
        # Sort footprint prediction bed files
        footprints.sort()

        method_type = self.footprint_type[idx]
        if method_type == "SEG":
            # Sites overlapping a footprint get their score raised once ...
            ranked = GenomicRegionSet("Increased Regions")
            for site in iter(mpbs_regions.intersect(footprints, mode=OverlapType.ORIGINAL)):
                site.data = str(int(site.data) + max_score)
                ranked.add(site)

            # ... all remaining sites keep their score
            for site in iter(mpbs_regions.subtract(footprints, whole_region=True)):
                ranked.add(site)
        elif method_type == "SC":
            ranked = footprints
        else:
            continue

        ranked.sort_score()
        fpr[idx], tpr[idx], roc_auc[idx], roc_auc_1[idx], roc_auc_2[idx] = self.roc_curve(ranked)
        recall[idx], precision[idx], prc_auc[idx] = self.precision_recall_curve(ranked)

    # Output the statistics results into text
    stats_fname = self.output_location + self.tf_name + "_stats.txt"
    stats_header = ["METHOD", "AUC_100", "AUC_10", "AUC_1", "AUPR"]
    with open(stats_fname, "w") as stats_file:
        stats_file.write("\t".join(stats_header) + "\n")
        for idx in range(len(self.footprint_name)):
            row = [self.footprint_name[idx], str(roc_auc[idx]), str(roc_auc_1[idx]),
                   str(roc_auc_2[idx]), str(prc_auc[idx])]
            stats_file.write("\t".join(row) + "\n")

    # Output the curves
    if self.print_roc_curve:
        self.plot_curve(fpr, tpr, roc_auc, "False Positive Rate", "True Positive Rate",
                        self.tf_name, "ROC")
    if self.print_pr_curve:
        self.plot_curve(recall, precision, prc_auc, "Recall", "Precision",
                        self.tf_name, "PRC")

    self.output_points(self.tf_name, fpr, tpr, recall, precision)
def gen_html(self, directory, parameters, obed, align=50, alpha=0.05, score=False):
    """Generate the HTML report pages of the genomic region test.

    Writes ``index.html``, ``dbd_region.html``, ``target_regions.html``,
    ``starget_regions.html``, ``region_dbs.html`` and ``parameters.html``
    into ``directory``, plus the BED file ``<obed>_target_regions.bed``.

    Side effects: ``self.dna_region`` is sorted, and the ``data`` field of
    every target region is overwritten with the tab-separated table values
    (DBS count, DBS coverage, optional score, sum of ranks) before the BED
    file is written.

    Args:
        directory: Output directory; its base name appears in the page title.
        parameters: Object holding the command line arguments shown on the
            parameters page (``rn``, ``r``, ``bed``, ``o``, ``organism``,
            ``n``, ``a``, ``ccf``, ``rt``, ``f``, ``ac``, ``accf``, ``obed``,
            ``showpa``).
        obed: Prefix of the exported BED file name.
        align: Alignment value forwarded to ``Html.add_zebra_table``.
        alpha: p-values below this threshold are printed in red.
        score: If true, the first tab-separated field of each region's
            ``data`` is parsed as a score and shown/ranked as an extra
            column. When the scores cannot be parsed, a message is printed
            and the pages are generated without the score column.
    """
    html_header = "Genomic Region Test: " + os.path.basename(directory)
    link_ds = OrderedDict()
    link_ds["RNA"] = "index.html"
    link_ds["Sig Target Regions"] = "starget_regions.html"
    link_ds["Target Regions"] = "target_regions.html"
    link_ds["Parameters"] = "parameters.html"

    def new_page():
        # Every page shares the same title, navigation links and styling.
        return Html(name=html_header, links_dict=link_ds, fig_rpath="../style",
                    RGT_header=False, other_logo="TDF", homepage="../index.html")

    def ucsc_link(region, style):
        # Anchor opening the UCSC genome browser at the given region.
        return ('<a href="http://genome.ucsc.edu/cgi-bin/hgTracks?db=' + self.organism +
                "&position=" + region.chrom + "%3A" + str(region.initial) + "-" +
                str(region.final) + '" style="' + style + '">' +
                region.toString(space=True) + '</a>')

    def red(text):
        return "<font color=\"red\">" + text + "</font>"

    # Shared by every table on every page.
    type_list = 'ssssssssssssssss'
    col_size_list = [50] * 15

    ##################################################
    # index.html
    html = new_page()
    html.add_figure("lineplot_region.png", align="left", width="45%",
                    more_images=["boxplot_regions.png"])

    thin = " style=\"border-right:1pt solid gray\""
    thick = " style=\"border-right:2pt solid gray\""
    if self.showdbs:
        html.add_figure("lineplot_dbs.png", align="left", width="45%",
                        more_images=["boxplot_dbs.png"])
        # NOTE(review): rows built below carry 12 cells when showdbs is set
        # (the z-score is always appended) but these headers describe 11
        # columns - confirm how add_zebra_table handles the mismatch.
        header_list = [["#", "DBD", "Target Regions", None, "Non-target Regions", None,
                        "Statistics", "Target Regions", "Non-target Regions", None, "Statistics"],
                       ["", "", "with DBS", "without DBS", "with DBS (average)", "s.d.",
                        "<i>p</i>-value", "NO. DBSs", "NO. DBSs (average)", "s.d.",
                        "<i>p</i>-value"]]
        header_titles = [["Rank", "DNA Binding Domain", "Given target regions on DNA", None,
                          "Regions from randomization", None,
                          "Statistics based on target regions",
                          "Given target regions on DNA", "Regions from randomization", None,
                          "Statistics based on DNA Binding Sites"],
                         ["", "", "Number of target regions with DBS binding",
                          "Number of target regions without DBS binding",
                          "Average number of regions from randomization with DBS binding",
                          "Standard deviation", "P value",
                          "Number of related DNA Binding Sites binding to target regions",
                          "Average number of DNA Binding Sites binding to random regions",
                          "Standard deviation", "P-value"]]
        border_list = [thin, thin, "", thin, "", thin, thick, thin, "", thin, thin]
    else:
        header_list = [["#", "DBD", "Target Regions", None, "Non-target Regions", None,
                        "Statistics", None],
                       ["", "", "with DBS", "without DBS", "with DBS (average)", "s.d.",
                        "<i>p</i>-value", "z-score"]]
        header_titles = [["Rank", "DNA Binding Domain", "Given target regions on DNA", None,
                          "Regions from randomization", None,
                          "Statistics based on target regions", None],
                         ["", "", "Number of target regions with DBS binding",
                          "Number of target regions without DBS binding",
                          "Average number of regions from randomization with DBS binding",
                          "Standard deviation", "P value", "Z-score"]]
        border_list = [thin, thin, "", thin, "", thin, thin, ""]

    region_stats = self.data["region"]
    data_table = []
    for i, rbs in enumerate(self.rbss):
        p_region = value2str(region_stats["p"][i])
        if region_stats["p"][i] < alpha:
            p_region = red(p_region)
        zs = (self.counts_tr[rbs][0] - region_stats["ave"][i]) / region_stats["sd"][i]
        new_line = [str(i + 1),
                    rbs.str_rna(pa=False),
                    '<a href="dbd_region.html#' + rbs.str_rna() +
                    '" style="text-align:left">' + str(self.counts_tr[rbs][0]) + '</a>',
                    str(self.counts_tr[rbs][1]),
                    value2str(region_stats["ave"][i]),
                    value2str(region_stats["sd"][i]),
                    p_region,
                    value2str(zs)]
        if self.showdbs:
            dbs_stats = self.data["dbs"]
            p_dbs = value2str(dbs_stats["p"][i])
            if dbs_stats["p"][i] < alpha:
                p_dbs = red(p_dbs)
            new_line += [str(self.counts_dbs[rbs]),
                         value2str(dbs_stats["ave"][i]),
                         value2str(dbs_stats["sd"][i]),
                         p_dbs]
        data_table.append(new_line)

    # Order the rows by the region p-value column.
    data_table = natsort.natsorted(data_table, key=lambda x: x[6])
    html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align,
                         cell_align="left", auto_width=True, header_titles=header_titles,
                         border_list=border_list, sortable=True)
    html.add_heading("Notes")
    html.add_list(["RNA name: " + self.rna_name,
                   "Randomization is performed for " + str(self.repeats) + " times.",
                   "DBD stands for DNA Binding Domain on RNA.",
                   "DBS stands for DNA Binding Site on DNA."])
    html.add_fixed_rank_sortable()
    html.write(os.path.join(directory, "index.html"))

    #############################################################
    # dbd_region.html: target regions of each merged DNA Binding Domain
    header_list = ["#", "Target Region", "Associated Gene", "No. of DBSs", "DBS coverage"]
    header_titles = ["Rank", "Given target regions from BED files",
                     "Associated genes which is overlapping with the given region or close to it (less than 50000 bp)",
                     "Number of DNA Binding Sites locate within the region",
                     "The proportion of the region covered by DBS binding"]
    html = new_page()
    for rbsm in self.rbss:
        html.add_heading("DNA Binding Domain: " + rbsm.str_rna(), idtag=rbsm.str_rna())
        data_table = []
        for i, region in enumerate(self.txp.merged_dict[rbsm]):
            data_table.append([str(i + 1),
                               ucsc_link(region, "text-align:left"),
                               split_gene_name(gene_name=region.name, org=self.organism),
                               str(len(self.region_dbs[region.toString()])),
                               value2str(self.region_coverage[region.toString()])])
        html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align,
                             cell_align="left", auto_width=True, header_titles=header_titles,
                             sortable=True)
    html.add_fixed_rank_sortable()
    html.write(os.path.join(directory, "dbd_region.html"))

    #############################################################
    # target_regions.html
    html = new_page()
    if not self.dna_region.sorted:
        self.dna_region.sort()

    # Parse the BED scores once, before region.data is overwritten below.
    # float() raises ValueError and a missing data field AttributeError; on
    # either, fall back to the score-less layout instead of crashing later.
    score_list = []
    if score:
        try:
            score_list = [float(p.data.split("\t")[0]) for p in self.dna_region]
        except (AttributeError, ValueError):
            print("There is no score in BED file, please don't use '-score' argument.")
            score = False

    # These headers are reused by the significant-target page further down.
    header_list = ["#", "Target region", "Associated Gene", "DBSs Count", "DBS coverage"]
    header_titles = ["Rank", "Target regions loaded from the given BED file",
                     "Associated genes which is overlapping with the given region or close to it (less than 50000 bp)",
                     "Number of DNA Binding Sites within the region",
                     "The proportion of the region covered by DBS binding"]
    if score:
        header_list.append("Score")
        header_titles.append("Scores from BED file")
    header_list.append("Sum of ranks")
    header_titles.append("Sum of all the left-hand-side ranks")

    html.add_heading("Target Regions")

    # Calculate the ranking
    rank_count = len(self.dna_region) - rank_array(
        [len(self.region_dbs[p.toString()]) for p in self.dna_region])
    rank_coverage = len(self.dna_region) - rank_array(
        [self.region_coverage[p.toString()] for p in self.dna_region])
    if score:
        rank_score = len(self.dna_region) - rank_array([abs(s) for s in score_list])
        rank_sum = [x + y + z for x, y, z in zip(rank_count, rank_coverage, rank_score)]
    else:
        rank_sum = [x + y for x, y in zip(rank_count, rank_coverage)]

    bed_scores = {}  # region.toString() -> score parsed from the BED file
    data_table = []
    for i, region in enumerate(self.dna_region):
        region_key = region.toString()
        dbs_counts = str(len(self.region_dbs[region_key]))
        dbs_cover = value2str(self.region_coverage[region_key])
        newline = [str(i + 1),
                   ucsc_link(region, "text-align:left"),
                   split_gene_name(gene_name=region.name, org=self.organism),
                   '<a href="region_dbs.html#' + region_key +
                   '" style="text-align:left">' + dbs_counts + '</a>',
                   dbs_cover]
        if score:
            bed_scores[region_key] = score_list[i]
            dbs_score = value2str(score_list[i])
            region.data = "\t".join([dbs_counts, dbs_cover, dbs_score, str(rank_sum[i])])
            newline.append(dbs_score)
        else:
            region.data = "\t".join([dbs_counts, dbs_cover, str(rank_sum[i])])
        newline.append(str(rank_sum[i]))
        data_table.append(newline)

    data_table = natsort.natsorted(data_table, key=lambda x: x[-1])
    html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align,
                         cell_align="left", auto_width=True, header_titles=header_titles,
                         sortable=True)
    html.add_heading("Notes")
    html.add_list(["All target regions without any bindings are ignored."])
    html.add_fixed_rank_sortable()
    html.write(os.path.join(directory, "target_regions.html"))

    self.dna_region.sort_score()
    self.dna_region.write_bed(os.path.join(directory, obed + "_target_regions.bed"))

    #############################################################
    # starget_regions.html for significant target regions
    stargets = GenomicRegionSet("sig_targets")
    sig_dbs = {}
    sig_dbs_coverage = {}
    for r in self.dna_region:
        sig_bindings = self.region_dbs[r.toString()].overlap_rbss(
            rbss=self.data["region"]["sig_region"])
        dbs = sig_bindings.get_dbs()
        if len(dbs) > 0:
            stargets.add(r)
            m_dbs = dbs.merge(w_return=True)
            sig_dbs[r] = len(dbs)
            sig_dbs_coverage[r] = float(m_dbs.total_coverage()) / len(r)

    html = new_page()
    if len(self.data["region"]["sig_region"]) == 0:
        html.add_heading("There is no significant DBD.")
    else:
        html.add_heading("Target regions bound by significant DBD")
        # Calculate the ranking
        rank_count = len(stargets) - rank_array([sig_dbs[p] for p in stargets])
        rank_coverage = len(stargets) - rank_array([sig_dbs_coverage[p] for p in stargets])
        if score:
            # region.data no longer starts with the BED score (it was replaced
            # by the table values above), so use the scores remembered earlier.
            score_list = [bed_scores[p.toString()] for p in stargets]
            rank_score = len(stargets) - rank_array([abs(s) for s in score_list])
            rank_sum = [x + y + z for x, y, z in zip(rank_count, rank_coverage, rank_score)]
        else:
            rank_sum = [x + y for x, y in zip(rank_count, rank_coverage)]

        data_table = []
        for i, region in enumerate(stargets):
            dbs_count_link = ('<a href="region_dbs.html#' + region.toString() +
                              '" style="text-align:left">' + str(sig_dbs[region]) + '</a>')
            newline = [str(i + 1),
                       region_link_internet(self.organism, region),
                       split_gene_name(gene_name=region.name, org=self.organism),
                       dbs_count_link,
                       value2str(sig_dbs_coverage[region])]
            if score:
                newline.append(value2str(score_list[i]))
            newline.append(str(rank_sum[i]))
            data_table.append(newline)

        data_table = natsort.natsorted(data_table, key=lambda x: x[-1])
        html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align,
                             cell_align="left", header_titles=header_titles,
                             border_list=None, sortable=True)
        html.add_heading("Notes")
        html.add_list(["DBS stands for DNA Binding Site on DNA.",
                       "DBS coverage is the proportion of the region where has potential to form triple helices with the given RNA."])
        html.add_fixed_rank_sortable()
    html.write(os.path.join(directory, "starget_regions.html"))

    #############################################################
    # region_dbs.html: binding sites of every target region
    header_list = ["RBS", "DBS", "Strand", "Score", "Motif", "Orientation"]
    html = new_page()
    for region in self.dna_region:
        bindings = self.region_dbs[region.toString()]
        if len(bindings) == 0:
            continue
        html.add_heading("Associated gene: " +
                         split_gene_name(gene_name=region.name, org=self.organism),
                         idtag=region.toString())
        html.add_free_content([ucsc_link(region, "margin-left:50")])
        data_table = []
        for rd in bindings:
            rbs = rd.rna.str_rna(pa=False)
            # Highlight binding sites lying in a significant DBD.
            for rbsm in self.data["region"]["sig_region"]:
                if rd.rna.overlap(rbsm):
                    rbs = red(rbs)
            data_table.append([rbs,
                               ucsc_link(rd.dna, "text-align:left"),
                               rd.dna.orientation, rd.score, rd.motif, rd.orient])
        html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align,
                             cell_align="left", auto_width=True)
    html.write(os.path.join(directory, "region_dbs.html"))

    #############################################################
    # parameters.html
    html = new_page()
    html.add_heading("Parameters")
    header_list = ["Description", "Arguments", "Value"]
    data_table = [["RNA sequence name", "-rn", parameters.rn],
                  ["Input RNA sequence file", "-r", os.path.basename(parameters.r)],
                  ["Input BED file", "-bed", os.path.basename(parameters.bed)],
                  ["Output directory", "-o", os.path.basename(parameters.o)],
                  ["Organism", "-organism", parameters.organism],
                  ["Number of repitetion of andomization", "-n", str(parameters.n)],
                  ["Alpha level for rejection p value", "-a", str(parameters.a)],
                  ["Cut off value for filtering out the low counts of DBSs", "-ccf",
                   str(parameters.ccf)],
                  ["Remove temporary files", "-rt", str(parameters.rt)],
                  ["Input BED file for masking in randomization", "-f", str(parameters.f)],
                  ["Input file for RNA accecibility", "-ac", str(parameters.ac)],
                  ["Cut off value for RNA accecibility", "-accf", str(parameters.accf)],
                  ["Output the BED files for DNA binding sites.", "-obed",
                   str(parameters.obed)],
                  ["Show parallel and antiparallel bindings in the plot separately.",
                   "-showpa", str(parameters.showpa)],
                  ["Minimum length", "-l", str(self.triplexator_p[0])],
                  ["Maximum error rate", "-e", str(self.triplexator_p[1])],
                  ["Tolerated number of consecutive errors", "-c", str(self.triplexator_p[2])],
                  ["Filtering repeats", "-fr", str(self.triplexator_p[3])],
                  ["Filtering mode", "-fm", str(self.triplexator_p[4])],
                  ["Output format", "-of", str(self.triplexator_p[5])],
                  ["Merge features", "-mf", str(self.triplexator_p[6])]]
    html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align,
                         cell_align="left", auto_width=True)
    html.add_free_content(['<a href="summary.txt" style="margin-left:100">See details</a>'])
    html.write(os.path.join(directory, "parameters.html"))
def gen_html(self, directory, parameters, obed, align=50, alpha=0.05, score=False):
    """Generate the HTML report pages of the genomic region test.

    Writes ``index.html``, ``dbd_region.html``, ``target_regions.html``,
    ``starget_regions.html``, ``region_dbs.html`` and ``parameters.html``
    into ``directory``, plus the BED file ``<obed>_target_regions.bed``.

    Side effects: ``self.dna_region`` is sorted, and the ``data`` field of
    every target region is overwritten with the tab-separated table values
    (DBS count, DBS coverage, optional score, sum of ranks) before the BED
    file is written.

    Args:
        directory: Output directory; its base name appears in the page title.
        parameters: Object holding the command line arguments shown on the
            parameters page (``rn``, ``r``, ``bed``, ``o``, ``organism``,
            ``n``, ``a``, ``ccf``, ``rt``, ``f``, ``ac``, ``accf``, ``obed``,
            ``showpa``).
        obed: Prefix of the exported BED file name.
        align: Alignment value forwarded to ``Html.add_zebra_table``.
        alpha: p-values below this threshold are printed in red.
        score: If true, the first tab-separated field of each region's
            ``data`` is parsed as a score and shown/ranked as an extra
            column. When the scores cannot be parsed, a message is printed
            and the pages are generated without the score column.
    """
    html_header = "Genomic Region Test: " + os.path.basename(directory)
    link_ds = OrderedDict()
    link_ds["RNA"] = "index.html"
    link_ds["Sig Target Regions"] = "starget_regions.html"
    link_ds["Target Regions"] = "target_regions.html"
    link_ds["Parameters"] = "parameters.html"

    def new_page():
        # Every page shares the same title, navigation links and styling.
        return Html(name=html_header, links_dict=link_ds, fig_rpath="../style",
                    RGT_header=False, other_logo="TDF", homepage="../index.html")

    def ucsc_link(region, style):
        # Anchor opening the UCSC genome browser at the given region.
        return ('<a href="http://genome.ucsc.edu/cgi-bin/hgTracks?db=' + self.organism +
                "&position=" + region.chrom + "%3A" + str(region.initial) + "-" +
                str(region.final) + '" style="' + style + '">' +
                region.toString(space=True) + '</a>')

    def red(text):
        return "<font color=\"red\">" + text + "</font>"

    # Shared by every table on every page.
    type_list = 'ssssssssssssssss'
    col_size_list = [50] * 15

    ##################################################
    # index.html
    html = new_page()
    html.add_figure("lineplot_region.png", align="left", width="45%",
                    more_images=["boxplot_regions.png"])

    thin = " style=\"border-right:1pt solid gray\""
    thick = " style=\"border-right:2pt solid gray\""
    if self.showdbs:
        html.add_figure("lineplot_dbs.png", align="left", width="45%",
                        more_images=["boxplot_dbs.png"])
        # NOTE(review): rows built below carry 12 cells when showdbs is set
        # (the z-score is always appended) but these headers describe 11
        # columns - confirm how add_zebra_table handles the mismatch.
        header_list = [["#", "DBD", "Target Regions", None, "Non-target Regions", None,
                        "Statistics", "Target Regions", "Non-target Regions", None, "Statistics"],
                       ["", "", "with DBS", "without DBS", "with DBS (average)", "s.d.",
                        "<i>p</i>-value", "NO. DBSs", "NO. DBSs (average)", "s.d.",
                        "<i>p</i>-value"]]
        header_titles = [["Rank", "DNA Binding Domain", "Given target regions on DNA", None,
                          "Regions from randomization", None,
                          "Statistics based on target regions",
                          "Given target regions on DNA", "Regions from randomization", None,
                          "Statistics based on DNA Binding Sites"],
                         ["", "", "Number of target regions with DBS binding",
                          "Number of target regions without DBS binding",
                          "Average number of regions from randomization with DBS binding",
                          "Standard deviation", "P value",
                          "Number of related DNA Binding Sites binding to target regions",
                          "Average number of DNA Binding Sites binding to random regions",
                          "Standard deviation", "P-value"]]
        border_list = [thin, thin, "", thin, "", thin, thick, thin, "", thin, thin]
    else:
        header_list = [["#", "DBD", "Target Regions", None, "Non-target Regions", None,
                        "Statistics", None],
                       ["", "", "with DBS", "without DBS", "with DBS (average)", "s.d.",
                        "<i>p</i>-value", "z-score"]]
        header_titles = [["Rank", "DNA Binding Domain", "Given target regions on DNA", None,
                          "Regions from randomization", None,
                          "Statistics based on target regions", None],
                         ["", "", "Number of target regions with DBS binding",
                          "Number of target regions without DBS binding",
                          "Average number of regions from randomization with DBS binding",
                          "Standard deviation", "P value", "Z-score"]]
        border_list = [thin, thin, "", thin, "", thin, thin, ""]

    region_stats = self.data["region"]
    data_table = []
    for i, rbs in enumerate(self.rbss):
        p_region = value2str(region_stats["p"][i])
        if region_stats["p"][i] < alpha:
            p_region = red(p_region)
        zs = (self.counts_tr[rbs][0] - region_stats["ave"][i]) / region_stats["sd"][i]
        new_line = [str(i + 1),
                    rbs.str_rna(pa=False),
                    '<a href="dbd_region.html#' + rbs.str_rna() +
                    '" style="text-align:left">' + str(self.counts_tr[rbs][0]) + '</a>',
                    str(self.counts_tr[rbs][1]),
                    value2str(region_stats["ave"][i]),
                    value2str(region_stats["sd"][i]),
                    p_region,
                    value2str(zs)]
        if self.showdbs:
            dbs_stats = self.data["dbs"]
            p_dbs = value2str(dbs_stats["p"][i])
            if dbs_stats["p"][i] < alpha:
                p_dbs = red(p_dbs)
            new_line += [str(self.counts_dbs[rbs]),
                         value2str(dbs_stats["ave"][i]),
                         value2str(dbs_stats["sd"][i]),
                         p_dbs]
        data_table.append(new_line)

    # Order the rows by the region p-value column.
    data_table = natsort.natsorted(data_table, key=lambda x: x[6])
    html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align,
                         cell_align="left", auto_width=True, header_titles=header_titles,
                         border_list=border_list, sortable=True)
    html.add_heading("Notes")
    html.add_list(["RNA name: " + self.rna_name,
                   "Randomization is performed for " + str(self.repeats) + " times.",
                   "DBD stands for DNA Binding Domain on RNA.",
                   "DBS stands for DNA Binding Site on DNA."])
    html.add_fixed_rank_sortable()
    html.write(os.path.join(directory, "index.html"))

    #############################################################
    # dbd_region.html: target regions of each merged DNA Binding Domain
    header_list = ["#", "Target Region", "Associated Gene", "No. of DBSs", "DBS coverage"]
    header_titles = ["Rank", "Given target regions from BED files",
                     "Associated genes which is overlapping with the given region or close to it (less than 50000 bp)",
                     "Number of DNA Binding Sites locate within the region",
                     "The proportion of the region covered by DBS binding"]
    html = new_page()
    for rbsm in self.rbss:
        html.add_heading("DNA Binding Domain: " + rbsm.str_rna(), idtag=rbsm.str_rna())
        data_table = []
        for i, region in enumerate(self.txp.merged_dict[rbsm]):
            data_table.append([str(i + 1),
                               ucsc_link(region, "text-align:left"),
                               split_gene_name(gene_name=region.name, org=self.organism),
                               str(len(self.region_dbs[region.toString()])),
                               value2str(self.region_coverage[region.toString()])])
        html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align,
                             cell_align="left", auto_width=True, header_titles=header_titles,
                             sortable=True)
    html.add_fixed_rank_sortable()
    html.write(os.path.join(directory, "dbd_region.html"))

    #############################################################
    # target_regions.html
    html = new_page()
    if not self.dna_region.sorted:
        self.dna_region.sort()

    # Parse the BED scores once, before region.data is overwritten below.
    # float() raises ValueError and a missing data field AttributeError; on
    # either, fall back to the score-less layout instead of crashing later.
    score_list = []
    if score:
        try:
            score_list = [float(p.data.split("\t")[0]) for p in self.dna_region]
        except (AttributeError, ValueError):
            print("There is no score in BED file, please don't use '-score' argument.")
            score = False

    # These headers are reused by the significant-target page further down.
    header_list = ["#", "Target region", "Associated Gene", "DBSs Count", "DBS coverage"]
    header_titles = ["Rank", "Target regions loaded from the given BED file",
                     "Associated genes which is overlapping with the given region or close to it (less than 50000 bp)",
                     "Number of DNA Binding Sites within the region",
                     "The proportion of the region covered by DBS binding"]
    if score:
        header_list.append("Score")
        header_titles.append("Scores from BED file")
    header_list.append("Sum of ranks")
    header_titles.append("Sum of all the left-hand-side ranks")

    html.add_heading("Target Regions")

    # Calculate the ranking
    rank_count = len(self.dna_region) - rank_array(
        [len(self.region_dbs[p.toString()]) for p in self.dna_region])
    rank_coverage = len(self.dna_region) - rank_array(
        [self.region_coverage[p.toString()] for p in self.dna_region])
    if score:
        rank_score = len(self.dna_region) - rank_array([abs(s) for s in score_list])
        rank_sum = [x + y + z for x, y, z in zip(rank_count, rank_coverage, rank_score)]
    else:
        rank_sum = [x + y for x, y in zip(rank_count, rank_coverage)]

    bed_scores = {}  # region.toString() -> score parsed from the BED file
    data_table = []
    for i, region in enumerate(self.dna_region):
        region_key = region.toString()
        dbs_counts = str(len(self.region_dbs[region_key]))
        dbs_cover = value2str(self.region_coverage[region_key])
        newline = [str(i + 1),
                   ucsc_link(region, "text-align:left"),
                   split_gene_name(gene_name=region.name, org=self.organism),
                   '<a href="region_dbs.html#' + region_key +
                   '" style="text-align:left">' + dbs_counts + '</a>',
                   dbs_cover]
        if score:
            bed_scores[region_key] = score_list[i]
            dbs_score = value2str(score_list[i])
            region.data = "\t".join([dbs_counts, dbs_cover, dbs_score, str(rank_sum[i])])
            newline.append(dbs_score)
        else:
            region.data = "\t".join([dbs_counts, dbs_cover, str(rank_sum[i])])
        newline.append(str(rank_sum[i]))
        data_table.append(newline)

    data_table = natsort.natsorted(data_table, key=lambda x: x[-1])
    html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align,
                         cell_align="left", auto_width=True, header_titles=header_titles,
                         sortable=True)
    html.add_heading("Notes")
    html.add_list(["All target regions without any bindings are ignored."])
    html.add_fixed_rank_sortable()
    html.write(os.path.join(directory, "target_regions.html"))

    self.dna_region.sort_score()
    self.dna_region.write_bed(os.path.join(directory, obed + "_target_regions.bed"))

    #############################################################
    # starget_regions.html for significant target regions
    stargets = GenomicRegionSet("sig_targets")
    sig_dbs = {}
    sig_dbs_coverage = {}
    for r in self.dna_region:
        sig_bindings = self.region_dbs[r.toString()].overlap_rbss(
            rbss=self.data["region"]["sig_region"])
        dbs = sig_bindings.get_dbs()
        if len(dbs) > 0:
            stargets.add(r)
            m_dbs = dbs.merge(w_return=True)
            sig_dbs[r] = len(dbs)
            sig_dbs_coverage[r] = float(m_dbs.total_coverage()) / len(r)

    html = new_page()
    if len(self.data["region"]["sig_region"]) == 0:
        html.add_heading("There is no significant DBD.")
    else:
        html.add_heading("Target regions bound by significant DBD")
        # Calculate the ranking
        rank_count = len(stargets) - rank_array([sig_dbs[p] for p in stargets])
        rank_coverage = len(stargets) - rank_array([sig_dbs_coverage[p] for p in stargets])
        if score:
            # region.data no longer starts with the BED score (it was replaced
            # by the table values above), so use the scores remembered earlier.
            score_list = [bed_scores[p.toString()] for p in stargets]
            rank_score = len(stargets) - rank_array([abs(s) for s in score_list])
            rank_sum = [x + y + z for x, y, z in zip(rank_count, rank_coverage, rank_score)]
        else:
            rank_sum = [x + y for x, y in zip(rank_count, rank_coverage)]

        data_table = []
        for i, region in enumerate(stargets):
            dbs_count_link = ('<a href="region_dbs.html#' + region.toString() +
                              '" style="text-align:left">' + str(sig_dbs[region]) + '</a>')
            newline = [str(i + 1),
                       region_link_internet(self.organism, region),
                       split_gene_name(gene_name=region.name, org=self.organism),
                       dbs_count_link,
                       value2str(sig_dbs_coverage[region])]
            if score:
                newline.append(value2str(score_list[i]))
            newline.append(str(rank_sum[i]))
            data_table.append(newline)

        data_table = natsort.natsorted(data_table, key=lambda x: x[-1])
        html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align,
                             cell_align="left", header_titles=header_titles,
                             border_list=None, sortable=True)
        html.add_heading("Notes")
        html.add_list(["DBS stands for DNA Binding Site on DNA.",
                       "DBS coverage is the proportion of the region where has potential to form triple helices with the given RNA."])
        html.add_fixed_rank_sortable()
    html.write(os.path.join(directory, "starget_regions.html"))

    #############################################################
    # region_dbs.html: binding sites of every target region
    header_list = ["RBS", "DBS", "Strand", "Score", "Motif", "Orientation"]
    html = new_page()
    for region in self.dna_region:
        bindings = self.region_dbs[region.toString()]
        if len(bindings) == 0:
            continue
        html.add_heading("Associated gene: " +
                         split_gene_name(gene_name=region.name, org=self.organism),
                         idtag=region.toString())
        html.add_free_content([ucsc_link(region, "margin-left:50")])
        data_table = []
        for rd in bindings:
            rbs = rd.rna.str_rna(pa=False)
            # Highlight binding sites lying in a significant DBD.
            for rbsm in self.data["region"]["sig_region"]:
                if rd.rna.overlap(rbsm):
                    rbs = red(rbs)
            data_table.append([rbs,
                               ucsc_link(rd.dna, "text-align:left"),
                               rd.dna.orientation, rd.score, rd.motif, rd.orient])
        html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align,
                             cell_align="left", auto_width=True)
    html.write(os.path.join(directory, "region_dbs.html"))

    #############################################################
    # parameters.html
    html = new_page()
    html.add_heading("Parameters")
    header_list = ["Description", "Arguments", "Value"]
    data_table = [["RNA sequence name", "-rn", parameters.rn],
                  ["Input RNA sequence file", "-r", os.path.basename(parameters.r)],
                  ["Input BED file", "-bed", os.path.basename(parameters.bed)],
                  ["Output directory", "-o", os.path.basename(parameters.o)],
                  ["Organism", "-organism", parameters.organism],
                  ["Number of repitetion of andomization", "-n", str(parameters.n)],
                  ["Alpha level for rejection p value", "-a", str(parameters.a)],
                  ["Cut off value for filtering out the low counts of DBSs", "-ccf",
                   str(parameters.ccf)],
                  ["Remove temporary files", "-rt", str(parameters.rt)],
                  ["Input BED file for masking in randomization", "-f", str(parameters.f)],
                  ["Input file for RNA accecibility", "-ac", str(parameters.ac)],
                  ["Cut off value for RNA accecibility", "-accf", str(parameters.accf)],
                  ["Output the BED files for DNA binding sites.", "-obed",
                   str(parameters.obed)],
                  ["Show parallel and antiparallel bindings in the plot separately.",
                   "-showpa", str(parameters.showpa)],
                  ["Minimum length", "-l", str(self.triplexator_p[0])],
                  ["Maximum error rate", "-e", str(self.triplexator_p[1])],
                  ["Tolerated number of consecutive errors", "-c", str(self.triplexator_p[2])],
                  ["Filtering repeats", "-fr", str(self.triplexator_p[3])],
                  ["Filtering mode", "-fm", str(self.triplexator_p[4])],
                  ["Output format", "-of", str(self.triplexator_p[5])],
                  ["Merge features", "-mf", str(self.triplexator_p[6])]]
    html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align,
                         cell_align="left", auto_width=True)
    html.add_free_content(['<a href="summary.txt" style="margin-left:100">See details</a>'])
    html.write(os.path.join(directory, "parameters.html"))
def chip_evaluate(self):
    """
    Score footprint predictions against motif-predicted binding sites (MPBSs)
    and write the resulting statistics.

    Each entry of self.footprint_file is evaluated according to the matching
    entry of self.footprint_type:
    "SEG" -- the MPBSs from self.tfbs_file are used as the ranked list; every
             MPBS overlapping a predicted footprint has its score raised by
             (maximum MPBS score + 1), the others keep their score.
    "SC"  -- the prediction file itself is used as the ranked list.
    Entries of any other type are read and sorted but not scored.

    A tab-separated "<tf_name>_stats.txt" file is written under
    self.output_location, ROC / PR curves are plotted when requested and the
    curve points are handed to self.output_points.

    return:
    None
    """

    # Result containers, keyed by the index of the evaluated footprint file
    false_positive_rate, true_positive_rate = {}, {}
    auc, auc_first, auc_second = {}, {}, {}
    recall_points, precision_points, aupr = {}, {}, {}

    if "SEG" in self.footprint_type:
        mpbs_regions = GenomicRegionSet("TFBS")
        mpbs_regions.read_bed(self.tfbs_file)
        mpbs_regions.sort()

        # Offset that is strictly larger than every score in the MPBS file
        max_score = -99999999
        for mpbs in iter(mpbs_regions):
            max_score = max(max_score, int(mpbs.data))
        max_score += 1

    for idx, footprint_path in enumerate(self.footprint_file):
        footprints_regions = GenomicRegionSet("Footprints Prediction")
        footprints_regions.read_bed(footprint_path)
        footprints_regions.sort()

        method_type = self.footprint_type[idx]
        if method_type == "SEG":
            ranked_regions = GenomicRegionSet("Increased Regions")

            # MPBSs supported by a footprint are pushed above all others
            supported = mpbs_regions.intersect(footprints_regions,
                                               mode=OverlapType.ORIGINAL)
            for hit in iter(supported):
                hit.data = str(int(hit.data) + max_score)
                ranked_regions.add(hit)

            # MPBSs without any footprint keep their original score
            unsupported = mpbs_regions.subtract(footprints_regions,
                                                whole_region=True)
            for miss in iter(unsupported):
                ranked_regions.add(miss)

            ranked_regions.sort_score()
        elif method_type == "SC":
            footprints_regions.sort_score()
            ranked_regions = footprints_regions
        else:
            # Unknown footprint type: nothing is scored for this entry
            continue

        (false_positive_rate[idx], true_positive_rate[idx], auc[idx],
         auc_first[idx], auc_second[idx]) = self.roc_curve(ranked_regions)
        (recall_points[idx], precision_points[idx],
         aupr[idx]) = self.precision_recall_curve(ranked_regions)

    # Statistics table, one row per footprint method
    stats_fname = self.output_location + self.tf_name + "_stats.txt"
    stats_header = ["METHOD", "AUC_100", "AUC_10", "AUC_1", "AUPR"]
    with open(stats_fname, "w") as stats_file:
        stats_file.write("\t".join(stats_header) + "\n")
        for idx, method_name in enumerate(self.footprint_name):
            row = [method_name, str(auc[idx]), str(auc_first[idx]),
                   str(auc_second[idx]), str(aupr[idx])]
            stats_file.write("\t".join(row) + "\n")

    # Optional curve plots
    if self.print_roc_curve:
        self.plot_curve(false_positive_rate, true_positive_rate, auc,
                        "False Positive Rate", "True Positive Rate",
                        self.tf_name, "ROC")
    if self.print_pr_curve:
        self.plot_curve(recall_points, precision_points, aupr,
                        "Recall", "Precision", self.tf_name, "PRC")

    self.output_points(self.tf_name, false_positive_rate, true_positive_rate,
                       recall_points, precision_points)
def chip_evaluate(args):
    """
    Score footprint predictions against motif-predicted binding sites (MPBSs)
    and write the resulting statistics.

    Keyword arguments:
    args -- Object providing footprint_file, footprint_name and footprint_type
            (comma-separated strings, one entry per method), tfbs_file,
            output_location, output_prefix, print_roc_curve and print_pr_curve.

    For a "SEG" method the MPBSs of args.tfbs_file form the ranked list, with
    every MPBS overlapping a predicted footprint raised by (maximum MPBS
    score + 1). For a "SC" method the prediction file itself is the ranked
    list. Entries of any other type are read and sorted but not scored.

    Return:
    None. Writes "<output_prefix>_stats.txt" into args.output_location, plots
    the requested curves and passes the curve points to output_points.
    """
    # Curve points, keyed by the index of the evaluated method
    fpr, tpr = {}, {}
    recall, precision = {}, {}
    # Areas under the ROC and PR curves; the suffix matches the stats header
    roc_auc_1, roc_auc_10, roc_auc_50, roc_auc_100 = {}, {}, {}, {}
    prc_auc_1, prc_auc_10, prc_auc_50, prc_auc_100 = {}, {}, {}, {}

    prediction_files = args.footprint_file.split(",")
    method_names = args.footprint_name.split(",")
    method_types = args.footprint_type.split(",")

    max_score = 0
    if "SEG" in method_types:
        mpbs_regions = GenomicRegionSet("TFBS")
        mpbs_regions.read(args.tfbs_file)

        # Offset that is strictly larger than every score in the MPBS file
        for mpbs in iter(mpbs_regions):
            max_score = max(max_score, int(mpbs.data.split("\t")[0]))
        max_score += 1

    for idx, prediction_path in enumerate(prediction_files):
        footprints_regions = GenomicRegionSet("Footprints Prediction")
        footprints_regions.read(prediction_path)
        footprints_regions.sort()

        method_type = method_types[idx]
        if method_type == "SEG":
            ranked_regions = GenomicRegionSet("Increased Regions")

            # MPBSs supported by a footprint are pushed above all others
            supported = mpbs_regions.intersect(footprints_regions,
                                               mode=OverlapType.ORIGINAL)
            for hit in iter(supported):
                hit.data = str(int(hit.data.split("\t")[0]) + max_score)
                ranked_regions.add(hit)

            # MPBSs without any footprint keep their original score
            unsupported = mpbs_regions.subtract(footprints_regions,
                                                whole_region=True)
            for miss in iter(unsupported):
                ranked_regions.add(miss)

            ranked_regions.sort_score()
        elif method_type == "SC":
            footprints_regions.sort_score()
            ranked_regions = footprints_regions
        else:
            # Unknown footprint type: nothing is scored for this entry
            continue

        (fpr[idx], tpr[idx], roc_auc_1[idx], roc_auc_10[idx],
         roc_auc_50[idx], roc_auc_100[idx]) = roc_curve(ranked_regions)
        (recall[idx], precision[idx], prc_auc_1[idx], prc_auc_10[idx],
         prc_auc_50[idx], prc_auc_100[idx]) = precision_recall_curve(ranked_regions)

    # Statistics table, one row per footprint method
    stats_fname = os.path.join(args.output_location,
                               "{}_stats.txt".format(args.output_prefix))
    stats_header = ["METHOD", "AUC_100", "AUC_50", "AUC_10", "AUC_1",
                    "AUPR_100", "AUPR_50", "AUPR_10", "AUPR_1"]
    # Column order must follow stats_header
    stat_columns = (roc_auc_100, roc_auc_50, roc_auc_10, roc_auc_1,
                    prc_auc_100, prc_auc_50, prc_auc_10, prc_auc_1)
    with open(stats_fname, "w") as stats_file:
        stats_file.write("\t".join(stats_header) + "\n")
        for idx, method_name in enumerate(method_names):
            row = [method_name] + [str(column[idx]) for column in stat_columns]
            stats_file.write("\t".join(row) + "\n")

    # Optional curve plots
    if args.print_roc_curve:
        plot_curve(method_names, args.output_location, fpr, tpr, roc_auc_100,
                   "False Positive Rate", "True Positive Rate",
                   args.output_prefix, "ROC")
    if args.print_pr_curve:
        plot_curve(method_names, args.output_location, recall, precision,
                   prc_auc_100, "Recall", "Precision",
                   args.output_prefix, "PRC")

    output_points(method_names, args.output_location, args.output_prefix,
                  fpr, tpr, recall, precision)
def initialize(name, genome_path, regions, stepsize, binsize, bam_file_1, bam_file_2, ext_1, ext_2, \
               input_1, input_factor_1, ext_input_1, input_2, input_factor_2, ext_input_2, chrom_sizes, verbose, norm_strategy, no_gc_content, deadzones,\
               factor_input_1, factor_input_2, debug, tracker):
    """
    Build the search regions, determine the read extension sizes and create
    the DualCoverageSet for the two BAM files.

    Keyword arguments:
    name -- Name of the region set and of the DualCoverageSet; also the prefix
            of the '-read-ext-*' files written when verbose is set.
    regions -- Path to a tab-separated file (chrom, start, end) restricting the
               search, or None to use every chromosome listed in chrom_sizes.
    chrom_sizes -- Path to a tab-separated file (chrom, size).
    bam_file_1, bam_file_2 -- Signal files; passed to get_extension_size when
                              the matching ext_* is None.
    ext_1, ext_2 -- Read extension sizes, or None to have them computed.
    input_1, input_2 -- Control files or None.
    ext_input_1, ext_input_2 -- Extension sizes of the controls, or None.
    verbose -- If true, the values returned by get_extension_size are written
               to '<name>-read-ext-*' files.
    tracker -- Object whose write(text=..., header=...) records the extension
               sizes.
    genome_path, stepsize, binsize, input_factor_1, input_factor_2,
    norm_strategy, no_gc_content, deadzones, factor_input_1, factor_input_2,
    debug -- Forwarded unchanged to DualCoverageSet.

    Return:
    (cov_cdp_mpp, [ext_1, ext_2]) -- The DualCoverageSet and the extension
    sizes used for the two signal files.
    """
    regionset = GenomicRegionSet(name)
    chrom_sizes_dict = {}
    #if regions option is set, take the values, otherwise the whole set of
    #chromosomes as region to search for DPs
    if regions is not None:
        with open(regions) as f:
            for line in f:
                line = line.strip()
                line = line.split('\t')
                c, s, e = line[0], int(line[1]), int(line[2])
                regionset.add(GenomicRegion(chrom=c, initial=s, final=e))
                # NOTE(review): keeps the end of the last line read for a
                # chromosome, not the largest end -- confirm this is intended
                chrom_sizes_dict[c] = e
    else:
        with open(chrom_sizes) as f:
            for line in f:
                line = line.strip()
                line = line.split('\t')
                chrom, end = line[0], int(line[1])
                regionset.add(GenomicRegion(chrom=chrom, initial=0, final=end))
                chrom_sizes_dict[chrom] = end
    regionset.sequences.sort()

    # Search window handed to get_extension_size
    start = 0
    end = 600
    ext_stepsize = 5
    #TODO: maybe for-loops?
    #compute extension size
    if [ext_1, ext_2, ext_input_1, ext_input_2].count(None) > 0:
        print("Computing read extension sizes...", file=sys.stderr)
    # values_* are only bound when the matching extension size is computed
    # here; the verbose block below relies on that through locals()
    if ext_1 is None:
        ext_1, values_1 = get_extension_size(bam_file_1, start=start, end=end, stepsize=ext_stepsize)
    print("Read extension for first file: %s" %ext_1, file=sys.stderr)

    if ext_2 is None:
        ext_2, values_2 = get_extension_size(bam_file_2, start=start, end=end, stepsize=ext_stepsize)
    print("Read extension for second file: %s" %ext_2, file=sys.stderr)

    if input_1 is not None and ext_input_1 is None:
        ext_input_1, values_input_1 = get_extension_size(input_1, start=start, end=end, stepsize=ext_stepsize)
    print("Read extension for first input file: %s" %ext_input_1, file=sys.stderr)

    # Same control file for both samples: reuse the estimate computed above.
    # 'ext_input_1' is a parameter and therefore always in locals();
    # 'values_input_1' is present only if the estimate was computed in this
    # call. This branch overwrites an ext_input_2 supplied by the caller.
    if input_1 is not None and input_2 is not None and input_1 == input_2 and 'ext_input_1' in locals() and 'values_input_1' in locals():
        ext_input_2, values_input_2 = ext_input_1, values_input_1
    elif input_2 is not None and ext_input_2 is None:
        ext_input_2, values_input_2 = get_extension_size(input_2, start=start, end=end, stepsize=ext_stepsize)
    print("Read extension for second input file: %s" %ext_input_2, file=sys.stderr)

    # Record the extension sizes that are going to be used
    tracker.write(text=str(ext_1) + "," + str(ext_2), header="Extension size IP1, IP2")
    if input_1 is not None and input_2 is not None:
        tracker.write(text=str(ext_input_1) + "," + str(ext_input_2), header="Extension size Control1, Control2")

    # Dump the per-shift values of every estimate that was computed; each
    # pair is written with its two elements swapped, tab-separated
    if verbose:
        if 'values_1' in locals() and values_1 is not None:
            with open(name + '-read-ext-1', 'w') as f:
                for v, i in values_1:
                    print(i, v, sep='\t', file=f)

        if 'values_2' in locals() and values_2 is not None:
            with open(name + '-read-ext-2', 'w') as f:
                for v, i in values_2:
                    print(i, v, sep='\t', file=f)

        if 'values_input_1' in locals() and values_input_1 is not None:
            with open(name + '-read-ext-input-1', 'w') as f:
                for v, i in values_input_1:
                    print(i, v, sep='\t', file=f)

        if 'values_input_2' in locals() and values_input_2 is not None:
            with open(name + '-read-ext-input-2', 'w') as f:
                for v, i in values_input_2:
                    print(i, v, sep='\t', file=f)

    cov_cdp_mpp = DualCoverageSet(name=name, region=regionset, genome_path=genome_path, binsize=binsize, stepsize=stepsize,rmdup=True,\
                                  file_1=bam_file_1, ext_1=ext_1,\
                                  file_2=bam_file_2, ext_2=ext_2, \
                                  input_1=input_1, ext_input_1=ext_input_1, input_factor_1=input_factor_1, \
                                  input_2=input_2, ext_input_2=ext_input_2, input_factor_2=input_factor_2, \
                                  chrom_sizes=chrom_sizes, verbose=verbose, norm_strategy=norm_strategy, no_gc_content=no_gc_content, deadzones=deadzones,\
                                  factor_input_1=factor_input_1, factor_input_2=factor_input_2, chrom_sizes_dict=chrom_sizes_dict, debug=debug, tracker=tracker)

    return cov_cdp_mpp, [ext_1, ext_2]
def fisher_table(motif_name, regions, mpbs, gene_set=False, mpbs_set=False):
    """
    Count how many regions do and do not overlap the binding sites of one
    motif (the two cells of a contingency table that depend on the motif).

    Keyword arguments:
    motif_name -- Motif identifier; every entry of mpbs whose name contains
                  this string is treated as a binding site of the motif.
    regions -- GenomicRegionSet to be tested.
    mpbs -- GenomicRegionSet holding the binding sites of all motifs.
    gene_set -- If true, also collect the genes named by the overlapping
                regions (default False).
    mpbs_set -- If true, also return the binding sites of the motif that
                overlap regions (default False).

    Return:
    a -- Number of regions overlapping at least one binding site of the motif.
    b -- Number of regions overlapping none of them.
    gene_set -- GeneSet with the unique gene names taken from the names of
                the overlapping regions, or None if gene_set is false.
    mpbs_set -- GenomicRegionSet with the overlapping binding sites, or None
                if mpbs_set is false.
    """

    # Fetching motif
    mpbs_motif = GenomicRegionSet(name="mpbs_motif")
    for region in mpbs.sequences:
        if motif_name in region.name:
            mpbs_motif.add(region)

    # Without any binding site nothing can overlap
    if len(mpbs_motif) == 0:
        gene_set_res = GeneSet(motif_name) if gene_set else None
        mpbs_set_res = GenomicRegionSet(mpbs_motif.name) if mpbs_set else None
        return 0, len(regions), gene_set_res, mpbs_set_res

    # regions which are overlapping with mpbs_motif
    intersect_original = regions.intersect(mpbs_motif,
                                           mode=OverlapType.ORIGINAL,
                                           rm_duplicates=True)
    # regions which are not overlapping with regions from mpbs_motif
    subtract_overlap = regions.subtract(mpbs_motif, whole_region=True)

    # Fetching genes
    if gene_set:
        gene_set_res = GeneSet(motif_name)
        unique_genes = set(gene_set_res.genes)
        for genomic_region in intersect_original.sequences:
            if not genomic_region.name:
                continue
            # A region name is a ":"-separated list of gene names, each
            # optionally prefixed with "."
            for token in genomic_region.name.split(":"):
                gene = token[1:] if token.startswith(".") else token
                # Empty segments ("A::B", trailing ":") and a bare "." carry
                # no gene name; indexing them used to raise IndexError or
                # add an empty gene
                if gene:
                    unique_genes.add(gene)
        gene_set_res.genes = list(unique_genes)  # Keep only unique genes
    else:
        gene_set_res = None

    # Fetching mpbs
    if mpbs_set:
        mpbs_set_res = mpbs_motif.intersect(regions,
                                            mode=OverlapType.ORIGINAL,
                                            rm_duplicates=True)
    else:
        mpbs_set_res = None

    return (len(intersect_original), len(subtract_overlap),
            gene_set_res, mpbs_set_res)
def match_single(motif, sequence, genomic_region, unique_threshold=None, normalize_bitscore=True, sort=False):
    """
    Performs motif matching given sequence and the motif.pssm passed as parameter.
    The genomic_region is needed to evaluate the correct binding position.

    Keyword arguments:
    motif -- Motif to search for. Its pssm_list, threshold, max, len, name and
             is_palindrome attributes are used.
    sequence -- A DNA sequence (string).
    genomic_region -- A GenomicRegion; its chrom and initial position are used
                      to place the matches on the genome.
    unique_threshold -- If this argument is provided, the motif search will be
                        made using a threshold of 0 and then accepting only the
                        motif matches with bitscore/motif_length >= unique_threshold.
    normalize_bitscore -- If true, the reported score is rescaled so that the
                          threshold maps to 0 and the motif maximum to 1000;
                          otherwise the bitscore is reported (divided by the
                          motif length when unique_threshold is used).
    sort -- If true, the resulting set is sorted before being returned.

    Return:
    grs -- A GenomicRegionSet named "mpbs" with one GenomicRegion per accepted
           match; the score is stored as a string in the data field.
    """

    # Establishing threshold. With unique_threshold everything is expressed
    # per motif position, otherwise as raw bitscore.
    if unique_threshold:
        current_threshold = 0.0
        eval_threshold = unique_threshold
        motif_max = motif.max / motif.len
    else:
        current_threshold = motif.threshold
        eval_threshold = motif.threshold
        motif_max = motif.max

    # Performing motif matching
    try:
        # old MOODS version
        results = MOODS.search(sequence, [motif.pssm_list], current_threshold,
                               absolute_threshold=True, both_strands=True)
    except Exception:
        # Any failure of the old interface falls back to the newer one, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        # TODO: we can expand this to use bg from sequence, for example,
        # or from organism.
        bg = MOODS.tools.flat_bg(4)
        results = MOODS.scan.scan_dna(sequence, [motif.pssm_list], bg,
                                      [current_threshold], 7)

    grs = GenomicRegionSet("mpbs")

    for search_result in results:
        for r in search_result:
            try:
                position = r.pos
                score = r.score
            except AttributeError:
                # Results given as plain (position, score) pairs
                (position, score) = r

            # Score on the same scale as eval_threshold and motif_max
            eval_score = score / motif.len if unique_threshold else score

            # Verifying unique threshold acceptance
            if unique_threshold and eval_score < unique_threshold:
                continue

            # If match forward strand
            if position >= 0:
                p1 = genomic_region.initial + position
                strand = "+"
            # If match reverse strand
            elif not motif.is_palindrome:
                p1 = genomic_region.initial - position
                strand = "-"
            else:
                continue

            # Evaluating p2
            p2 = p1 + motif.len

            if normalize_bitscore:
                # Normalized bitscore = standardize to integer between 0 and
                # 1000 (needed for bigbed transformation). The per-position
                # score must be used together with the per-position threshold
                # and maximum; using the raw bitscore here produced values
                # far above 1000 whenever unique_threshold was set.
                if motif_max > eval_threshold:
                    norm_score = int(((eval_score - eval_threshold) * 1000.0) /
                                     (motif_max - eval_threshold))
                else:
                    norm_score = 1000
            else:
                # Keep the original bitscore
                norm_score = eval_score

            grs.add(GenomicRegion(genomic_region.chrom, int(p1), int(p2),
                                  name=motif.name, orientation=strand,
                                  data=str(norm_score)))

    if sort:
        grs.sort()

    return grs
def call_peaks(bam, csizes, pval, min_reads, binsize, cfile=None):
    """
    Call peaks on a BAM file using a per-bin binomial test.

    Keyword arguments:
    bam -- BAM file with the signal reads.
    csizes -- Chromosome sizes, passed to get_chrom_sizes_as_genomicregionset.
    pval -- Not used inside this function.
            NOTE(review): presumably the significance cutoff; filter_bins
            only receives min_reads -- confirm where the cutoff is applied.
    min_reads -- Passed to filter_bins together with the bin count and the
                 count -> p-value table.
    binsize -- Bin size of the second coverage track (step size binsize // 2).
    cfile -- Optional control BAM file; when given, the signal coverage is
             normalised against it with norm_igg.

    Return:
    rc -- Result of clustering the merged peak bins within the estimated
          read extension size.
    cov -- CoverageSet with the (optionally control-normalised) signal used
           for peak calling.
    cov2 -- CoverageSet computed with binsize, intended for output.
    """
    # make chromsizes region set
    rs = get_chrom_sizes_as_genomicregionset(csizes)

    print("calculating extension sizes...")
    ext, _ = get_extension_size(bam, start=0, end=300, stepsize=5)

    print("calculating coverage...")
    cov = CoverageSet('coverageset', rs)
    cov.coverage_from_bam(bam_file=bam, extension_size=ext, paired_reads=True)

    # calculate cov2 for output bw
    cov2 = CoverageSet('coverageset2', rs)
    cov2.coverage_from_bam(bam_file=bam, extension_size=ext, paired_reads=True,
                           binsize=binsize, stepsize=binsize // 2)

    if cfile is not None:
        print(f"Using control file: {cfile}")
        control = CoverageSet('contorl', rs)
        control.coverage_from_bam(bam_file=cfile, extension_size=ext)
        # Division warnings raised during normalisation are silenced here
        with np.errstate(divide='ignore', invalid='ignore'):
            norm_igg(cov, control)
        # recalc overall coverage: one concatenation over all regions
        # instead of a pairwise reduce, which copied the growing array once
        # per region
        cov.overall_cov = np.concatenate(
            [cov.coverage[i] for i in range(len(cov.genomicRegions))])

    # total coverage
    total_reads = np.sum(cov.overall_cov)
    # probability of event, a read in a bin, (avg reads/bin )/libsize
    p = np.mean(cov.overall_cov[cov.overall_cov > 0]) / total_reads
    # what is the max coverage
    maxcov = np.max(cov.overall_cov)

    # p-value for every possible count value, computed once and looked up
    # per bin below
    mc = np.arange(0, maxcov + 1, dtype="object")
    pvalue_by_count = {count: binom_test((count, total_reads - count), p=p)
                       for count in mc}

    # create GenomicRegionSet to hold peaks
    res = GenomicRegionSet('identified_peaks')
    print("calculating peaks...")
    # iterate through bins in genome, store peaks
    for i, c in enumerate(cov.overall_cov):
        if filter_bins(c, pvalue_by_count, min_reads):
            chrom, start, end = cov.index2coordinates(i, rs)
            res.add(GenomicRegion(chrom, start, end + 1,
                                  data=pvalue_by_count[c]))

    # merge ol peaks
    res.merge()
    # merge peaks within ext dist
    rc = res.cluster(ext)
    return rc, cov, cov2
def _intersect(self, y, rm_duplicates=False):
    """Return the regions of self which overlap any region of y.

    This implements the OverlapType.ORIGINAL behaviour only: overlapping
    regions of self are returned unchanged, they are not clipped to the
    overlap.

    Keyword arguments:
    y -- the GenomicRegionSet which to compare with
    rm_duplicates -- if true, duplicated regions are removed from the result

    Return:
    z -- the regions of original GenomicRegionSet which have any intersections with y

    Note: self and y are sorted in place if they are not sorted yet.

    Graphical explanation:
    self       ----------              ------
    y                 ----------                    ----
    Result     ----------
    """
    a = self
    b = y

    # XXX - someone putted an special symbol and spaces in the name! this is
    # used as file name, never use strange characters.
    z = GenomicRegionSet(a.name + ' + ' + b.name)
    if len(a) == 0 or len(b) == 0:
        return z

    # The sweep below requires both sets to be sorted
    if not a.sorted:
        a.sort()
    if not b.sorted:
        b.sort()

    iter_a = iter(a)
    # next() works on Python 2.6+ and 3; the former iter_a.next() exists on
    # Python 2 only. a is known to be non-empty at this point.
    s = next(iter_a)
    last_j = len(b) - 1
    j = 0

    while True:
        other = b[j]
        if s.overlap(other):
            z.add(s)
        elif not (s < other) and s > other:
            # s lies completely behind b[j]: move on in b, or stop when b
            # is exhausted
            if j == last_j:
                break
            j = j + 1
            continue
        # s overlapped b[j], lies before it, or compares equal to it without
        # overlapping: move on in a
        try:
            s = next(iter_a)
        except StopIteration:
            break

    if rm_duplicates:
        z.remove_duplicates()
    return z