def mode_2(exp_matrix):
    """Write the peak-to-gene assignments of every region set of *exp_matrix*.

    For each region set a file ``region_<name>.data`` is written to the
    current directory. Every line is tab-separated and holds: chromosome,
    start, end, the value stored in the region's ``data`` field, and the
    comma-separated genes assigned to the peak (``NA`` when there are none).

    Uses the module-level names ``gene_file`` and ``genome_file``.

    *Keyword arguments:*

        - exp_matrix -- Object whose ``get_regionsets()`` returns the region sets.
    """
    # Remember the bedgraph value of each region, keyed by its coordinates.
    value = {}
    for regions in exp_matrix.get_regionsets():
        for region in regions:
            value[(region.chrom, region.initial, region.final)] = region.data

    for region in exp_matrix.get_regionsets():
        # The context manager closes the output even when a lookup below raises.
        with open("region_" + str(region.name) + ".data", 'w') as f:
            region_set = GenomicRegionSet("")
            _, _, _, _, gene_peaks_mapping = region_set.filter_by_gene_association(
                region.fileName, None, gene_file, genome_file, threshDist=2000)

            for k in gene_peaks_mapping.keys():
                # Keys look like "<chrom>:<start>-<end>".
                chrom, raw_positions = k.split(':')
                start, end = [int(x) for x in raw_positions.split('-')]

                # If a peak is not assigned, an empty string occurs.
                if "" in gene_peaks_mapping[k]:
                    gene_peaks_mapping[k].remove("")

                # Dedicated names instead of shadowing the builtins list/chr.
                if gene_peaks_mapping[k]:
                    gene_names = ','.join(gene_peaks_mapping[k])
                else:
                    gene_names = 'NA'

                print(chrom, start, end, value[(chrom, start, end)], gene_names,
                      sep='\t', file=f)
def get_biotypes(self, gene_set=None):
    """Collect all exon regions of the annotation into one GenomicRegionSet.

    NOTE(review): despite its name this method performs no biotype-specific
    selection; it queries every "exon" feature and returns a single set.

    *Keyword arguments:*

        - gene_set -- Only selects the return shape; it is not used to filter the query.

    *Return:*

        - result_grs -- A GenomicRegionSet named "exon"; every region is named after its transcript ID.
        - None -- Returned as a second element only when gene_set is truthy.
    """
    # Fetching exons
    exon_annotation = self.get({self.GeneField.FEATURE_TYPE: "exon"})

    # Creating GenomicRegionSet
    result_grs = GenomicRegionSet("exon")
    for entry in exon_annotation.gene_list:
        region = entry[self.GeneField.GENOMIC_REGION]
        region.name = entry[self.GeneField.TRANSCRIPT_ID]
        result_grs.add(region)

    if not gene_set:
        return result_grs
    # No gene-name mapping is done here, so the unmapped list is always None.
    return result_grs, None
def merge_rbs(self, rm_duplicate=False, asgene_organism=None, region_set=None, cutoff=0):
    """Merge overlapping RNA binding sites and group their DNA binding sites.

    The RNA binding sites returned by ``self.get_rbs()`` are merged; for each
    merged site the DNA regions of every entry whose RNA site overlaps it are
    collected. The result is stored in ``self.merged_dict`` (an OrderedDict
    mapping merged RNA site -> GenomicRegionSet); nothing is returned.

    *Keyword arguments:*

        - rm_duplicate -- Remove duplicated DNA regions within each group.
        - asgene_organism -- If given, replace each group by the result of its ``gene_association(organism=...)``.
        - region_set -- If given, passed to ``replace_region_name(regions=...)`` of each group.
        - cutoff -- Only groups with more than this many DNA regions are kept.
    """
    # Merge RBS
    rna_merged = self.get_rbs()
    rna_merged.merge()

    # A dict: RBS as key, and GenomicRegionSet as its value
    new_dict = OrderedDict()

    for rbsm in rna_merged:
        regions = GenomicRegionSet(rbsm.toString())
        for rd in self:
            if rbsm.overlap(rd.rna):
                regions.add(rd.dna)

        if rm_duplicate:
            regions.remove_duplicates()

        # Groups at or below the cutoff are dropped entirely.
        if len(regions) <= cutoff:
            continue

        new_dict[rbsm] = regions

        if asgene_organism:
            # Gene association is best effort: on failure the unannotated
            # regions are kept. Only ordinary errors are swallowed, so
            # KeyboardInterrupt/SystemExit still propagate.
            try:
                new_dict[rbsm] = new_dict[rbsm].gene_association(organism=asgene_organism)
            except Exception:
                pass

        if region_set:
            new_dict[rbsm].replace_region_name(regions=region_set)

    self.merged_dict = new_dict
def mode_2(exp_matrix):
    """Write the peak-to-gene assignments of every region set of *exp_matrix*.

    For each region set a file ``region_<name>.data`` is written to the
    current directory. Every line is tab-separated and holds: chromosome,
    start, end, the value stored in the region's ``data`` field, and the
    comma-separated genes assigned to the peak (``NA`` when there are none).

    Uses the module-level names ``gene_file`` and ``genome_file``.

    *Keyword arguments:*

        - exp_matrix -- Object whose ``get_regionsets()`` returns the region sets.
    """
    # Remember the bedgraph value of each region, keyed by its coordinates.
    value = {}
    for regions in exp_matrix.get_regionsets():
        for region in regions:
            value[(region.chrom, region.initial, region.final)] = region.data

    for region in exp_matrix.get_regionsets():
        # The context manager closes the output even when a lookup below raises.
        with open("region_" + str(region.name) + ".data", 'w') as f:
            region_set = GenomicRegionSet("")
            _, _, _, _, gene_peaks_mapping = region_set.filter_by_gene_association(
                region.fileName, None, gene_file, genome_file, thresh_dist=2000)

            # Iterating the dict directly is safe: only the value lists are
            # modified below, never the set of keys.
            for k in gene_peaks_mapping:
                # Keys look like "<chrom>:<start>-<end>".
                chrom, raw_positions = k.split(':')
                start, end = [int(x) for x in raw_positions.split('-')]

                # If a peak is not assigned, an empty string occurs.
                if "" in gene_peaks_mapping[k]:
                    gene_peaks_mapping[k].remove("")

                # Must not be called "list": a local of that name made the
                # builtin list() unusable anywhere in this function.
                if gene_peaks_mapping[k]:
                    gene_names = ','.join(gene_peaks_mapping[k])
                else:
                    gene_names = 'NA'

                print(chrom, start, end, value[(chrom, start, end)], gene_names,
                      sep='\t', file=f)
def mode_3(exp_matrix):
    """Write the average peak score per gene for every region set of *exp_matrix*.

    For each region set a file ``region_<name>.data`` is written to the
    current directory. Every line is tab-separated and holds: gene, mean of
    the scores of all peaks assigned to that gene, and the ", "-joined peaks.

    Uses the module-level names ``gene_file`` and ``genome_file``.

    *Keyword arguments:*

        - exp_matrix -- Object whose ``get_regionsets()`` returns the region sets.
    """
    # Remember the bedgraph value of each region, keyed by "<chrom>:<start>-<end>".
    score = {}
    for regions in exp_matrix.get_regionsets():
        for region in regions:
            key = region.chrom + ':' + str(region.initial) + '-' + str(region.final)
            score[key] = region.data

    for region in exp_matrix.get_regionsets():
        # The context manager closes the output even when a lookup below raises.
        with open("region_" + str(region.name) + ".data", 'w') as f:
            region_set = GenomicRegionSet("")
            _, _, _, _, gene_peaks_mapping = region_set.filter_by_gene_association(
                region.fileName, None, gene_file, genome_file, thresh_dist=2000)

            avg_score = {}  # scores of the peaks assigned to each gene
            genes = {}      # peaks assigned to each gene

            # Reverse the mapping peak -> genes into gene -> peaks.
            for peak, gene_list in gene_peaks_mapping.items():
                for gen in gene_list:
                    if not gen:
                        # Unassigned peaks show up as empty gene names.
                        continue
                    genes.setdefault(gen, set()).add(peak)
                    avg_score.setdefault(gen, []).append(score[peak])

            # Join all scores of the peaks assigned to a gene.
            for gen in genes:
                avg = sum(float(x) for x in avg_score[gen]) / float(len(avg_score[gen]))
                print(gen, avg, ", ".join(str(t) for t in genes[gen]), sep='\t', file=f)
def get_transcripts(self, gene_set = None):
    """Return the exon regions of the annotation, named by transcript.

    Every "exon" feature (optionally restricted to the given genes) is added
    to one GenomicRegionSet; the NAME field of each region is set to the
    transcript ID of the exon.

    *Keyword arguments:*

        - gene_set -- A set of genes to narrow the search.

    *Return:*

        - result_grs -- A GenomicRegionSet containing the exons.
        - unmapped_gene_list -- The second value returned by ``self.fix_gene_names``; only returned when gene_set is given.
    """
    # Fetching gene names
    unmapped_gene_list = None
    if gene_set:
        mapped_gene_list, unmapped_gene_list = self.fix_gene_names(gene_set)

    # Fetching exons
    query_dictionary = {self.GeneField.FEATURE_TYPE: "exon"}
    if gene_set:
        query_dictionary[self.GeneField.GENE_ID] = mapped_gene_list
    exon_annotation = self.get(query_dictionary)

    # Creating GenomicRegionSet
    result_grs = GenomicRegionSet("exon")
    for entry in exon_annotation.gene_list:
        region = entry[self.GeneField.GENOMIC_REGION]
        region.name = entry[self.GeneField.TRANSCRIPT_ID]
        result_grs.add(region)

    if not gene_set:
        return result_grs
    return result_grs, unmapped_gene_list
def get_genes(self, gene_set = None):
    """Return the merged regions of genes as a GenomicRegionSet.

    The NAME field of each region is set to the gene ID before the set is
    merged.

    Keyword arguments:
    gene_set -- A set of genes to narrow the search.

    Return:
    result_grs -- A GenomicRegionSet containing the genes.
    unmapped_gene_list -- The second value returned by self.fix_gene_names;
                          only returned when gene_set is given.
    """
    # Fetching gene names
    unmapped_gene_list = None
    if gene_set:
        mapped_gene_list, unmapped_gene_list = self.fix_gene_names(gene_set)

    # Fetching genes
    query_dictionary = {self.GeneField.FEATURE_TYPE: "gene"}
    if gene_set:
        query_dictionary[self.GeneField.GENE_ID] = mapped_gene_list
    gene_annotation = self.get(query_dictionary)

    # Creating GenomicRegionSet
    result_grs = GenomicRegionSet("genes")
    for entry in gene_annotation.gene_list:
        region = entry[self.GeneField.GENOMIC_REGION]
        region.name = entry[self.GeneField.GENE_ID]
        result_grs.add(region)
    result_grs.merge()

    if not gene_set:
        return result_grs
    return result_grs, unmapped_gene_list
def get_dbs(self, sort=False, orientation=None, rm_duplicate=False, dbd_tag=False):
    """Return a GenomicRegionSet which contains all DNA binding sites.

    A new GenomicRegion is built for every entry of ``self.sequences`` from
    the coordinates and orientation of its DNA region, with the entry's
    score as data.

    *Keyword arguments:*

        - sort -- Sort the resulting set.
        - orientation -- If given, keep only entries whose ``orient`` equals it.
        - rm_duplicate -- Remove duplicates from the resulting set.
        - dbd_tag -- Name each site after ``rd.rna.str_rna()`` instead of the DNA region's own name.
    """
    dna_set = GenomicRegionSet(name="DNA_binding_sites")
    if len(self) == 0:
        return dna_set

    for rd in self.sequences:
        site_name = rd.rna.str_rna() if dbd_tag else rd.dna.name
        dbs = GenomicRegion(chrom=rd.dna.chrom, initial=rd.dna.initial, final=rd.dna.final,
                            name=site_name, orientation=rd.dna.orientation, data=rd.score)
        # Without an orientation filter every site is kept.
        if not orientation or orientation == rd.orient:
            dna_set.add(dbs)

    if sort:
        dna_set.sort()
    if rm_duplicate:
        dna_set.remove_duplicates()
    return dna_set
def mode_1(exp_matrix):
    """Print the genes associated with every region set of *exp_matrix*.

    For each region set two lines go to standard output: the number of
    mapped genes, and the region set name followed by the tab-separated
    entries of ``region_set.genes``.

    Uses the module-level names ``gene_file`` and ``genome_file``.

    *Keyword arguments:*

        - exp_matrix -- Object whose ``get_regionsets()`` returns the region sets.
    """
    for region in exp_matrix.get_regionsets():
        region_set = GenomicRegionSet("")
        association = region_set.filter_by_gene_association(
            region.fileName, None, gene_file, genome_file, thresh_dist=50000)
        # Only the third of the five returned values is needed here.
        _, _, mappedGenes, _, _ = association

        print('#number of mapped genes:', mappedGenes)
        gene_columns = "\t".join(region_set.genes)
        print(region.name + "\t" + gene_columns)
def mode_3(exp_matrix):
    """Write the average peak score per gene for every region set of *exp_matrix*.

    For each region set a file ``region_<name>.data`` is written to the
    current directory. Every line is tab-separated and holds: gene, mean of
    the scores of all peaks assigned to that gene, and the ", "-joined peaks.

    Uses the module-level names ``gene_file`` and ``genome_file``.

    *Keyword arguments:*

        - exp_matrix -- Object whose ``get_regionsets()`` returns the region sets.
    """
    # Remember the bedgraph value of each region, keyed by "<chrom>:<start>-<end>".
    score = {}
    for regions in exp_matrix.get_regionsets():
        for region in regions:
            key = region.chrom + ':' + str(region.initial) + '-' + str(region.final)
            score[key] = region.data

    for region in exp_matrix.get_regionsets():
        # The context manager closes the output even when a lookup below raises.
        with open("region_" + str(region.name) + ".data", 'w') as f:
            region_set = GenomicRegionSet("")
            _, _, _, _, gene_peaks_mapping = region_set.filter_by_gene_association(
                region.fileName, None, gene_file, genome_file, threshDist=2000)

            avg_score = {}  # scores of the peaks assigned to each gene
            genes = {}      # peaks assigned to each gene

            # Reverse the mapping peak -> genes into gene -> peaks.
            for peak, gene_list in gene_peaks_mapping.items():
                for gen in gene_list:
                    if not gen:
                        # Unassigned peaks show up as empty gene names.
                        continue
                    genes.setdefault(gen, set()).add(peak)
                    avg_score.setdefault(gen, []).append(score[peak])

            # Join all scores of the peaks assigned to a gene.
            for gen in genes:
                avg = sum(float(x) for x in avg_score[gen]) / float(len(avg_score[gen]))
                print(gen, avg, ", ".join(str(t) for t in genes[gen]), sep='\t', file=f)
def region_sets(self,listA,listB):
    """Build self.setA and self.setB from two lists of coordinates.

    Every entry of listA / listB is indexed as (chrom, initial, final) and
    turned into a GenomicRegion of the corresponding set.
    """
    def _fill(target, rows):
        # Positions 0, 1 and 2 of each row hold chrom, initial and final.
        for row in rows:
            target.add(GenomicRegion(chrom=row[0], initial=row[1], final=row[2]))

    self.setA = GenomicRegionSet('for Unit Test')
    _fill(self.setA, listA)

    self.setB = GenomicRegionSet('for Unit Test')
    _fill(self.setB, listB)
def get_exons(self, start_site=False, end_site=False, gene_set=None, merge=True):
    """Return the exon regions of the annotation, named by transcript.

    The NAME field of each region is set to the transcript ID of the exon.

    *Keyword arguments:*

        - start_site -- Relocate every region with ``relocate_regions("leftend", ...)``.
        - end_site -- Relocate every region with ``relocate_regions("rightend", ...)``; ignored when start_site is set.
        - gene_set -- A set of genes to narrow the search.
        - merge -- Merge the resulting set before returning it.

    *Return:*

        - result_grs -- A GenomicRegionSet containing the exons.
        - unmapped_gene_list -- The second value returned by ``self.fix_gene_names``; only returned when gene_set is given.
    """
    # Fetching gene names
    unmapped_gene_list = None
    if gene_set:
        mapped_gene_list, unmapped_gene_list = self.fix_gene_names(gene_set)

    # Fetching exons
    query_dictionary = {self.GeneField.FEATURE_TYPE: "exon"}
    if gene_set:
        query_dictionary[self.GeneField.GENE_ID] = mapped_gene_list
    exon_annotation = self.get(query_dictionary)

    # Creating GenomicRegionSet
    result_grs = GenomicRegionSet("exon")
    for entry in exon_annotation.gene_list:
        region = entry[self.GeneField.GENOMIC_REGION]
        region.name = entry[self.GeneField.TRANSCRIPT_ID]
        result_grs.add(region)

    # start_site takes precedence over end_site.
    relocation = "leftend" if start_site else ("rightend" if end_site else None)
    if relocation is not None:
        result_grs.relocate_regions(relocation, left_length=1, right_length=1)

    if merge:
        result_grs.merge()

    if not gene_set:
        return result_grs
    return result_grs, unmapped_gene_list
def load_objects(self, is_bedgraph, verbose=False, test=False):
    """Load files and initialize objects.

    Every experiment of type "regions" or "genes" is read from its file
    (given as an absolute path) and stored in ``self.objectsDict`` under its
    name. Other types are skipped.

    *Keyword arguments:*

        - is_bedgraph -- Whether region files are read with ``read_bedgraph`` instead of ``read_bed`` (required, no default).
        - verbose -- Verbose output on standard error (default = False).
        - test -- For BED files keep only the leading regions selected by the slice ``[0:11]``; not applied to bedgraph files.
    """
    for idx, kind in enumerate(self.types):
        if verbose:
            print("Loading file ", self.files[self.names[idx]], file = sys.stderr)
            if kind not in ("regions", "genes"):
                print("Cannot load objects", file=sys.stderr)

        if kind == "regions":
            label = self.names[idx]
            regions = GenomicRegionSet(label)
            if is_bedgraph:
                regions.read_bedgraph(os.path.abspath(self.files[label]))
            elif test:
                # Read into a temporary set and keep only its leading regions.
                full_set = GenomicRegionSet(label)
                full_set.read_bed(os.path.abspath(self.files[label]))
                regions.sequences = full_set.sequences[0:11]
            else:
                # Here change the relative path into absolute path
                regions.read_bed(os.path.abspath(self.files[label]))
            self.objectsDict[label] = regions

        elif kind == "genes":
            label = self.names[idx]
            genes = GeneSet(label)
            # Here change the relative path into absolute path
            genes.read(os.path.abspath(self.files[label]))
            self.objectsDict[label] = genes
def get_promoters(self, promoterLength=1000, gene_set=None, unmaplist=False):
    """Return promoter regions of the annotated transcripts.

    Every "transcript" feature is turned into a promoter: for "+" regions
    the promoter spans from ``initial - promoterLength`` to ``initial + 1``;
    for all other orientations it spans from ``final - 1`` to
    ``final + promoterLength``. The promoter therefore also includes the
    coordinate of the 5' base pair and has length promoterLength+1. The NAME
    field of each region is set to the gene ID.

    NOTE: the GenomicRegion objects obtained from the query are modified in
    place.

    Keyword arguments:
    promoterLength -- The length of the promoter region.
    gene_set -- A set of genes to narrow the search.
    unmaplist -- If True, also return the unmapped gene list.

    Return:
    result_grs -- A GenomicRegionSet containing the promoters.
    unmapped_gene_list -- The second value returned by self.fix_gene_names
                          (None without gene_set); only returned when
                          unmaplist is True.
    """
    # Fetching gene names
    unmapped_gene_list = None
    if gene_set:
        mapped_gene_list, unmapped_gene_list = self.fix_gene_names(gene_set)

    # Fetching transcripts
    query_dictionary = {self.GeneField.FEATURE_TYPE: "transcript"}
    if gene_set:
        query_dictionary[self.GeneField.GENE_ID] = mapped_gene_list
    transcript_annotation = self.get(query_dictionary)

    # Creating GenomicRegionSet
    result_grs = GenomicRegionSet("promoters")
    for entry in transcript_annotation.gene_list:
        region = entry[self.GeneField.GENOMIC_REGION]
        if region.orientation == "+":
            five_prime = region.initial
            region.final = five_prime + 1
            region.initial = five_prime - promoterLength
        else:
            region.initial = region.final - 1
            region.final = region.initial + promoterLength + 1
        region.name = entry[self.GeneField.GENE_ID]
        result_grs.add(region)

    if not unmaplist:
        return result_grs
    return result_grs, unmapped_gene_list
def get_dbs(self, sort=False, orientation=None, rm_duplicate=False):
    """Return a GenomicRegionSet which contains all DNA binding sites.

    *Keyword arguments:*

        - sort -- Sort the resulting set.
        - orientation -- If given, keep only entries whose ``orient`` equals it.
        - rm_duplicate -- Remove duplicates from the resulting set.
    """
    dna_set = GenomicRegionSet(name="DNA_binding_sites")

    for rd in self.sequences:
        # Without an orientation filter every DNA region is kept.
        if not orientation or orientation == rd.orient:
            dna_set.add(rd.dna)

    if sort:
        dna_set.sort()
    if rm_duplicate:
        dna_set.remove_duplicates()
    return dna_set
def sort_dbs_by_regions(self, regionset):
    """Sort the DBS by given GenomicRegionSet.

    Sweeps the sorted DNA binding sites (``self.get_dbs(sort=True)``) and the
    sorted regions in parallel and collects, for every region, the binding
    sites that overlap it.

    *Keyword arguments:*

        - regionset -- The regions to group by; it is sorted in place if its ``sorted`` flag is not set.

    *Return:*

        - result -- A dict mapping ``region.toString()`` to a GenomicRegionSet named ``"RBS_" + region.toString()`` for every region visited by the sweep. It is empty when there are no binding sites or no regions.
    """
    dbss = self.get_dbs(sort=True)
    result = {}

    if not regionset.sorted:
        regionset.sort()

    # next() with a default works on Python 2.6+ and Python 3, unlike the
    # Python-2-only iterator.next(); None marks the end of the binding sites.
    iter_dbs = iter(dbss)
    dbs = next(iter_dbs, None)
    if dbs is None or len(regionset) == 0:
        return result

    def bucket(index):
        # Return the set collecting the sites of region `index`. It is created
        # on first use only, so sites gathered earlier survive when the sweep
        # steps back to `pre_inter` and passes over the same region again.
        key = regionset[index].toString()
        if key not in result:
            result[key] = GenomicRegionSet("RBS_" + key)
        return result[key]

    last_j = len(regionset) - 1
    j = 0
    bucket(j)              # the first region is visited from the start
    pre_inter = 0          # first region of the current run of overlaps
    cont_overlap = False   # whether the previous step was an overlap

    while True:
        region = regionset[j]

        if dbs.overlap(region):
            # When the regions overlap
            bucket(j).add(dbs)
            if not cont_overlap:
                pre_inter = j
            if j == last_j:
                dbs = next(iter_dbs, None)
                if dbs is None:
                    break
            else:
                j = j + 1
                bucket(j)
            cont_overlap = True

        elif dbs < region:
            dbs = next(iter_dbs, None)
            if dbs is None:
                break
            # The next site may still overlap regions already passed.
            j = pre_inter
            cont_overlap = False

        elif dbs > region:
            if j == last_j:
                break
            j = j + 1
            bucket(j)
            cont_overlap = False

    return result
def get_promoters(self, promoterLength=1000, gene_set=None, unmaplist=False, variants=False):
    """Return promoter regions of the annotated genes or transcripts.

    For "+" regions the promoter spans from ``initial - promoterLength`` to
    ``initial + 1``; for all other orientations it spans from ``final - 1``
    to ``final + promoterLength``. The promoter therefore also includes the
    coordinate of the 5' base pair and has length promoterLength+1. The NAME
    field of each region is set to the gene ID.

    NOTE: the GenomicRegion objects obtained from the query are modified in
    place.

    *Keyword arguments:*

        - promoterLength -- The length of the promoter region.
        - gene_set -- A set of genes to narrow the search.
        - unmaplist -- If True than also return the unmappable genes list (default = False).
        - variants -- If True query "transcript" features instead of "gene" features (default = False).

    *Return:*

        - result_grs -- A GenomicRegionSet containing the promoters.
        - unmapped_gene_list -- The second value returned by ``self.fix_gene_names`` (None without gene_set); only returned when unmaplist is True.
    """
    # Fetching gene names
    unmapped_gene_list = None
    if gene_set:
        mapped_gene_list, unmapped_gene_list = self.fix_gene_names(gene_set)

    # Fetching genes (or their transcript variants)
    target = "transcript" if variants else "gene"
    query_dictionary = {self.GeneField.FEATURE_TYPE: target}
    if gene_set:
        query_dictionary[self.GeneField.GENE_ID] = mapped_gene_list
    annotation = self.get(query_dictionary)

    # Creating GenomicRegionSet
    result_grs = GenomicRegionSet("promoters")
    for entry in annotation.gene_list:
        region = entry[self.GeneField.GENOMIC_REGION]
        if region.orientation == "+":
            five_prime = region.initial
            region.final = five_prime + 1
            region.initial = five_prime - promoterLength
        else:
            region.initial = region.final - 1
            region.final = region.initial + promoterLength + 1
        region.name = entry[self.GeneField.GENE_ID]
        result_grs.add(region)

    if not unmaplist:
        return result_grs
    return result_grs, unmapped_gene_list
def region_sets(self, listA, listB):
    """Build self.setA and self.setB from two lists of coordinates.

    Every entry of listA / listB is indexed as (chrom, initial, final) and
    turned into a GenomicRegion of the corresponding set.
    """
    self.setA = GenomicRegionSet('for Unit Test')
    for coords in listA:
        region_a = GenomicRegion(chrom=coords[0], initial=coords[1], final=coords[2])
        self.setA.add(region_a)

    self.setB = GenomicRegionSet('for Unit Test')
    for coords in listB:
        region_b = GenomicRegion(chrom=coords[0], initial=coords[1], final=coords[2])
        self.setB.add(region_b)
def match_ms_tags(self, field, test=False):
    """Add more entries to match the missing tags of the given field.

    For example, there are tags for cell like 'cell_A' and 'cell_B' for
    reads, but no such tags for regions. Then the regions are repeated for
    each tag from reads to match all reads.

    Every experiment listed under the "ALL" tag of *field* is replicated once
    per other tag of that field under the name ``<name>_<tag>``; the replica
    is registered in ``self.names``, ``self.types``, ``self.files`` and in
    ``self.fieldsDict`` for every field from the fourth one on. Replicas of
    type "regions" are read from the original's BED file into
    ``self.objectsDict``. The original name is appended to ``self.trash`` once
    per replica created. Nothing happens when *field* has no "ALL" tag.

    *Keyword arguments:*

        - field -- Field to add extra entries.
        - test -- Keep only the leading regions selected by the slice ``[0:11]`` of each replicated region set.
    """
    tags = self.fieldsDict[field]
    # check regions or reads have empty tag
    if "ALL" not in tags:
        return

    # Snapshot of the other tags as a list: on Python 3 dict.keys() is a view
    # that has no remove() method.
    altypes = [tag for tag in tags if tag != "ALL"]

    for name in tags["ALL"]:
        i = self.names.index(name)
        for t in altypes:
            n = name + "_" + t
            self.names.append(n)
            self.types.append(self.types[i])
            self.files[n] = self.files[name]

            for f in self.fields[3:]:
                # The replica gets the new tag in `field` and inherits the
                # original's tag in every other field.
                key = t if f == field else self.get_type(name=name, field=f)
                self.fieldsDict[f].setdefault(key, []).append(n)

            if self.types[i] == "regions":
                g = GenomicRegionSet(n)
                g.read_bed(self.files[name])
                if test:
                    g.sequences = g.sequences[0:11]
                self.objectsDict[n] = g

            self.trash.append(name)
def test_get_genome_data(self):
    """Check how many regions get_genome_data loads for hg19."""
    cases = (
        # hg19
        ({"organism": "hg19"}, 23),
        # hg19, with Mitochondria chromosome
        ({"organism": "hg19", "chrom_M": True}, 24),
    )
    for arguments, expected_length in cases:
        result = GenomicRegionSet("hg19")
        result.get_genome_data(**arguments)
        self.assertEqual(len(result), expected_length)
def load_objects(self, is_bedgraph, verbose=False, test=False):
    """Load files and initialize objects.

    Every experiment of type "regions" or "genes" is read from its file
    (given as an absolute path) and stored in ``self.objectsDict`` under its
    name. Other types are skipped.

    *Keyword arguments:*

        - is_bedgraph -- Whether region files are read with ``read_bedgraph`` instead of ``read_bed`` (required, no default).
        - verbose -- Verbose output on standard error (default = False).
        - test -- For BED files keep only the leading regions selected by the slice ``[0:11]``; not applied to bedgraph files.
    """
    for idx, kind in enumerate(self.types):
        if verbose:
            print("Loading file ", self.files[self.names[idx]], file=sys.stderr)
            if kind not in ("regions", "genes"):
                print("Cannot load objects", file=sys.stderr)

        if kind == "regions":
            label = self.names[idx]
            regions = GenomicRegionSet(label)
            # Relative paths are turned into absolute paths before reading.
            if is_bedgraph:
                regions.read_bedgraph(os.path.abspath(self.files[label]))
            else:
                regions.read_bed(os.path.abspath(self.files[label]))
                if test:
                    regions.sequences = regions.sequences[0:11]
            self.objectsDict[label] = regions

        elif kind == "genes":
            label = self.names[idx]
            genes = GeneSet(label)
            # Here change the relative path into absolute path
            genes.read(os.path.abspath(self.files[label]))
            self.objectsDict[label] = genes
def match_ms_tags(self,field):
    """Add more entries to match the missing tags of the given field.

    For example, there are tags for cell like 'cell_A' and 'cell_B' for
    reads, but no such tags for regions. Then the regions are repeated for
    each tag from reads to match all reads.

    Every experiment listed under the "ALL" tag of *field* is replicated once
    per other tag of that field under the name ``<name>_<tag>``; the replica
    is registered in ``self.names``, ``self.types``, ``self.files`` and in
    ``self.fieldsDict`` for every field from the fourth one on. Replicas of
    type "regions" are read from the original's BED file into
    ``self.objectsDict``. The original name is appended to ``self.trash`` once
    per replica created. Nothing happens when *field* has no "ALL" tag.

    *Keyword arguments:*

        - field -- Field to add extra entries.
    """
    tags = self.fieldsDict[field]
    # check regions or reads have empty tag
    if "ALL" not in tags:
        return

    # Snapshot of the other tags as a list: on Python 3 dict.keys() is a view
    # that has no remove() method.
    altypes = [tag for tag in tags if tag != "ALL"]

    for name in tags["ALL"]:
        i = self.names.index(name)
        for t in altypes:
            n = name + "_" + t
            self.names.append(n)
            self.types.append(self.types[i])
            self.files[n] = self.files[name]

            for f in self.fields[3:]:
                # The replica gets the new tag in `field` and inherits the
                # original's tag in every other field.
                key = t if f == field else self.get_type(name=name, field=f)
                self.fieldsDict[f].setdefault(key, []).append(n)

            if self.types[i] == "regions":
                g = GenomicRegionSet(n)
                g.read_bed(self.files[name])
                self.objectsDict[n] = g

            self.trash.append(name)
def mode_3(exp_matrix, thresh, type_file):
    """Write one aggregated peak score per gene for every region set of *exp_matrix*.

    For each region set a file ``region_<name>.data`` is written to the
    current directory. Every line is tab-separated and holds: gene, the
    aggregated score of all peaks assigned to that gene, and the ", "-joined
    peaks. A progress line per region set goes to standard error.

    Uses the module-level names ``gene_file``, ``genome_file``, ``options``
    (its ``metric`` attribute selects 'mean' or 'max') and ``np``.

    *Keyword arguments:*

        - exp_matrix -- Object whose ``get_regionsets()`` returns the region sets.
        - thresh -- Passed as ``threshDist`` to the gene association.
        - type_file -- "ODIN": the score is the last ";"-separated item of the last tab-separated column of ``region.data``; "THOR": the last ";"-separated item of ``region.data``; anything else: ``region.data`` itself.

    *Raises:*

        - ValueError -- If ``options.metric`` is neither 'mean' nor 'max' when a gene is aggregated.
    """
    # Remember the score of each region, keyed by "<chrom>:<start>-<end>".
    score = {}
    for regions in exp_matrix.get_regionsets():
        for region in regions:
            key = region.chrom + ':' + str(region.initial) + '-' + str(region.final)
            if type_file == "ODIN":
                # Use the parsed item, not the last character of the raw data.
                aux = region.data.split("\t")[-1].split(";")
                score[key] = float(aux[-1])
            elif type_file == "THOR":
                # elif, so the fallback below cannot overwrite an ODIN score.
                aux = region.data.split(";")
                score[key] = float(aux[-1])
            else:
                score[key] = region.data

    for i, region in enumerate(exp_matrix.get_regionsets()):
        # The context manager closes the output even when a lookup below raises.
        with open("region_" + str(region.name) + ".data", 'w') as f:
            region_set = GenomicRegionSet("")
            _, _, mappedGenes, _, gene_peaks_mapping = region_set.filter_by_gene_association_old(
                region.fileName, None, gene_file, genome_file, threshDist=thresh)

            avg_score = {}  # scores of the peaks assigned to each gene
            genes = {}      # peaks assigned to each gene

            print('Consider row %s of exp. matrix, number of mapped genes is %s' % (i, mappedGenes),
                  file=sys.stderr)

            # Reverse the mapping peak -> genes into gene -> peaks.
            for peak, gene_list in gene_peaks_mapping.items():
                for gen in gene_list:
                    if not gen:
                        # Unassigned peaks show up as empty gene names.
                        continue
                    genes.setdefault(gen, set()).add(peak)
                    avg_score.setdefault(gen, []).append(score[peak])

            # Join all scores of the peaks assigned to a gene.
            for gen in genes:
                if options.metric == 'mean':
                    avg = np.mean(avg_score[gen])
                elif options.metric == 'max':
                    avg = np.max(avg_score[gen])
                else:
                    # Previously this left `avg` unbound (NameError).
                    raise ValueError("Unsupported metric: %r" % (options.metric,))
                print(gen, avg, ", ".join(str(t) for t in genes[gen]), sep='\t', file=f)
def get_tts(self, gene_set=None):
    """Gets TTS (Transcription termination site) of genes.

    It returns a GenomicRegionSet with one single-base region per gene at
    the 3' end of the gene: ``[final - 1, final)`` for "+" genes and
    ``[initial, initial + 1)`` for all other orientations. The ID of each
    gene is put in the NAME field of each GenomicRegion and the set is
    merged before it is returned.

    NOTE: the GenomicRegion objects obtained from the query are modified in
    place.

    *Keyword arguments:*

        - gene_set -- A set of genes to narrow the search.

    *Return:*

        - result_grs -- A GenomicRegionSet containing TTS.
        - unmapped_gene_list -- The second value returned by ``self.fix_gene_names``; only returned when gene_set is given.
    """
    # Fetching gene names
    unmapped_gene_list = None
    if gene_set:
        mapped_gene_list, unmapped_gene_list = self.fix_gene_names(gene_set)

    # Fetching genes
    query_dictionary = {self.GeneField.FEATURE_TYPE: "gene"}
    if gene_set:
        query_dictionary[self.GeneField.GENE_ID] = mapped_gene_list
    query_annset = self.get(query_dictionary)

    # Creating GenomicRegionSet
    result_grs = GenomicRegionSet("TTS")
    for e in query_annset.gene_list:
        gr = e[self.GeneField.GENOMIC_REGION]
        # Transcription terminates at the 3' end: the highest coordinate on
        # the "+" strand, the lowest one otherwise. (The previous code picked
        # the 5' end, i.e. the transcription start site.)
        if gr.orientation == "+":
            gr.initial = gr.final - 1
        else:
            gr.final = gr.initial + 1
        gr.name = e[self.GeneField.GENE_ID]
        result_grs.add(gr)
    result_grs.merge()

    if gene_set:
        return result_grs, unmapped_gene_list
    return result_grs
def merge_rbs(self, rm_duplicate=False, asgene_organism=None, cutoff=0):
    """Merge overlapping RNA binding sites and group their DNA binding sites.

    The RNA binding sites returned by ``self.get_rbs()`` are merged; for each
    merged site the DNA regions of every entry whose RNA site overlaps it are
    collected. The result is stored in ``self.merged_dict`` (an OrderedDict
    mapping merged RNA site -> GenomicRegionSet); nothing is returned.

    *Keyword arguments:*

        - rm_duplicate -- Remove duplicated DNA regions within each group.
        - asgene_organism -- If given, replace each group by the result of its ``gene_association(organism=...)``.
        - cutoff -- Only groups with more than this many DNA regions are kept.
    """
    # Merge RBS
    rna_merged = self.get_rbs()
    rna_merged.merge()

    # A dict: RBS as key, and GenomicRegionSet as its value
    new_dict = OrderedDict()

    for rbsm in rna_merged:
        regions = GenomicRegionSet(rbsm.toString())
        for rd in self:
            if rbsm.overlap(rd.rna):
                regions.add(rd.dna)

        if rm_duplicate:
            regions.remove_duplicates()

        # Groups at or below the cutoff are dropped entirely.
        if len(regions) <= cutoff:
            continue

        new_dict[rbsm] = regions

        if asgene_organism:
            # Gene association is best effort: on failure the unannotated
            # regions are kept and a notice is printed. Only ordinary errors
            # are caught, so KeyboardInterrupt/SystemExit still propagate.
            try:
                new_dict[rbsm] = new_dict[rbsm].gene_association(organism=asgene_organism)
            except Exception:
                print("* No annotation file for mapping associated genes.")

    self.merged_dict = new_dict
def load_objects(self, is_bedgraph, verbose=False):
    """Load files and initialize objects.

    Every experiment of type "regions" or "genes" is read from its file
    (given as an absolute path) and stored in ``self.objectsDict`` under its
    name. Other types are skipped.

    *Keyword arguments:*

        - is_bedgraph -- Whether region files are read with ``read_bedgraph`` instead of ``read_bed``.
        - verbose -- Verbose output on standard error (default = False).
    """
    for idx, kind in enumerate(self.types):
        if verbose:
            print("Loading file ", self.files[self.names[idx]], file=sys.stderr)
            if kind not in ("regions", "genes"):
                print("Cannot load objects", file=sys.stderr)

        if kind == "regions":
            label = self.names[idx]
            regions = GenomicRegionSet(label)
            # Pick the reader first, then hand it the absolute path.
            reader = regions.read_bedgraph if is_bedgraph else regions.read_bed
            reader(os.path.abspath(self.files[label]))
            self.objectsDict[label] = regions

        elif kind == "genes":
            label = self.names[idx]
            genes = GeneSet(label)
            # Here change the relative path into absolute path
            genes.read(os.path.abspath(self.files[label]))
            self.objectsDict[label] = genes
def get_exons(self, start_site=False, end_site=False, gene_set=None, merge=True):
    """Return the exon regions of the annotation, named by transcript.

    The NAME field of each region is set to the transcript ID of the exon.

    *Keyword arguments:*

        - start_site -- Relocate every region with ``relocate_regions("leftend", ...)``.
        - end_site -- Relocate every region with ``relocate_regions("rightend", ...)``; ignored when start_site is set.
        - gene_set -- A set of genes to narrow the search.
        - merge -- Merge the resulting set before returning it.

    *Return:*

        - result_grs -- A GenomicRegionSet containing the exons.
        - unmapped_gene_list -- The second value returned by ``self.fix_gene_names``; only returned when gene_set is given.
    """
    # Fetching gene names and building the exon query
    unmapped_gene_list = None
    if gene_set:
        mapped_gene_list, unmapped_gene_list = self.fix_gene_names(gene_set)
        query_dictionary = {self.GeneField.FEATURE_TYPE: "exon",
                            self.GeneField.GENE_ID: mapped_gene_list}
    else:
        query_dictionary = {self.GeneField.FEATURE_TYPE: "exon"}
    exon_annotation = self.get(query_dictionary)

    # Creating GenomicRegionSet
    result_grs = GenomicRegionSet("exon")
    for entry in exon_annotation.gene_list:
        exon = entry[self.GeneField.GENOMIC_REGION]
        exon.name = entry[self.GeneField.TRANSCRIPT_ID]
        result_grs.add(exon)

    # start_site takes precedence over end_site.
    if start_site:
        result_grs.relocate_regions("leftend", left_length=1, right_length=1)
    elif end_site:
        result_grs.relocate_regions("rightend", left_length=1, right_length=1)

    if merge:
        result_grs.merge()

    return (result_grs, unmapped_gene_list) if gene_set else result_grs
def test_get_genome_data(self):
    """Check how many regions get_genome_data loads for hg19."""
    # hg19
    without_chrom_m = GenomicRegionSet("hg19")
    without_chrom_m.get_genome_data(organism="hg19")
    self.assertEqual(len(without_chrom_m), 23)

    # hg19, with Mitochondria chromosome
    with_chrom_m = GenomicRegionSet("hg19")
    with_chrom_m.get_genome_data(organism="hg19", chrom_M=True)
    self.assertEqual(len(with_chrom_m), 24)
def merge_by(self, rbss, rm_duplicate=False, asgene_organism=False):
    """Merge the RNA Binding Sites by the given list of Binding sites.

    For every site in *rbss* the DNA regions of all entries whose RNA site
    overlaps it are collected. The result is stored in ``self.merged_dict``
    (an OrderedDict mapping site -> GenomicRegionSet, in the order of
    *rbss*); nothing is returned.

    *Keyword arguments:*

        - rbss -- The binding sites to group by.
        - rm_duplicate -- Remove duplicated DNA regions within each group.
        - asgene_organism -- If given, replace each group by the result of its ``gene_association(organism=...)``.
    """
    new_dict = OrderedDict()

    for rbsm in rbss:
        new_dict[rbsm] = GenomicRegionSet(rbsm.toString())

        for rd in self:
            if rbsm.overlap(rd.rna):
                new_dict[rbsm].add(rd.dna)

        if rm_duplicate:
            new_dict[rbsm].remove_duplicates()

        if asgene_organism:
            # Gene association is best effort: on failure the unannotated
            # regions are kept and a notice is printed. Only ordinary errors
            # are caught, so KeyboardInterrupt/SystemExit still propagate.
            try:
                new_dict[rbsm] = new_dict[rbsm].gene_association(organism=asgene_organism)
            except Exception:
                print("* No annotation file for mapping associated genes.")

    self.merged_dict = new_dict
def get_tts(self, gene_set=None):
    """Gets TTS (Transcription termination site) of genes.

    It returns a GenomicRegionSet with one single-base region per gene at
    the 3' end of the gene: ``[final - 1, final)`` for "+" genes and
    ``[initial, initial + 1)`` for all other orientations. The ID of each
    gene is put in the NAME field of each GenomicRegion and the set is
    merged before it is returned.

    NOTE: the GenomicRegion objects obtained from the query are modified in
    place.

    *Keyword arguments:*

        - gene_set -- A set of genes to narrow the search.

    *Return:*

        - result_grs -- A GenomicRegionSet containing TTS.
        - unmapped_gene_list -- The second value returned by ``self.fix_gene_names``; only returned when gene_set is given.
    """
    # Fetching gene names
    unmapped_gene_list = None
    if gene_set:
        mapped_gene_list, unmapped_gene_list = self.fix_gene_names(gene_set)

    # Fetching genes
    query_dictionary = {self.GeneField.FEATURE_TYPE: "gene"}
    if gene_set:
        query_dictionary[self.GeneField.GENE_ID] = mapped_gene_list
    query_annset = self.get(query_dictionary)

    # Creating GenomicRegionSet
    result_grs = GenomicRegionSet("TTS")
    for e in query_annset.gene_list:
        gr = e[self.GeneField.GENOMIC_REGION]
        # Transcription terminates at the 3' end: the highest coordinate on
        # the "+" strand, the lowest one otherwise. (The previous code picked
        # the 5' end, i.e. the transcription start site.)
        if gr.orientation == "+":
            gr.initial = gr.final - 1
        else:
            gr.final = gr.initial + 1
        gr.name = e[self.GeneField.GENE_ID]
        result_grs.add(gr)
    result_grs.merge()

    if gene_set:
        return result_grs, unmapped_gene_list
    return result_grs
def get_genes(self, gene_set=None):
    """Return the merged regions of genes as a GenomicRegionSet.

    The NAME field of each region is set to the gene ID before the set is
    merged.

    Keyword arguments:
    gene_set -- A set of genes to narrow the search.

    Return:
    result_grs -- A GenomicRegionSet containing the genes.
    unmapped_gene_list -- The second value returned by self.fix_gene_names;
                          only returned when gene_set is given.
    """
    # Fetching gene names and building the gene query
    unmapped_gene_list = None
    if gene_set:
        mapped_gene_list, unmapped_gene_list = self.fix_gene_names(gene_set)
        query_dictionary = {self.GeneField.FEATURE_TYPE: "gene",
                            self.GeneField.GENE_ID: mapped_gene_list}
    else:
        query_dictionary = {self.GeneField.FEATURE_TYPE: "gene"}
    gene_annotation = self.get(query_dictionary)

    # Creating GenomicRegionSet
    result_grs = GenomicRegionSet("genes")
    for record in gene_annotation.gene_list:
        gene_region = record[self.GeneField.GENOMIC_REGION]
        gene_region.name = record[self.GeneField.GENE_ID]
        result_grs.add(gene_region)
    result_grs.merge()

    return (result_grs, unmapped_gene_list) if gene_set else result_grs
def _write_exon_blocks(regionset, directory, genome):
    """Write one FASTA file per transcript, one record per BED block (exon)."""
    for gr in regionset:
        if not gr.name:
            print("Error: For fetching exon sequences, please define the transcript name.")
            sys.exit(1)
        with open(os.path.join(directory, gr.name + ".fa"), "w") as f:
            data = gr.data.split("\t")
            if len(data) != 7:
                print("Warning: The given regions have no block information, please try write_bed_blocks")
                continue
            n = int(data[4])
            # Trailing commas in the BED block lists yield empty items; drop them.
            blocks = [int(b) for b in data[5].split(",") if b]
            starts = [int(s) for s in data[6].split(",") if s]
            minus = gr.orientation == "-"
            records = []
            for i in range(n):
                start = gr.initial + starts[i]
                end = start + blocks[i]
                # Exons are numbered in transcription order, so the numbering
                # runs backwards along the genome on the minus strand.
                ex = "exon:" + str(n - i if minus else i + 1)
                regiontag = "_".join(["REGION", gr.chrom, str(start), str(end), gr.orientation])
                header = ">" + " ".join([gr.name, ex, regiontag])
                # NOTE(review): the fetch window is shifted by -1 relative to
                # start/end -- confirm this matches the coordinate convention
                # of the regions read from the BED file.
                seq = genome.fetch(gr.chrom, start - 1, end - 1)
                if minus:
                    seq = Seq(seq, IUPAC.unambiguous_dna).reverse_complement()
                records.append((header, seq))
            if minus:
                # Emit exon:1 first for minus-strand transcripts as well.
                records.reverse()
            for header, seq in records:
                print(header, file=f)
                print(seq, file=f)


def _write_region_groups(regionset, directory, genome):
    """Write one FASTA file per region name, one record per region."""
    from itertools import groupby

    # Regions sharing a name must be adjacent so that they end up in one file.
    regionset.sequences.sort(key=lambda g: g.name or "")
    for gr in regionset:
        if not gr.name:
            gr.name = gr.toString()
    for name, group in groupby(regionset, key=lambda g: g.name):
        with open(os.path.join(directory, name + ".fa"), "w") as f:
            for g in group:
                tag_parts = ["REGION", g.chrom, str(g.initial), str(g.final)]
                # The strand is appended only when the region defines one.
                if g.orientation is not None:
                    tag_parts.append(g.orientation)
                print(">" + " ".join([g.name, "_".join(tag_parts)]), file=f)
                print(genome.fetch(g.chrom, g.initial, g.final), file=f)


def load_exon_sequence(bed, directory, genome_path):
    """Load the exon sequence from the transcripts.

    Input BED format should contain:
        blockCount - The number of blocks (exons) in the BED line.
        blockSizes - A comma-separated list of the block sizes.
        blockStarts - A comma-separated list of block starts.
        see details: http://genome.ucsc.edu/FAQ/FAQformat#format1

    Output:
        Each FASTA file represents a transcript and contains all the exons
        within the file. Files are written to ``directory`` (created when
        missing) and named ``<region name>.fa``.

    When the first region carries no block information (its data does not
    split into 7 tab-separated fields), the regions are instead grouped by
    name and each region is written as one record without exon numbering.

    Exits the program (status 1) if block information is present but a region
    has no name.
    """
    regionset = GenomicRegionSet("bed")
    regionset.read_bed(bed)
    regionset.sort()

    # Block information is detected from the first region only; an empty set
    # or a region without data counts as "no block information".
    try:
        blockinfor = len(regionset.sequences[0].data.split("\t")) == 7
    except (IndexError, AttributeError):
        blockinfor = False

    if not os.path.exists(directory):
        os.makedirs(directory)

    genome = pysam.Fastafile(genome_path)
    try:
        if blockinfor:
            _write_exon_blocks(regionset, directory, genome)
        else:
            _write_region_groups(regionset, directory, genome)
    finally:
        genome.close()
# Script body: for every region set of the experimental matrix, associate its
# peaks with genes and write the averaged gene expression to a text file.
# `args`, `options`, `experimental_matrix_file` and `gene_exp` are defined
# before this point (presumably by the command-line parsing -- TODO confirm).
annotation_path = args[2]
outputdir = args[3]

# Hard-coded example inputs, kept commented out (they look like leftovers from
# local testing).
# experimental_matrix_file = "/home/manuel/workspace/cluster_p/THOR/exp/exp23_macs2_payton/1"
# gene_exp = "/home/manuel/workspace/cluster_p/allhoff/project_THOR/data/payton/gene_expression/CCmean.data"
# annotation_path = "/home/manuel/workspace/cluster_h/rgtdata/hg19/"
# outputdir = "/home/manuel/test/"

# Load the experimental matrix and fetch its region sets.
exps = ExperimentalMatrix()
exps.read(experimental_matrix_file)
regionsets = exps.get_regionsets()

# Annotation files are looked up inside the annotation directory.
genome_file = annotation_path + "/chrom.sizes"
gene_file = annotation_path + "/association_file.bed"

# Gene expression values.
genes = GeneSet("Expression")
genes.read_expression(gene_exp)

for region in regionsets:
    # A fresh, unnamed set is used only as the receiver of the association call.
    bedNew = GenomicRegionSet("")
    [degenes, de_peak_genes, mappedGenes, totalPeaks, regionsToGenes] \
        = bedNew.filter_by_gene_association_old(region.fileName, genes.genes, gene_file, genome_file,
                                                threshDist=options.dist)

    [ct, labels] = averageExpression(region, genes, regionsToGenes)
    # The output file is named after the input file, without directory and extension.
    fileName = path.splitext(path.basename(region.fileName))[0]
    output(genes.cond, labels, ct, path.join(outputdir, fileName + ".txt"))
class TestGenomicRegionSet(unittest.TestCase):
    """Unit tests for the set operations of GenomicRegionSet.

    Every scenario builds ``self.setA`` / ``self.setB`` through
    :meth:`region_sets`, runs one operation and compares the resulting
    regions with the expected coordinates through :meth:`assert_regions`.
    """

    @staticmethod
    def _make_set(rows):
        """Build a GenomicRegionSet from ``[chrom, initial, final]`` rows."""
        region_set = GenomicRegionSet('for Unit Test')
        for row in rows:
            region_set.add(GenomicRegion(chrom=row[0], initial=row[1], final=row[2]))
        return region_set

    def region_sets(self, listA, listB):
        """Setting two GenomicRegionSets as self.setA and self.setB for each case test."""
        self.setA = self._make_set(listA)
        self.setB = self._make_set(listB)

    def assert_regions(self, result, expected, count=None, msg=None):
        """Check the size of ``result`` and the coordinates of its leading regions.

        ``expected`` holds ``(initial, final)`` or ``(initial, final, chrom)``
        tuples for ``result[0]``, ``result[1]``, ...  ``count`` is the expected
        ``len(result)``; it defaults to ``len(expected)`` and is given
        explicitly when only the first regions are inspected.
        """
        self.assertEqual(len(result), len(expected) if count is None else count, msg)
        for index, spec in enumerate(expected):
            self.assertEqual(result[index].initial, spec[0], msg)
            self.assertEqual(result[index].final, spec[1], msg)
            if len(spec) > 2:
                self.assertEqual(result[index].chrom, spec[2], msg)

    def assert_no_intersection(self, msg=None):
        """Check that setA and setB do not intersect in any overlap mode."""
        for kwargs in ({}, {'mode': OverlapType.ORIGINAL}, {'mode': OverlapType.COMP_INCL}):
            result = self.setA.intersect(self.setB, **kwargs)
            self.assertEqual(len(result), 0, msg)

    def test_extend(self):
        # Two empty sets: nothing to extend.
        self.region_sets([], [])
        self.setA.extend(100, 100)
        self.assertEqual(len(self.setA.sequences), 0)

        # (label, regions of A, extend() arguments, extend() keywords, expected)
        cases = [
            ('One region',
             [['chr1', 5, 10]], (4, 4), {},
             [(1, 14)]),
            ('Many region',
             [['chr1', 5, 10], ['chr1', 15, 20], ['chr1', 40, 50], ['chr1', 65, 75]], (5, 5), {},
             [(0, 15), (10, 25), (35, 55), (60, 80)]),
            ('Many region in different chromosome',
             [['chr1', 5, 10], ['chr2', 15, 20], ['chr3', 40, 50], ['chr4', 65, 75]], (5, 5), {},
             [(0, 15, 'chr1'), (10, 25, 'chr2'), (35, 55, 'chr3'), (60, 80, 'chr4')]),
            ('One region, extension given as percentage',
             [['chr1', 100, 200]], (10, 10), {'percentage': True},
             [(90, 210)]),
        ]
        for label, rows, args, kwargs, expected in cases:
            self.region_sets(rows, [])
            result = self.setA
            result.extend(*args, **kwargs)
            self.assert_regions(result, expected, msg=label)

    def test_sort(self):
        self.region_sets([['chr1', 15, 20], ['chr1', 40, 50], ['chr1', 65, 75], ['chr1', 5, 10]],
                         [])
        self.setA.sort()
        self.assert_regions(self.setA, [(5, 10), (15, 20), (40, 50), (65, 75)])

    def test_intersect(self):
        original = OverlapType.ORIGINAL
        comp_incl = OverlapType.COMP_INCL

        # Scenarios that yield no region in any overlap mode.
        disjoint_cases = [
            ('Two empty sets', [], []),
            ('One empty set (B empty)', [['chr1', 5, 10]], []),
            ('One empty set (A empty)', [], [['chr1', 5, 10]]),
            ('No overlapping',
             [['chr1', 1, 5], ['chr1', 11, 20], ['chr1', 33, 38]],
             [['chr1', 7, 9], ['chr1', 20, 25], ['chr1', 26, 31]]),
            ('End-to-end attach',
             [['chr1', 1, 5], ['chr1', 11, 20]],
             [['chr1', 5, 11]]),
            ('No length attach',
             [['chr1', 2, 2], ['chr1', 20, 20]],
             [['chr1', 5, 5], ['chr1', 20, 20]]),
            ('Different chromosomes', [['chr1', 1, 10]], [['chr2', 1, 10]]),
        ]
        for label, rows_a, rows_b in disjoint_cases:
            self.region_sets(rows_a, rows_b)
            self.assert_no_intersection(msg=label)

        # Perfect overlapping: A and B are identical; only the first region is
        # inspected in each mode.
        same = [['chr1', 1, 10], ['chr1', 500, 550], ['chr1', 600, 650],
                ['chr1', 700, 750], ['chr1', 725, 800]]
        self.region_sets(same, same)
        result = self.setA.intersect(self.setB, mode=OverlapType.OVERLAP, rm_duplicates=True)
        self.assert_regions(result, [(1, 10)], count=4)
        result = self.setA.intersect(self.setB, mode=original)
        self.assert_regions(result, [(1, 10)], count=5)
        result = self.setA.intersect(self.setB, mode=comp_incl)
        self.assert_regions(result, [(1, 10)], count=5)

        # One overlapping region: A's tail overlaps B's head.
        self.region_sets([['chr1', 1, 10]],
                         [['chr1', 7, 20]])
        result = self.setA.intersect(self.setB)
        self.assert_regions(result, [(7, 10)])
        result = self.setA.intersect(self.setB, mode=original)
        self.assert_regions(result, [(1, 10)])
        result = self.setA.intersect(self.setB, mode=comp_incl)
        self.assertEqual(len(result), 0)

        # Two simple overlapping regions: one B region bridges two A regions.
        self.region_sets([['chr1', 1, 10], ['chr1', 26, 35]],
                         [['chr1', 7, 30]])
        result = self.setA.intersect(self.setB)
        self.assert_regions(result, [(7, 10), (26, 30)])
        result = self.setA.intersect(self.setB, mode=original)
        self.assert_regions(result, [(1, 10), (26, 35)])
        result = self.setA.intersect(self.setB, mode=comp_incl)
        self.assertEqual(len(result), 0)

        # Two separately overlapping regions.
        self.region_sets([['chr1', 1, 10], ['chr1', 26, 35]],
                         [['chr1', 7, 15], ['chr1', 30, 40]])
        result = self.setA.intersect(self.setB)
        self.assert_regions(result, [(7, 10), (30, 35)])
        result = self.setA.intersect(self.setB, mode=original)
        self.assert_regions(result, [(1, 10), (26, 35)])
        result = self.setA.intersect(self.setB, mode=comp_incl)
        self.assertEqual(len(result), 0)

        # Many various overlapping (mixed).
        self.region_sets([['chr1', 3, 30], ['chr1', 50, 60], ['chr1', 70, 85]],
                         [['chr1', 1, 5], ['chr1', 10, 19], ['chr1', 27, 35], ['chr1', 55, 75]])
        result = self.setA.intersect(self.setB, mode=OverlapType.OVERLAP)
        self.assert_regions(result, [(3, 5), (10, 19), (27, 30), (55, 60), (70, 75)])
        result = self.setA.intersect(self.setB, mode=original)
        self.assert_regions(result, [(3, 30), (50, 60), (70, 85)])
        result = self.setA.intersect(self.setB, mode=comp_incl)
        self.assertEqual(len(result), 0)

        # Completely included overlapping: one large A region covers
        # (part of) several B regions.
        self.region_sets([['chr1', 1, 50]],
                         [['chr1', 1, 5], ['chr1', 10, 19], ['chr1', 45, 60]])
        result = self.setA.intersect(self.setB)
        self.assert_regions(result, [(1, 5), (10, 19), (45, 50)])
        result = self.setA.intersect(self.setB, mode=original)
        self.assert_regions(result, [(1, 50)])
        result = self.setA.intersect(self.setB, mode=comp_incl)
        self.assertEqual(len(result), 0)

        # The reverse: several A regions against one large B region.
        self.region_sets([['chr1', 1, 5], ['chr1', 10, 19], ['chr1', 45, 60]],
                         [['chr1', 1, 50]])
        result = self.setA.intersect(self.setB)
        self.assert_regions(result, [(1, 5), (10, 19), (45, 50)])
        result = self.setA.intersect(self.setB, mode=original)
        self.assert_regions(result, [(1, 5), (10, 19), (45, 60)])
        result = self.setA.intersect(self.setB, mode=comp_incl)
        self.assert_regions(result, [(1, 5), (10, 19)])

        # A contains nested regions; only the last A region lies completely
        # inside a B region.
        self.region_sets([['chr1', 1, 50], ['chr1', 20, 40], ['chr1', 70, 80]],
                         [['chr1', 25, 45], ['chr1', 65, 95]])
        result = self.setA.intersect(self.setB)
        self.assert_regions(result, [(25, 45), (70, 80)])
        result = self.setA.intersect(self.setB, mode=original)
        self.assert_regions(result, [(1, 50), (20, 40), (70, 80)])
        result = self.setA.intersect(self.setB, mode=comp_incl)
        self.assert_regions(result, [(70, 80)])

    def test_closest(self):
        # Two empty sets.
        self.region_sets([], [])
        result = self.setA.closest(self.setB)
        self.assertEqual(len(result), 0)

        # The cases below were disabled in the original test and are kept for
        # reference only (all regions on chr1 unless stated otherwise):
        #
        #   A                          | B                              | expectation
        #   ---------------------------+--------------------------------+-----------------------------
        #   [5,10]                     | none                           | len == 0
        #   none                       | [5,10]                         | len == 0
        #   [1,10],[6,15]              | [6,10]                         | len == 2
        #   [6,10]                     | [1,10],[6,15]                  | len == 1
        #   [1,5],[11,20],[33,38]      | [7,9],[20,25],[26,31]          | len == 3
        #   [1,5],[11,20]              | [5,11]                         | len == 2
        #   [1,10]                     | [1,10]                         | len == 1, result[0] == 1-10
        #   [1,10]                     | [7,20]                         | result[0] == 7-20
        #   [1,10],[26,35]             | [7,30]                         | result[0] == 7-30
        #   [1,10],[26,35]             | [7,15],[30,40]                 | len == 2
        #   [3,30],[50,60],[70,85]     | [1,5],[10,19],[27,35],[55,75]  | len == 4
        #   chr1 [1,10]                | chr2 [1,10]                    | len == 0
        #   [1,50]                     | [1,5],[10,19],[45,60]          | len == 3
        #   [1,5],[10,19],[45,60]      | [1,50]                         | result == False
        #   [1,5],[27,45],[85,95]      | [15,20],[55,65]                | len == 1, result[0] == 15-20

    def test_remove_duplicates(self):
        # (regions of A, expected regions after remove_duplicates())
        cases = [
            # One exact duplicate is dropped.
            ([['chr1', 1, 10], ['chr1', 1, 10], ['chr1', 15, 25]],
             [(1, 10), (15, 25)]),
            # Overlapping but different regions are all kept.
            ([['chr1', 1, 10], ['chr1', 1, 15], ['chr1', 20, 25]],
             [(1, 10), (1, 15), (20, 25)]),
            # Duplicates at both ends of the set.
            ([['chr1', 1, 10], ['chr1', 1, 10], ['chr1', 15, 25], ['chr1', 30, 35],
              ['chr1', 40, 45], ['chr1', 40, 45]],
             [(1, 10), (15, 25), (30, 35), (40, 45)]),
        ]
        for rows, expected in cases:
            self.region_sets(rows, [])
            self.setA.remove_duplicates()
            self.assert_regions(self.setA, expected)

    def test_window(self):
        # window = 100: only one base of B overlaps the extended A.
        self.region_sets([['chr1', 200, 300]],
                         [['chr1', 1, 101], ['chr1', 499, 550]])
        result = self.setA.window(self.setB, adding_length=100)
        self.assert_regions(result, [(100, 101)])

        # window = 200: the left-hand B region is covered, the right-hand one
        # is overlapped by a single base.
        # GenomicRegion.extend will choose 1 rather than 0
        self.region_sets([['chr1', 200, 300]],
                         [['chr1', 1, 101], ['chr1', 499, 550]])
        result = self.setA.window(self.setB, adding_length=200)
        self.assert_regions(result, [(1, 101), (499, 500)])

        # window = 1000 (default).
        self.region_sets([['chr1', 3000, 3500], ['chr1', 4000, 4500]],
                         [['chr1', 1500, 2500], ['chr1', 5000, 5500]])
        result = self.setA.window(self.setB)
        self.assert_regions(result, [(2000, 2500), (5000, 5500)])

        # window = 2000 covers both B regions; window = 100 reaches none.
        self.region_sets([['chr1', 3000, 3500], ['chr1', 4000, 4500]],
                         [['chr1', 1500, 2500], ['chr1', 5000, 5500]])
        result = self.setA.window(self.setB, adding_length=2000)
        self.assert_regions(result, [(1500, 2500), (5000, 5500)])
        result = self.setA.window(self.setB, adding_length=100)
        self.assertEqual(len(result), 0)

    def test_subtract(self):
        # (regions of A, regions of B, expected regions of A - B)
        cases = [
            # A empty.
            ([], [['chr1', 6, 15]], []),
            # B empty.
            ([['chr1', 6, 15]], [], [(6, 15)]),
            # B cuts the tail of A.
            ([['chr1', 1, 10]], [['chr1', 6, 15]], [(1, 6)]),
            # B cuts the head of A.
            ([['chr1', 6, 15]], [['chr1', 1, 10]], [(10, 15)]),
            # B covers A completely.
            ([['chr1', 6, 10]], [['chr1', 1, 15]], []),
            # B splits A in two.
            ([['chr1', 1, 15]], [['chr1', 6, 10]], [(1, 6), (10, 15)]),
            # A and B identical.
            ([['chr1', 6, 15]], [['chr1', 6, 15]], []),
            # Only the first A region is affected (tail cut).
            ([['chr1', 5, 30], ['chr1', 70, 85]],
             [['chr1', 20, 50], ['chr1', 100, 110]],
             [(5, 20), (70, 85)]),
            # Only the first A region is affected (head cut).
            ([['chr1', 20, 30], ['chr1', 35, 55]],
             [['chr1', 10, 23], ['chr1', 100, 110]],
             [(23, 30), (35, 55)]),
            # Two chromosomes.
            ([['chr1', 0, 30000], ['chr2', 0, 35000]],
             [['chr1', 20000, 23000], ['chr2', 31000, 35000]],
             [(0, 20000), (23000, 30000), (0, 31000)]),
        ]
        for rows_a, rows_b, expected in cases:
            self.region_sets(rows_a, rows_b)
            result = self.setA.subtract(self.setB)
            self.assert_regions(result, expected)

        holes = [[10, 15], [30, 70], [120, 140], [200, 240]]

        # One long A region with four holes: five pieces remain.
        self.region_sets([['chr1', 5, 1000]],
                         [['chr1'] + hole for hole in holes])
        result = self.setA.subtract(self.setB)
        self.assertEqual(len(result), 5)

        # Several overlapping A regions against the same four holes.
        self.region_sets([['chr1', 5, 100], ['chr1', 20, 40], ['chr1', 60, 80],
                          ['chr1', 95, 150], ['chr1', 180, 220]],
                         [['chr1'] + hole for hole in holes])
        result = self.setA.subtract(self.setB)
        self.assertEqual(len(result), 8)
        self.assertEqual(result[0].initial, 5)

        # The single-region scenario repeated on three chromosomes.
        chroms = ['chr1', 'chr2', 'chr4']
        self.region_sets([[chrom, 5, 1000] for chrom in chroms],
                         [[chrom] + hole for chrom in chroms for hole in holes])
        result = self.setA.subtract(self.setB)
        self.assertEqual(len(result), 15)

    def test_merge(self):
        # (regions of A, expected regions after merge())
        cases = [
            # Empty set.
            ([], []),
            # Disjoint regions stay untouched.
            ([['chr1', 1, 10], ['chr1', 15, 25]], [(1, 10), (15, 25)]),
            # A region nested in another one.
            ([['chr1', 1, 30], ['chr1', 11, 20], ['chr1', 40, 50]], [(1, 30), (40, 50)]),
            # Partially overlapping regions.
            ([['chr1', 1, 30], ['chr1', 20, 40], ['chr1', 50, 60]], [(1, 40), (50, 60)]),
            # Identical regions.
            ([['chr1', 1, 30], ['chr1', 1, 30]], [(1, 30)]),
        ]
        for rows, expected in cases:
            self.region_sets(rows, [])
            self.setA.merge()
            self.assert_regions(self.setA, expected)

    def test_cluster(self):
        # Empty set.
        self.region_sets([], [])
        result = self.setA.cluster(10)
        self.assertEqual(len(result), 0)

        # A single region.
        self.region_sets([['chr1', 1, 10]], [])
        result = self.setA.cluster(10)
        self.assert_regions(result, [(1, 10)])

        # Two adjacent regions.
        self.region_sets([['chr1', 1, 10], ['chr1', 10, 20]], [])
        result = self.setA.cluster(10)
        self.assert_regions(result, [(1, 20)])

        # Two regions 5 bp apart: joined only when the distance exceeds the gap.
        self.region_sets([['chr1', 1, 10], ['chr1', 15, 25]], [])
        result = self.setA.cluster(1)
        self.assert_regions(result, [(1, 10), (15, 25)])
        result = self.setA.cluster(5)
        self.assert_regions(result, [(1, 10), (15, 25)])
        result = self.setA.cluster(6)
        self.assert_regions(result, [(1, 25)])

        # Growing gaps: each larger distance joins one more region.
        self.region_sets([['chr1', 1, 10], ['chr1', 15, 25], ['chr1', 35, 45],
                          ['chr1', 60, 70], ['chr1', 90, 100]], [])
        for distance, clusters in [(6, 4), (11, 3), (16, 2), (21, 1), (26, 1)]:
            result = self.setA.cluster(distance)
            self.assertEqual(len(result), clusters)

    def test_flank(self):
        # One region: a flank on each side.
        self.region_sets([['chr1', 60, 75]], [])
        result = self.setA.flank(10)
        self.assert_regions(result, [(50, 60), (75, 85)])

        # Two regions whose inner flanks coincide.
        self.region_sets([['chr1', 60, 75], ['chr1', 90, 100]], [])
        result = self.setA.flank(15)
        self.assert_regions(result, [(45, 60), (75, 90), (75, 90), (100, 115)])

    def test_jaccard(self):
        # A covers 8 + 10 + 4 bp, B covers 10 + 10 bp, they share 5 + 4 bp:
        # similarity = (5 + 4) / ((8 + 10 + 4) + (10 + 10) - (5 + 4)) = 9/33
        self.region_sets([['chr1', 50, 58], ['chr1', 70, 80], ['chr1', 90, 94]],
                         [['chr1', 45, 55], ['chr1', 76, 86]])
        result = self.setA.jaccard(self.setB)
        # Float literal keeps the expectation a true division on every Python
        # version; approximate comparison avoids rounding noise.
        self.assertAlmostEqual(result, 9.0 / 33)

    def test_get_genome_data(self):
        # hg19
        result = GenomicRegionSet("hg19")
        result.get_genome_data(organism="hg19")
        self.assertEqual(len(result), 23)

        # hg19, with Mitochondria chromosome
        result = GenomicRegionSet("hg19")
        result.get_genome_data(organism="hg19", chrom_M=True)
        self.assertEqual(len(result), 24)

    def test_random_regions(self):
        # Smoke test: the output is random, so each parameter combination is
        # only required to run and to produce a sortable set.
        large = [['chr1', 0, 10000], ['chr2', 0, 20000], ['chrX', 0, 30000]]
        small = [['chr1', 0, 1000], ['chr2', 0, 2000], ['chrX', 0, 3000]]
        cases = [
            (large, dict(total_size=100, overlap_result=False, overlap_input=False)),
            (large, dict(total_size=100, overlap_result=True, overlap_input=False)),
            (large, dict(total_size=100, overlap_result=False, overlap_input=True)),
            (large, dict(total_size=100, overlap_result=True, overlap_input=True)),
            (small, dict(multiply_factor=100, overlap_result=False, overlap_input=False)),
            (small, dict(multiply_factor=100, overlap_result=False, overlap_input=False,
                         chrom_M=True)),
        ]
        for rows, kwargs in cases:
            self.region_sets(rows, [])
            result = self.setA.random_regions(organism="mm9", **kwargs)
            result.sort()
if __name__ == "__main__":
    # Command-line entry point: build a bias table for the regions of a BED
    # file and write it out.
    import sys
    from rgt.GenomicRegionSet import *

    # Six positional arguments are required; fail with a usage message instead
    # of an IndexError traceback when some are missing.
    if len(sys.argv) < 7:
        sys.exit("usage: " + sys.argv[0] + " <bam_file> <fasta_file> <bed_file> <kmer> <shift> <out>")

    bam_file = sys.argv[1]
    fasta_file = sys.argv[2]
    bed_file = sys.argv[3]
    kmer = int(sys.argv[4])
    shift = int(sys.argv[5])
    out = sys.argv[6]

    regions = GenomicRegionSet("regions")
    regions.read_bed(bed_file)
    table = BiasTable(regions=regions, dnase_file_name=bam_file, genome_file_name=fasta_file,
                      k_nb=kmer, shift=shift)
    table.write_tables(out)
def mode_4(exp_matrix, thresh, type_file, geneexp_file):
    """Write, for every region set, the peak scores aggregated per gene.

    For each region set of ``exp_matrix`` a file ``region_<name>.data`` is
    written in the current directory.  Each line holds, tab-separated: the
    gene, its expression values from ``geneexp_file``, the average score of
    the peaks associated with the gene, that average multiplied by the
    number of peaks, and the list of associated peaks ("_" when none).

    Arguments:
    exp_matrix -- object whose get_regionsets() yields the region sets.
    thresh -- distance threshold forwarded as ``threshDist`` to the
              gene association.
    type_file -- when equal to "ODIN", the score of a region is the last
                 ';'-separated item of the last tab-separated field of
                 ``region.data``; otherwise ``region.data`` itself.
    geneexp_file -- gene expression file read with GeneSet.read_expression.

    NOTE: ``gene_file`` and ``genome_file`` are not parameters; they are read
    from the enclosing module scope, as in the original code.
    """
    gene_set = GeneSet("")
    gene_set.read_expression(geneexp_file)

    # Remember the score of every peak, keyed by "chrom:initial-final"
    # (the same key format the gene/peak mapping uses below).
    score = {}
    for regions in exp_matrix.get_regionsets():
        for region in regions:
            key = region.chrom + ':' + str(region.initial) + '-' + str(region.final)
            if type_file == "ODIN":
                score[key] = region.data.split("\t")[-1].split(";")[-1]
            else:
                score[key] = region.data

    for region in exp_matrix.get_regionsets():
        # The context manager closes the file even if the association or the
        # aggregation below raises.
        with open("region_" + str(region.name) + ".data", 'w') as f:
            region_set = GenomicRegionSet("")
            _, _, mappedGenes, _, gene_peaks_mapping = \
                region_set.filter_by_gene_association_old(
                    region.fileName, gene_set.genes, gene_file, genome_file,
                    threshDist=thresh)
            print(mappedGenes)

            # Reverse the mapping peak -> genes into gene -> peaks and collect
            # the scores of all peaks assigned to each gene.
            avg_score = {}
            genes = {}
            print(region)
            for peak, gene_list in gene_peaks_mapping.items():
                for gen in gene_list:
                    if not gen:
                        # An unassigned peak is reported as an empty name.
                        continue
                    genes.setdefault(gen, set()).add(peak)
                    avg_score.setdefault(gen, []).append(score[peak])
            print(avg_score)

            for gen in gene_set.genes:
                try:
                    scores = [float(x) for x in avg_score[gen]]
                except (KeyError, ValueError, TypeError):
                    # Gene without peaks, or with a non-numeric score.
                    avg = 0.0
                    siz = 0
                    peaks = "_"
                else:
                    avg = sum(scores) / float(len(scores))
                    peaks = ", ".join(str(t) for t in genes[gen])
                    siz = avg * len(scores)

                try:
                    values = gene_set.values[gen.upper()]
                except (KeyError, AttributeError):
                    # No expression values known for this gene: skip the line,
                    # as the original best-effort code did.
                    continue
                print(gen, "\t".join([str(t) for t in values]), avg, siz, peaks,
                      sep='\t', file=f)
def _write_transcript_regions(regions, directory, transcript_id, genome):
    """Write the regions of one transcript to <directory>/<transcript_id>.fa."""
    with open(os.path.join(directory, transcript_id + ".fa"), "w") as f:
        for g in regions:
            fields = ["REGION", g.chrom, str(g.initial), str(g.final)]
            # The orientation is optional: fall back to the tag without it
            # when the region carries none (missing attribute or None).
            try:
                regiontag = "_".join(fields + [g.orientation])
            except (AttributeError, TypeError):
                regiontag = "_".join(fields)
            print(">" + " ".join([g.name, regiontag]), file=f)
            print(genome.fetch(g.chrom, g.initial, g.final), file=f)


def _write_blocked_transcript(gr, directory, genome):
    """Write the exons (BED blocks) of one transcript to <directory>/<name>.fa."""
    data = gr.data.split("\t")
    with open(os.path.join(directory, gr.name + ".fa"), "w") as f:
        if len(data) != 7:
            print("Warning: The given regions have no block information, please try write_bed_blocks")
            return

        n = int(data[4])
        # Trailing commas in the BED lists yield empty items; drop them.
        blocks = [int(b) for b in data[5].split(",") if b]
        starts = [int(s) for s in data[6].split(",") if s]
        minus = gr.orientation == "-"

        records = []
        for i in range(n):
            start = gr.initial + starts[i]
            end = start + blocks[i]
            # Exons are numbered along the transcript, i.e. backwards on "-".
            ex = "exon:" + str(n - i if minus else i + 1)
            header = ">" + " ".join([gr.name, ex,
                                     "_".join(["REGION", gr.chrom, str(start),
                                               str(end), gr.orientation])])
            # NOTE(review): this branch fetches (start-1, end-1) while the
            # branch without block information fetches (initial, final)
            # unshifted -- confirm which coordinate convention is intended.
            seq = genome.fetch(gr.chrom, start - 1, end - 1)
            if minus:
                seq = Seq(seq, IUPAC.unambiguous_dna).reverse_complement()
            records.append((header, seq))

        if minus:
            records.reverse()
        for header, seq in records:
            print(header, file=f)
            print(seq, file=f)


def load_exon_sequence(bed, directory, genome_path):
    """Load the exon sequences of the transcripts and write them as FASTA.

    Input BED format should contain:
        blockCount - The number of blocks (exons) in the BED line.
        blockSizes - A comma-separated list of the block sizes.
        blockStarts - A comma-separated list of block starts.
        see details: http://genome.ucsc.edu/FAQ/FAQformat#format1

    Arguments:
    bed -- path of the BED file with the transcripts.
    directory -- output directory; it is created when missing.
    genome_path -- path of the genome FASTA file opened with pysam.

    Output:
        Each FASTA file represents a transcript and contains all the exons
        within the file.  When the BED file carries no block information,
        consecutive regions sharing the same name (after sorting by name) are
        written together as one transcript.

    Exits through sys.exit() when block information is present but a region
    has no name.  Nothing is written for an empty BED file.
    """
    regionset = GenomicRegionSet("bed")
    regionset.read(bed)
    regionset.sort()

    genome = pysam.Fastafile(genome_path)

    # Block information is present when the extra BED fields of the first
    # region split into exactly 7 columns.  Any other shape (no region, no
    # data, another column count) is handled as "no block information"; the
    # original left its flags unbound for a non-7 column count.
    try:
        blockinfor = len(regionset.sequences[0].data.split("\t")) == 7
    except (IndexError, AttributeError):
        blockinfor = False

    if not regionset.sequences:
        return

    if not os.path.exists(directory):
        os.makedirs(directory)

    if blockinfor:
        for gr in regionset:
            if not gr.name:
                print("Error: For fetching exon sequences, please define the transcript name.")
                sys.exit()
            _write_blocked_transcript(gr, directory, genome)
        return

    # No block information: group the regions by transcript name.
    regionset.sequences.sort(key=lambda g: g.name)
    current_id = None
    current = []
    for gr in regionset:
        if not gr.name:
            gr.name = gr.toString()
        if current and gr.name != current_id:
            # A new transcript starts: flush the previous one.  Each region is
            # tagged with its own orientation (the original used the
            # orientation of the region that triggered the flush).
            _write_transcript_regions(current, directory, current_id, genome)
            current = []
        current_id = gr.name
        current.append(gr)
    # Last transcript
    _write_transcript_regions(current, directory, current_id, genome)
class TestGenomicRegionSet(unittest.TestCase): def region_sets(self, listA, listB): """ Setting two GenomicRegionSets as self.setA and self.setB for each case test. """ self.setA = GenomicRegionSet('for Unit Test') for i in range(len(listA)): self.setA.add( GenomicRegion(chrom=listA[i][0], initial=listA[i][1], final=listA[i][2])) self.setB = GenomicRegionSet('for Unit Test') for i in range(len(listB)): self.setB.add( GenomicRegion(chrom=listB[i][0], initial=listB[i][1], final=listB[i][2])) def test_extend(self): """ Two empty sets A : none R : none """ self.region_sets([], []) self.setA.extend(100, 100) self.assertEqual(len(self.setA.sequences), 0) """ One region A : ----- R : --------- """ self.region_sets([['chr1', 5, 10]], []) result = self.setA result.extend(4, 4) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 14) """ Many region A : ----- ------ ----- ----- R : --------=--------- ------------------ """ self.region_sets([['chr1', 5, 10], ['chr1', 15, 20], ['chr1', 40, 50], ['chr1', 65, 75]], []) result = self.setA result.extend(5, 5) self.assertEqual(len(result), 4) self.assertEqual(result[0].initial, 0) self.assertEqual(result[0].final, 15) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 25) self.assertEqual(result[2].initial, 35) self.assertEqual(result[2].final, 55) self.assertEqual(result[3].initial, 60) self.assertEqual(result[3].final, 80) """ Many region in different chromosome A : ----- ------ ----- ----- R : none """ self.region_sets([['chr1', 5, 10], ['chr2', 15, 20], ['chr3', 40, 50], ['chr4', 65, 75]], []) result = self.setA result.extend(5, 5) self.assertEqual(len(result), 4) self.assertEqual(result[0].initial, 0) self.assertEqual(result[0].final, 15) self.assertEqual(result[0].chrom, 'chr1') self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 25) self.assertEqual(result[1].chrom, 'chr2') self.assertEqual(result[2].initial, 35) 
self.assertEqual(result[2].final, 55) self.assertEqual(result[2].chrom, 'chr3') self.assertEqual(result[3].initial, 60) self.assertEqual(result[3].final, 80) self.assertEqual(result[3].chrom, 'chr4') """ One region A : ----- R : --------- """ self.region_sets([['chr1', 100, 200]], []) result = self.setA result.extend(10, 10, percentage=True) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 90) self.assertEqual(result[0].final, 210) def test_sort(self): self.region_sets([['chr1', 15, 20], ['chr1', 40, 50], ['chr1', 65, 75], ['chr1', 5, 10]], []) self.setA.sort() def test_intersect(self): """ Two empty sets A : none B : none R : none """ self.region_sets([], []) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ One empty set A : ----- B : none R : none """ self.region_sets([['chr1', 5, 10]], []) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ A : none B : ----- R : none """ self.region_sets([], [['chr1', 5, 10]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ No overlapping A : ------ --------- ------- B : ---- ------ ------ R : none """ self.region_sets([['chr1', 1, 5], ['chr1', 11, 20], ['chr1', 33, 38]], [['chr1', 7, 9], ['chr1', 20, 25], ['chr1', 26, 31]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, 
mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ End-to-end attach A : ------ ------ B : ------ R : none """ self.region_sets([['chr1', 1, 5], ['chr1', 11, 20]], [['chr1', 5, 11]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ No length attach A : . . B : . . R : none """ self.region_sets([['chr1', 2, 2], ['chr1', 20, 20]], [['chr1', 5, 5], ['chr1', 20, 20]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Perfect overlapping A : ------ B : ------ R : ------ """ self.region_sets( [['chr1', 1, 10], ['chr1', 500, 550], ['chr1', 600, 650], ['chr1', 700, 750], ['chr1', 725, 800]], [['chr1', 1, 10], ['chr1', 500, 550], ['chr1', 600, 650], ['chr1', 700, 750], ['chr1', 725, 800]]) result = self.setA.intersect(self.setB, mode=OverlapType.OVERLAP, rm_duplicates=True) self.assertEqual(len(result), 4) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 5) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 5) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) """ One overlapping region A : ------ B : -------- R1: -- (overlap) R2: ------ (original) R3: (comp_incl) """ self.region_sets([['chr1', 1, 10]], [['chr1', 7, 20]]) result = 
self.setA.intersect(self.setB) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 7) self.assertEqual(result[0].final, 10) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Two simple overlapping regions A : ------- -------- B : ------------- R1: --- ---- (overlap) R2: ------- -------- (original) R3: (comp_incl) """ self.region_sets([['chr1', 1, 10], ['chr1', 26, 35]], [['chr1', 7, 30]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 7) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 26) self.assertEqual(result[1].final, 30) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 26) self.assertEqual(result[1].final, 35) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Two separately overlapping regions A : ------- -------- B : ----- -------- R1: --- ---- (overlap) R2: ------- -------- (original) R3: (comp_incl) """ self.region_sets([['chr1', 1, 10], ['chr1', 26, 35]], [['chr1', 7, 15], ['chr1', 30, 40]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 7) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 30) self.assertEqual(result[1].final, 35) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 26) self.assertEqual(result[1].final, 35) result = self.setA.intersect(self.setB, 
mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Many various overlapping (mixed) A : ------------------ -------- --------- B : ---- ------- ------ ---------- R1: -- ------- -- ---- --- (overlap) R2: ------------------ -------- --------- (original) R3: (comp_incl) """ self.region_sets([['chr1', 3, 30], ['chr1', 50, 60], ['chr1', 70, 85]], [['chr1', 1, 5], ['chr1', 10, 19], ['chr1', 27, 35], ['chr1', 55, 75]]) result = self.setA.intersect(self.setB, mode=OverlapType.OVERLAP) self.assertEqual(len(result), 5) self.assertEqual(result[0].initial, 3) self.assertEqual(result[0].final, 5) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 19) self.assertEqual(result[2].initial, 27) self.assertEqual(result[2].final, 30) self.assertEqual(result[3].initial, 55) self.assertEqual(result[3].final, 60) self.assertEqual(result[4].initial, 70) self.assertEqual(result[4].final, 75) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 3) self.assertEqual(result[0].final, 30) self.assertEqual(result[1].initial, 50) self.assertEqual(result[1].final, 60) self.assertEqual(result[2].initial, 70) self.assertEqual(result[2].final, 85) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Different chromosomes A : chr1 ------- B : chr2 ------- R : none """ self.region_sets([['chr1', 1, 10]], [['chr2', 1, 10]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Completely included overlapping A : --------------------------- B : ---- ------ ----------- R1: ---- ------ ------ (overlap) R2: --------------------------- (original) R3: (comp_incl) """ self.region_sets([['chr1', 1, 50]], [['chr1', 1, 5], 
['chr1', 10, 19], ['chr1', 45, 60]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 5) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 19) self.assertEqual(result[2].initial, 45) self.assertEqual(result[2].final, 50) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 50) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ A : ---- ------ ----------- B : --------------------------- R1: ---- ------ ------ (overlap) R2: ---- ------ ----------- (original) R3: ---- ------ (comp_incl) """ self.region_sets([['chr1', 1, 5], ['chr1', 10, 19], ['chr1', 45, 60]], [['chr1', 1, 50]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 5) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 19) self.assertEqual(result[2].initial, 45) self.assertEqual(result[2].final, 50) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 5) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 19) self.assertEqual(result[2].initial, 45) self.assertEqual(result[2].final, 60) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 5) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 19) """ A : -------------- ------- ------ B : ----- ---------------- R1: ----- ------- (overlap) ---- R2: -------------- ------- (original) ------ R3: ------- (comp_incl) """ self.region_sets([['chr1', 1, 50], ['chr1', 20, 40], 
['chr1', 70, 80]], [['chr1', 25, 45], ['chr1', 65, 95]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 25) self.assertEqual(result[0].final, 45) self.assertEqual(result[1].initial, 70) self.assertEqual(result[1].final, 80) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 3) self.assertEqual(result[1].initial, 20) self.assertEqual(result[1].final, 40) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 50) self.assertEqual(result[2].initial, 70) self.assertEqual(result[2].final, 80) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 70) self.assertEqual(result[0].final, 80) def test_closest(self): """ Two empty sets A : none B : none R : none """ self.region_sets([], []) result = self.setA.closest(self.setB) self.assertEqual(len(result), 0) # """ # One empty set # A : ----- # B : none # R : none # """ # self.region_sets([['chr1',5,10]], # []) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 0) # """ # A : none # B : ----- # R : none # """ # self.region_sets([], # [['chr1',5,10]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 0) # """ # Overlapping within set # A : -----====----- # B : ---- # R : ---- # """ # self.region_sets([['chr1',1,10],['chr1',6,15]], # [['chr1',6,10]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 2) # """ # A : ---- # B : -----====----- # R : -----====----- # """ # self.region_sets([['chr1',6,10]], # [['chr1',1,10],['chr1',6,15]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 1) # """ # No overlapping # A : ------ --------- ------- # B : ---- ------ ------ # R : ------ # """ # self.region_sets([['chr1',1,5],['chr1',11,20],['chr1',33,38]], # [['chr1',7,9],['chr1',20,25],['chr1',26,31]]) # result = self.setA.closest(self.setB) # 
self.assertEqual(len(result), 3) # # self.assertEqual(result[0].initial, 20) # # self.assertEqual(result[0].final, 25) # """ # End-to-end attach # A : ------ ------ # B : ------ # R : ------ # """ # self.region_sets([['chr1',1,5],['chr1',11,20]], # [['chr1',5,11]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 2) # # self.assertEqual(result[0].initial, 5) # # self.assertEqual(result[0].final, 11) # """ # Perfect overlapping # A : ------ # B : ------ # R : ------ # """ # self.region_sets([['chr1',1,10]], # [['chr1',1,10]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 1) # self.assertEqual(result[0].initial, 1) # self.assertEqual(result[0].final, 10) # """ # One overlapping region # A : ------ # B : -------- # R : -------- # """ # self.region_sets([['chr1',1,10]], # [['chr1',7,20]]) # result = self.setA.closest(self.setB) # self.assertEqual(result[0].initial, 7) # self.assertEqual(result[0].final, 20) # """ # Two simple overlapping regions # A : ------- -------- # B : ------------- # R : ------------- # """ # self.region_sets([['chr1',1,10],['chr1',26,35]], # [['chr1',7,30]]) # result = self.setA.closest(self.setB) # self.assertEqual(result[0].initial, 7) # self.assertEqual(result[0].final, 30) # """ # Two separately overlapping regions # A : ------- -------- # B : ----- -------- # R : none # """ # self.region_sets([['chr1',1,10],['chr1',26,35]], # [['chr1',7,15],['chr1',30,40]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 2) # """ # Many various overlapping (mixed) # A : ------------------ -------- --------- # B : ---- ------- ------ ---------- # R : none # """ # self.region_sets([['chr1',3,30],['chr1',50,60],['chr1',70,85]], # [['chr1',1,5],['chr1',10,19],['chr1',27,35],['chr1',55,75]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 4) # """ # Different chromosomes # A : chr1 ------- # B : chr2 ------- # R : chr2 ------- # # """ # 
self.region_sets([['chr1',1,10]], # [['chr2',1,10]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 0) # """ # Completely included overlapping # A : --------------------------- # B : ---- ------ ----------- # R : ---- ------ ----------- # """ # self.region_sets([['chr1',1,50]], # [['chr1',1,5],['chr1',10,19],['chr1',45,60]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 3) # """ # A : ---- ------ ----------- # B : --------------------------- # R : none # """ # self.region_sets([['chr1',1,5],['chr1',10,19],['chr1',45,60]], # [['chr1',1,50]]) # result = self.setA.closest(self.setB) # self.assertEqual(result, False) # """ # A : ---- ------ --- # B : --- ----- # R : --- # """ # self.region_sets([['chr1',1,5],['chr1',27,45],['chr1',85,95]], # [['chr1',15,20],['chr1',55,65]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 1) # self.assertEqual(result[0].initial, 15) # self.assertEqual(result[0].final, 20) def test_remove_duplicates(self): """ A : ===== ----- R : ----- ----- """ self.region_sets([['chr1', 1, 10], ['chr1', 1, 10], ['chr1', 15, 25]], []) self.setA.remove_duplicates() result = self.setA self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 15) self.assertEqual(result[1].final, 25) """ A : =====--- ----- R : =====--- ----- """ self.region_sets([['chr1', 1, 10], ['chr1', 1, 15], ['chr1', 20, 25]], []) self.setA.remove_duplicates() result = self.setA self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 1) self.assertEqual(result[1].final, 15) self.assertEqual(result[2].initial, 20) self.assertEqual(result[2].final, 25) """ A : ===== ----- ------ ==== R : ----- ----- ------ ---- """ self.region_sets( [['chr1', 1, 10], ['chr1', 1, 10], ['chr1', 15, 25], ['chr1', 30, 35], ['chr1', 40, 45], ['chr1', 40, 45]], 
[]) self.setA.remove_duplicates() result = self.setA self.assertEqual(len(result), 4) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 15) self.assertEqual(result[1].final, 25) self.assertEqual(result[2].initial, 30) self.assertEqual(result[2].final, 35) self.assertEqual(result[3].initial, 40) self.assertEqual(result[3].final, 45) def test_window(self): """ A : ------- B : ------[ 99 ] [ 199 ]--- window = 100 R : - only one base overlaps with extending A """ self.region_sets([['chr1', 200, 300]], [['chr1', 1, 101], ['chr1', 499, 550]]) result = self.setA.window(self.setB, adding_length=100) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 100) self.assertEqual(result[0].final, 101) """ A : ------- B : ------[ 99 ] [ 199 ]--- window = 200 R : ------ - left-hand side is covered, and the right-hand side is only one base overlapped """ self.region_sets([['chr1', 200, 300]], [['chr1', 1, 101], ['chr1', 499, 550]]) result = self.setA.window(self.setB, adding_length=200) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) # GenomicRegion.extend will choose 1 rather than 0 self.assertEqual(result[0].final, 101) self.assertEqual(result[1].initial, 499) self.assertEqual(result[1].final, 500) """ A : ---- ---- B : -------- ---- window = 1000 (default) R : ---- ---- """ self.region_sets([['chr1', 3000, 3500], ['chr1', 4000, 4500]], [['chr1', 1500, 2500], ['chr1', 5000, 5500]]) result = self.setA.window(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 2000) self.assertEqual(result[0].final, 2500) self.assertEqual(result[1].initial, 5000) self.assertEqual(result[1].final, 5500) """ A : ---- ---- B : -------- ---- window = 2000 R : -------- ---- ---- ---- window = 100 R : none """ self.region_sets([['chr1', 3000, 3500], ['chr1', 4000, 4500]], [['chr1', 1500, 2500], ['chr1', 5000, 5500]]) result = self.setA.window(self.setB, adding_length=2000) 
self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1500) self.assertEqual(result[0].final, 2500) self.assertEqual(result[1].initial, 5000) self.assertEqual(result[1].final, 5500) result = self.setA.window(self.setB, adding_length=100) self.assertEqual(len(result), 0) def test_subtract(self): """ A : none B : ------ R : none """ self.region_sets([], [['chr1', 6, 15]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 0) """ A : ------ B : none R : ------ """ self.region_sets([['chr1', 6, 15]], []) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 6) self.assertEqual(result[0].final, 15) """ A : ------ B : ------ R : --- """ self.region_sets([['chr1', 1, 10]], [['chr1', 6, 15]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 6) """ A : ------ B : ------ R : --- """ self.region_sets([['chr1', 6, 15]], [['chr1', 1, 10]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 10) self.assertEqual(result[0].final, 15) """ A : --- B : --------- R : none """ self.region_sets([['chr1', 6, 10]], [['chr1', 1, 15]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 0) """ A : --------- B : --- R : --- --- """ self.region_sets([['chr1', 1, 15]], [['chr1', 6, 10]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 6) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 15) """ A : ------ B : ------ R : none """ self.region_sets([['chr1', 6, 15]], [['chr1', 6, 15]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 0) """ A : ---------- ------ B : ---------- ---- R : ------- ------ """ self.region_sets([['chr1', 5, 30], ['chr1', 70, 85]], [['chr1', 20, 50], ['chr1', 100, 110]]) result = 
self.setA.subtract(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 5) self.assertEqual(result[0].final, 20) self.assertEqual(result[1].initial, 70) self.assertEqual(result[1].final, 85) """ A : ------ ----- B : ------ R : ---- ----- """ self.region_sets([['chr1', 20, 30], ['chr1', 35, 55]], [['chr1', 10, 23], ['chr1', 100, 110]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 23) self.assertEqual(result[0].final, 30) self.assertEqual(result[1].initial, 35) self.assertEqual(result[1].final, 55) """ A : ch1 --------------------- ch2 ------------------------- B : ch1 ------ ch2 ------ R : ch1 -------- ------- ch2 ------------------- """ self.region_sets([['chr1', 0, 30000], ['chr2', 0, 35000]], [['chr1', 20000, 23000], ['chr2', 31000, 35000]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 0) self.assertEqual(result[0].final, 20000) self.assertEqual(result[1].initial, 23000) self.assertEqual(result[1].final, 30000) self.assertEqual(result[2].initial, 0) self.assertEqual(result[2].final, 31000) """ A : ----------------------------------------------------------- B : --- --------- ---- ---- R : - ---- --------- ----------- -------------- """ self.region_sets([['chr1', 5, 1000]], [['chr1', 10, 15], ['chr1', 30, 70], ['chr1', 120, 140], ['chr1', 200, 240]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 5) """ A : ----------------------- ------ ----- ----- ----------- B : --- --------- ---- ---- R : - ---- ------ ---- --- --- ---- --- """ self.region_sets([['chr1', 5, 100], ['chr1', 20, 40], ['chr1', 60, 80], ['chr1', 95, 150], ['chr1', 180, 220]], [['chr1', 10, 15], ['chr1', 30, 70], ['chr1', 120, 140], ['chr1', 200, 240]]) result = self.setA.subtract(self.setB) # print(result.sequences) self.assertEqual(len(result), 8) self.assertEqual(result[0].initial, 5) """ A : 
----------------------------------------------------------- B : --- --------- ---- ---- R : - ---- --------- ----------- -------------- """ self.region_sets( [['chr1', 5, 1000], ['chr2', 5, 1000], ['chr4', 5, 1000]], [['chr1', 10, 15], ['chr1', 30, 70], ['chr1', 120, 140], ['chr1', 200, 240], ['chr2', 10, 15], ['chr2', 30, 70], ['chr2', 120, 140], ['chr2', 200, 240], ['chr4', 10, 15], ['chr4', 30, 70], ['chr4', 120, 140], ['chr4', 200, 240]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 15) def test_merge(self): """ A : none R : none """ self.region_sets([], []) self.setA.merge() result = self.setA self.assertEqual(len(result), 0) """ A : ----- ----- R : ----- ----- """ self.region_sets([['chr1', 1, 10], ['chr1', 15, 25]], []) self.setA.merge() result = self.setA self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 15) self.assertEqual(result[1].final, 25) """ A1: ------------ ---- A2: ----- R : ------------ ---- """ self.region_sets([['chr1', 1, 30], ['chr1', 11, 20], ['chr1', 40, 50]], []) self.setA.merge() result = self.setA self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 30) self.assertEqual(result[1].initial, 40) self.assertEqual(result[1].final, 50) """ A1: -------- ---- A2: --------- R : ------------ ---- """ self.region_sets([['chr1', 1, 30], ['chr1', 20, 40], ['chr1', 50, 60]], []) self.setA.merge() result = self.setA self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 40) self.assertEqual(result[1].initial, 50) self.assertEqual(result[1].final, 60) """ A : ======= R : ------- """ self.region_sets([['chr1', 1, 30], ['chr1', 1, 30]], []) self.setA.merge() result = self.setA self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 30) def test_cluster(self): """ Empty sets A : none R : 
none """ self.region_sets([], []) result = self.setA.cluster(10) self.assertEqual(len(result), 0) """ A : ------- R : ------- """ self.region_sets([['chr1', 1, 10]], []) result = self.setA.cluster(10) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) """ A : ----- ------ R : ----------- """ self.region_sets([['chr1', 1, 10], ['chr1', 10, 20]], []) result = self.setA.cluster(10) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 20) """ A : ----- ----- R1: ----- ----- R2: ------------ """ self.region_sets([['chr1', 1, 10], ['chr1', 15, 25]], []) result = self.setA.cluster(1) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 15) self.assertEqual(result[1].final, 25) result = self.setA.cluster(5) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 15) self.assertEqual(result[1].final, 25) result = self.setA.cluster(6) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 25) """ A : ---- ---- ---- ---- ---- R1: --------- ---- ---- ---- R2: --------------- ---- ---- R3: ---------------------- ---- R4: ------------------------------ R5: ------------------------------ """ self.region_sets([['chr1', 1, 10], ['chr1', 15, 25], ['chr1', 35, 45], ['chr1', 60, 70], ['chr1', 90, 100]], []) result = self.setA.cluster(6) self.assertEqual(len(result), 4) result = self.setA.cluster(11) self.assertEqual(len(result), 3) result = self.setA.cluster(16) self.assertEqual(len(result), 2) result = self.setA.cluster(21) self.assertEqual(len(result), 1) result = self.setA.cluster(26) self.assertEqual(len(result), 1) def test_flank(self): """ A : ----- R1: --- --- """ self.region_sets([['chr1', 60, 75]], []) result = self.setA.flank(10) 
self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 50) self.assertEqual(result[0].final, 60) self.assertEqual(result[1].initial, 75) self.assertEqual(result[1].final, 85) """ A : ----- ---- R1: ----- ===== ---- """ self.region_sets([['chr1', 60, 75], ['chr1', 90, 100]], []) result = self.setA.flank(15) self.assertEqual(len(result), 4) self.assertEqual(result[0].initial, 45) self.assertEqual(result[0].final, 60) self.assertEqual(result[1].initial, 75) self.assertEqual(result[1].final, 90) self.assertEqual(result[2].initial, 75) self.assertEqual(result[2].final, 90) self.assertEqual(result[3].initial, 100) self.assertEqual(result[3].final, 115) def test_jaccard(self): """ self --8-- ---10--- -4- y ---10--- ---10--- intersect -5- -4- similarity: ( 5 + 4 )/[(8 + 10 + 4) + (10 +10) - (5 + 4 )] = 9/33 """ self.region_sets( [['chr1', 50, 58], ['chr1', 70, 80], ['chr1', 90, 94]], [['chr1', 45, 55], ['chr1', 76, 86]]) result = self.setA.jaccard(self.setB) self.assertEqual(result, 9 / 33) def test_get_genome_data(self): """hg19""" result = GenomicRegionSet("hg19") result.get_genome_data(organism="hg19") self.assertEqual(len(result), 23) """hg19, with Mitochondria chromosome""" result = GenomicRegionSet("hg19") result.get_genome_data(organism="hg19", chrom_M=True) self.assertEqual(len(result), 24) def test_random_regions(self): self.region_sets( [['chr1', 0, 10000], ['chr2', 0, 20000], ['chrX', 0, 30000]], []) result = self.setA.random_regions(organism="mm9", total_size=100, overlap_result=False, overlap_input=False) result.sort() # print("-"*80) # print("The result random regions are: ") # for s in result.sequences: # print("\t%s\t%10d\t%10d%10d" % (s.chrom,s.initial,s.final,s.__len__())) # print("Overlaps within result: ",result.within_overlap()) self.region_sets( [['chr1', 0, 10000], ['chr2', 0, 20000], ['chrX', 0, 30000]], []) result = self.setA.random_regions(organism="mm9", total_size=100, overlap_result=True, overlap_input=False) result.sort() # 
print("-"*80) # print("The result random regions are: ") # for s in result.sequences: # print("\t%s\t%10d\t%10d%10d" % (s.chrom,s.initial,s.final,s.__len__())) # print("Overlaps within result: ",result.within_overlap()) self.region_sets( [['chr1', 0, 10000], ['chr2', 0, 20000], ['chrX', 0, 30000]], []) result = self.setA.random_regions(organism="mm9", total_size=100, overlap_result=False, overlap_input=True) result.sort() # print("-"*80) # print("The result random regions are: ") # for s in result.sequences: # print("\t%s\t%10d\t%10d%10d" % (s.chrom,s.initial,s.final,s.__len__())) # print("Overlaps within result: ",result.within_overlap()) self.region_sets( [['chr1', 0, 10000], ['chr2', 0, 20000], ['chrX', 0, 30000]], []) result = self.setA.random_regions(organism="mm9", total_size=100, overlap_result=True, overlap_input=True) result.sort() # print("-"*80) # print("The result random regions are: ") # for s in result.sequences: # print("\t%s\t%10d\t%10d%10d" % (s.chrom,s.initial,s.final,s.__len__())) # print("Overlaps within result: ",result.within_overlap()) self.region_sets( [['chr1', 0, 1000], ['chr2', 0, 2000], ['chrX', 0, 3000]], []) result = self.setA.random_regions(organism="mm9", multiply_factor=100, overlap_result=False, overlap_input=False) result.sort() # print("-"*80) # print("The result random regions are: ") # for s in result.sequences: # print("\t%s\t%10d\t%10d%10d" % (s.chrom,s.initial,s.final,s.__len__())) # print("Overlaps within result: ",result.within_overlap()) self.region_sets( [['chr1', 0, 1000], ['chr2', 0, 2000], ['chrX', 0, 3000]], []) result = self.setA.random_regions(organism="mm9", multiply_factor=100, overlap_result=False, overlap_input=False, chrom_M=True) result.sort()
def mode_1(exp_matrix):
    """Print, for every region set of *exp_matrix*, the genes associated with it.

    For each region set a fresh GenomicRegionSet is filled through
    ``filter_by_gene_association`` (distance threshold 50000) using the
    module-level ``gene_file`` and ``genome_file``.  Two lines are printed per
    region set: the mapped-gene count, then the region set name followed by
    the tab-separated gene list.
    """
    for regionset in exp_matrix.get_regionsets():
        associated = GenomicRegionSet("")
        association = associated.filter_by_gene_association(
            regionset.fileName, None, gene_file, genome_file, threshDist=50000)
        # Only the third of the five returned values is reported here.
        _first, _second, n_mapped, _fourth, _fifth = association
        print('#number of mapped genes:', n_mapped)

        gene_columns = "\t".join(associated.genes)
        print(regionset.name + "\t" + gene_columns)
def test_filter_tts(self):
    """Run ``count_tts`` on the sample TXP file for one chr2 window.

    The window spans chr2:74000000-75000000 and the interaction set is loaded
    for hg19 from ``sample_txp``.
    """
    # NOTE(review): the test is named after "filter_tts" but exercises
    # count_tts, and nothing is asserted on the result - confirm intent.
    interactions = RNADNAInteractionSet(organism="hg19", filename=sample_txp)
    window = GenomicRegionSet("g")
    window.add(GenomicRegion(chrom="chr2", initial=74000000, final=75000000))
    result = interactions.count_tts(window)
from fisher import pvalue

# Command line: <design file> <annotation path> [<background peaks BED>]
# The annotation path is concatenated directly with the file names below, so
# it has to end with a path separator.
back = False
designFile = sys.argv[1]
anotationPath = sys.argv[2]
genomeFile = anotationPath + "chrom.sizes"
geneFile = anotationPath + "association_file.bed"

exps = ExperimentalMatrix()
exps.read(designFile)

beds = []
geneLists = []

# this should be improved
# Collect the distinct gene names present in the association file.
bedGenes = GenomicRegionSet(geneFile)
bedGenes.read_bed(geneFile)
allgenes = list({r.name for r in bedGenes})

genesets = exps.get_genesets()

# Create the background set exactly once.  Previously it was re-created right
# after read_bed(), which threw away the peaks that had just been loaded.
backBed = GenomicRegionSet("BACK")
if len(sys.argv) > 3:
    back = True
    backGroundPeaks = sys.argv[3]
    backBed.read_bed(backGroundPeaks)
from fisher import pvalue

# Positional arguments: design file, annotation path, optional background BED.
back = False
designFile, anotationPath = sys.argv[1], sys.argv[2]
genomeFile = "{}chrom.sizes".format(anotationPath)
geneFile = "{}association_file.bed".format(anotationPath)

exps = ExperimentalMatrix()
exps.read(designFile)

beds, geneLists = [], []

# this should be improved
# Gather the unique gene names listed in the association file.
bedGenes = GenomicRegionSet(geneFile)
bedGenes.read_bed(geneFile)
allgenes = set()
for r in bedGenes:
    allgenes.add(r.name)
allgenes = list(allgenes)

genesets = exps.get_genesets()

# A third argument switches on the background and loads its peaks.
if len(sys.argv) > 3:
    back = True
    backGroundPeaks = sys.argv[3]
    backBed = GenomicRegionSet("BACK")
    backBed.read_bed(backGroundPeaks)
##################################################################################
# Command-line tool: rename the regions of a BED file (or of the BED files in
# a directory) after their associated genes.
# All three options are needed by the code below, so they are declared
# required: a missing one now yields a usage message instead of a traceback.
parser = argparse.ArgumentParser(description='Replace TCONs in BED file by associated gene names',
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-bed', type=str, required=True,
                    help="BED file or a directory containing BED files")
parser.add_argument('-output', type=str, required=True,
                    help="Define the output directory")
parser.add_argument('-organism', type=str, required=True,
                    help="Define the organism")
args = parser.parse_args()

genome = GenomeData(args.organism)

if os.path.isfile(args.bed):
    # Single file: associate genes, rename the regions, write the result.
    regionset = GenomicRegionSet("bed")
    regionset.read_bed(args.bed)
    gr = regionset.gene_association(organism=args.organism, promoterLength=1000,
                                    threshDist=500000, show_dis=True)
    regionset.replace_region_name(gr, combine=True)
    regionset.write_bed(args.output)

elif os.path.isdir(args.bed):
    # Directory: make sure the output directory exists, then walk the input.
    if not os.path.exists(args.output):
        os.makedirs(args.output)

    for root, dirnames, filenames in os.walk(args.bed):
        for filename in filenames:
            if ".bed" in filename:
                print(filename)
##################################################################################
# Command-line tool: rename the regions of a BED file (or of the BED files in
# a directory) after their associated genes.
# All three options are needed by the code below, so they are declared
# required: a missing one now yields a usage message instead of a traceback.
parser = argparse.ArgumentParser(description='Replace TCONs in BED file by associated gene names',
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-bed', type=str, required=True,
                    help="BED file or a directory containing BED files")
parser.add_argument('-output', type=str, required=True,
                    help="Define the output directory")
parser.add_argument('-organism', type=str, required=True,
                    help="Define the organism")
args = parser.parse_args()

genome = GenomeData(args.organism)

if os.path.isfile(args.bed):
    # Single file: associate genes, rename the regions, write the result.
    regionset = GenomicRegionSet("bed")
    regionset.read(args.bed)
    gr = regionset.gene_association(organism=args.organism, promoter_length=1000,
                                    thresh_dist=500000, show_dis=True)
    regionset.replace_region_name(gr, combine=True)
    regionset.write(args.output)

elif os.path.isdir(args.bed):
    # Directory: make sure the output directory exists, then walk the input.
    if not os.path.exists(args.output):
        os.makedirs(args.output)

    for root, dirnames, filenames in os.walk(args.bed):
        for filename in filenames:
            if ".bed" in filename:
                print(filename)
def get_promoters(self, promoter_length=1000, tss=0, gene_set=None, unmaplist=False,
                  variants=False, gene_id=False, regiondata=False):
    """
    Gets promoters of genes given a specific promoter length. It returns a GenomicRegionSet with such promoters.
    The ID of each gene will be put in the NAME field of each GenomicRegion.
    Each promoter includes also the coordinate of the 5' base pair, therefore each promoter actual
    length is promoter_length+1 (with the default tss=0).

    *Keyword arguments:*

        - promoter_length -- The length of the promoter region.
        - tss -- Extra offset applied at the 5' end of the gene (default = 0); see the coordinate
          arithmetic in the loop below for its exact effect per orientation.
        - gene_set -- A set of genes to narrow the search.
        - unmaplist -- If True than also return the unmappable genes list (default = False).
        - variants -- If True query "transcript" entries instead of "gene" entries (default = False).
        - gene_id -- If True (and no gene_set is given) name each promoter by its gene ID instead
          of its gene name (default = False).
        - regiondata -- If True and a gene_set is given, copy gene_set.values[name] into the
          DATA field of each promoter (default = False).

    *Return:*

        - result_grs -- A GenomicRegionSet containing the promoters.
        - unmapped_gene_list -- A list of genes that could not be mapped to an ENSEMBL ID
          (only returned when unmaplist is True; None when no gene_set was given).
    """

    # Fetching gene names
    mapped_gene_list = None
    unmapped_gene_list = None
    if gene_set:
        mapped_gene_list, unmapped_gene_list, mapping_dict = self.fix_gene_names(
            gene_set, output_dict=True)

    # Fetching genes
    target = "transcript" if variants else "gene"
    query_dictionary = {self.GeneField.FEATURE_TYPE: target}
    if gene_set:
        query_dictionary[self.GeneField.GENE_ID] = mapped_gene_list
    query_annset = self.get(query_dictionary)

    # Creating GenomicRegionSet
    result_grs = GenomicRegionSet("promoters")
    for e in query_annset.gene_list:
        # NOTE(review): the region object taken from the annotation entry is modified in place;
        # whether self.get() hands out copies is not visible here - confirm.
        gr = e[self.GeneField.GENOMIC_REGION]
        if gr.orientation == "+":
            gr.final = gr.initial + 1 + tss
            gr.initial = gr.initial - promoter_length
        else:
            # NOTE(review): gr.final is derived from the already shifted gr.initial, so a
            # non-zero tss moves the whole window here, whereas it extends the window on the
            # "+" orientation - confirm which behaviour is intended.
            gr.initial = gr.final - 1 - tss
            gr.final = gr.initial + promoter_length + 1

        if gene_set:
            # Prefer the name the caller supplied; fall back to the gene ID when it has no
            # entry in the mapping.
            try:
                gr.name = mapping_dict[e[self.GeneField.GENE_ID]]
            except KeyError:
                gr.name = e[self.GeneField.GENE_ID]
        elif gene_id:
            gr.name = e[self.GeneField.GENE_ID]
        else:
            gr.name = e[self.GeneField.GENE_NAMES]

        if gene_set and regiondata:
            gr.data = gene_set.values[gr.name]

        result_grs.add(gr)

    if unmaplist:
        return result_grs, unmapped_gene_list
    return result_grs
from rgt.GenomicRegionSet import *
from rgt.ExperimentalMatrix import *
#from fisher import pvalue
import scipy.stats

# Positional command-line arguments:
#   1 design file, 2 genome name, 3 gene file, 4 randomize (int),
#   5 background peaks file (optional), 6 distance (optional),
#   7 output directory (optional)
outdir = ""
back = False
designFile = sys.argv[1]
genomeName = sys.argv[2]
geneFile = sys.argv[3]
randomize = int(sys.argv[4])

backGroundPeaks = False
if len(sys.argv) > 5:
    # The guard only guarantees index 5.  Index 6 was read before, which
    # raised IndexError with exactly six arguments and otherwise picked up
    # the distance argument as the background file name.
    backGroundPeaksName = sys.argv[5]
    backBed = GenomicRegionSet("BACK")
    backBed.read_bed(backGroundPeaksName)
    backGroundPeaks = True

distance = 50000
if len(sys.argv) > 6:
    # Parse the number itself; len() returned the character count of the
    # argument string.
    distance = int(sys.argv[6])

if len(sys.argv) > 7:
    outdir = sys.argv[7]

#genomeFile=anotationPath+"chrom.sizes"
#geneFile=anotationPath+"association_file.bed"

exps = ExperimentalMatrix()
exps.read(designFile)
""" Updated on 22 May 2014 by Joseph """ ############################# Parameters ############################## parser = argparse.ArgumentParser(description='Return the random sequences according to the given parameters.') parser.add_argument('-o','-organism', default= "hg19", help='Define the organism. Default: hg19') parser.add_argument('-l','-length', type=int, help='Define the length of each sequence.') parser.add_argument('-n','-number', type=int, help='Define the number of random regions.') parser.add_argument('-f','-filter', default=None, help='Given the path to the BED file as the filter.') args = parser.parse_args() # Setup the entries region = GenomicRegion("sample", initial=0, final=args.l) template = GenomicRegionSet("tamplate") template.add(region) if not os.path.exists(bed_dir): os.makedirs(bed_dir) # Random region result = template.random_regions(organism= "hg19", total_size=args.n, multiply_factor=0, overlap_result=True, overlap_input=True, chrom_X=False, chrom_M=False, filter_path=args.f) result.write(os.path.join(bed_dir, "00total.bed")) chrom = GenomicRegionSet("chrom") chrom.get_genome_data(organism=args.o, chrom_X=False, chrom_M=False) chrom_list = [] for r in chrom.sequences: chrom_list.append(r.chrom)
# Example values for the inputs this script expects to be defined already:
# experimental_matrix_file = "/home/manuel/workspace/cluster_p/THOR/exp/exp23_macs2_payton/1"
# gene_exp = "/home/manuel/workspace/cluster_p/allhoff/project_THOR/data/payton/gene_expression/CCmean.data"
# annotation_path = "/home/manuel/workspace/cluster_h/rgtdata/hg19/"
# outputdir = "/home/manuel/test/"

# Load the experimental matrix and pull out its region sets.
exps = ExperimentalMatrix()
exps.read(experimental_matrix_file)
regionsets = exps.get_regionsets()

# Annotation files live directly under the annotation path.
genome_file = annotation_path + "/chrom.sizes"
gene_file = annotation_path + "/association_file.bed"

# Gene expression table.
genes = GeneSet("Expression")
genes.read_expression(gene_exp)

# One output table per region set, named after the region file's base name
# up to its first dot.
for region in regionsets:
    bedNew = GenomicRegionSet("")
    association = bedNew.filter_by_gene_association_old(
        region.fileName, genes.genes, gene_file, genome_file)
    degenes, de_peak_genes, mappedGenes, totalPeaks, regionsToGenes = association

    ct, labels = averageExpression(region, genes, regionsToGenes)

    aux = region.fileName.split("/")
    fileName = aux[-1].split(".")
    target = outputdir + "/" + fileName[0] + ".txt"
    output(genes.cond, labels, ct, target)
import unittest
from rgt.GenomicRegionSet import *
from rgt.CoverageSet import CoverageSet

# Input files referenced by the test module.
bamfile = "/projects/lncRNA/local/cardio/total_rna/bam/d4_1.bam"
bedfile = "~/rgtdata/hg38/genes_hg38.bed"

# Shared fixture: two 1 kb regions on chr1, one per strand, wrapped in a
# CoverageSet.
regions = GenomicRegionSet("test")
for _start, _end, _strand in ((10000, 11000, "+"), (20000, 21000, "-")):
    regions.add(GenomicRegion("chr1", _start, _end, _strand))
cov = CoverageSet("coverage", regions)


class CoverageSet_Test(unittest.TestCase):
    """Checks for CoverageSet built on the module-level fixture."""

    # NOTE(review): the method name lacks the "test" prefix, so the default
    # unittest loader will not collect it - confirm whether that is deliberate
    # (it depends on a machine-local BAM file).
    def coverage_from_genomicset(self):
        """Compute coverage from the BAM file and compare it with 4."""
        cov.coverage_from_genomicset(bamfile)
        print(cov.coverage)
        self.assertEqual(cov.coverage, 4)
from rgt.GenomicRegionSet import *
from rgt.ExperimentalMatrix import *
#from fisher import pvalue
import scipy.stats

# Positional command-line arguments:
#   1 design file, 2 annotation path, 3 randomize (int),
#   4 background peaks file (optional), 5 distance (optional),
#   6 output directory (optional)
outdir = ""
back = False
designFile = sys.argv[1]
anotationPath = sys.argv[2]
randomize = int(sys.argv[3])

backGroundPeaks = False
if len(sys.argv) > 4:
    backGroundPeaksName = sys.argv[4]
    backBed = GenomicRegionSet("BACK")
    backBed.read_bed(backGroundPeaksName)
    backGroundPeaks = True

distance = 50000
if len(sys.argv) > 5:
    # Parse the number itself; len() returned the character count of the
    # argument string.
    distance = int(sys.argv[5])

if len(sys.argv) > 6:
    outdir = sys.argv[6]

# The annotation path is concatenated directly, so it has to end with a path
# separator.
genomeFile = anotationPath + "chrom.sizes"
geneFile = anotationPath + "association_file.bed"

exps = ExperimentalMatrix()
from rgt.ExperimentalMatrix import *
#from fisher import pvalue
import scipy.stats

# Positional command-line arguments:
#   1 design file, 2 genome name, 3 gene file, 4 randomize (int),
#   5 background peaks file (optional), 6 distance (optional),
#   7 output directory (optional)
outdir = ""
back = False
designFile = sys.argv[1]
genomeName = sys.argv[2]
geneFile = sys.argv[3]
randomize = int(sys.argv[4])

backGroundPeaks = False
if len(sys.argv) > 5:
    # The guard only guarantees index 5.  Index 6 was read before, which
    # raised IndexError with exactly six arguments and otherwise picked up
    # the distance argument as the background file name.
    backGroundPeaksName = sys.argv[5]
    backBed = GenomicRegionSet("BACK")
    backBed.read(backGroundPeaksName)
    backGroundPeaks = True

distance = 50000
if len(sys.argv) > 6:
    # Parse the number itself; len() returned the character count of the
    # argument string.
    distance = int(sys.argv[6])

if len(sys.argv) > 7:
    outdir = sys.argv[7]

#genomeFile=anotationPath+"chrom.sizes"
#geneFile=anotationPath+"association_file.bed"

exps = ExperimentalMatrix()