class ScrapeEnsembl():
    """Look up gene and transcript details in an Ensembl release.

    The query is stored with every "chr" substring removed. The genome build
    name is translated to an Ensembl release number through the ``genome``
    table and an ``EnsemblRelease`` object is created for that release.
    """

    # genome build name -> Ensembl release number
    genome = {"hg19": 75, "hg38": 83}

    def __init__(self, query, hg_version):
        self.query = query.replace("chr", "")
        # convert to ensembl release (None when the build name is not listed)
        self.hg_version = ScrapeEnsembl.genome.get(hg_version)
        # convert to ensembl release object
        self.hg = EnsemblRelease(self.hg_version)

    def get_gene_info(self):
        """Get the gene information at the queried genomic position.

        The query must have the form ``<contig>:<position>`` where the contig
        is a number, X, Y or MT and the position is a number.

        Returns:
            A ``(name, id, biotype, "contig:start-end")`` tuple describing the
            first gene reported at the position; a "No gene found at ..."
            message string when no gene is reported there; None when the
            query is not a genomic position.
        """
        parsed = re.match(r"(\d+|X|Y|MT):(\d+)$", self.query)
        if parsed is None:
            return None

        contig = parsed.group(1)
        pos = int(parsed.group(2))
        # numeric contigs are still handed over as ints
        chrom = int(contig) if contig.isdigit() else contig

        gene_names = self.hg.gene_names_at_locus(contig=chrom, position=pos)
        if not gene_names:
            return " ".join(("No gene found at", self.query,
                             "for genome version", str(self.hg_version)))

        gene = self.hg.genes_by_name(gene_names[0])[0]
        # Build the location from the locus attributes instead of slicing it
        # out of str(gene), whose layout is not a stable interface.
        gene_location = "{}:{}-{}".format(gene.contig, gene.start, gene.end)
        return (gene.name, gene.id, gene.biotype, gene_location)

    def get_canonical_transcript(self, gene_name):
        """Determine and return the canonical transcript of the given gene.

        The canonical transcript is taken to be the protein coding transcript
        with the largest ``end - start`` span; ties are resolved in favour of
        the transcript id that sorts last.

        Returns:
            The transcript id, or None when the gene has no protein coding
            transcript.
        """
        protein_coding_transcripts = []
        for transcript_id in self.hg.transcript_ids_of_gene_name(gene_name):
            transcript = self.hg.transcript_by_id(transcript_id)
            if transcript.biotype == "protein_coding":
                size = transcript.end - transcript.start
                protein_coding_transcripts.append((size, transcript_id))

        if not protein_coding_transcripts:
            return None
        # largest (size, id) pair wins
        return max(protein_coding_transcripts)[1]
def make_the_gene_with_exon_and_map(chr, gene_name, ax, exon_lib, x, y, locus,
                                    frequency, width=30.0, height=5.0,
                                    scale=1.0):
    """Draw one gene as a bar with its exons and one tick per mapped read.

    The gene is looked up by name in Ensembl release 75 and drawn as a
    rectangle of ``width`` x ``height`` whose left edge is at ``x`` and whose
    vertical centre is at ``y``. Exons and reads are placed along the bar in
    proportion to their offset from the gene start.

    Args:
        chr: Label printed under the gene name, left of the bar.
        gene_name: Name of the gene to look up and draw.
        ax: Axes-like object providing ``add_patch``, ``plot`` and ``text``.
        exon_lib: Mapping from exon id to a sequence whose first two items
            are the exon start and end.
        x: Left edge of the gene bar.
        y: Vertical centre of the gene bar.
        locus: Sequence of read positions.
        frequency: Sequence of read frequencies, indexed like ``locus``.
        width: Drawn length of the gene bar.
        height: Drawn thickness of the gene bar.
        scale: Scales the exon overhang and the label/tick offsets.
    """
    genome = EnsemblRelease(75)
    gene = genome.genes_by_name(gene_name)[0]
    gene_start = gene.start
    gene_end = gene.end
    gene_length = float(gene_end - gene_start)

    # draw the gene
    ax.add_patch(
        patches.Rectangle(
            (x, y - height / 2),
            width,
            height,
            alpha=0.5,
            color="dodgerblue",
            linewidth=0,
        ))

    # plot the exons on top of the gene; they stick out by 0.5 * scale on
    # each side of the bar
    for exon_id in genome.exon_ids_of_gene_name(gene_name):
        position = exon_lib[exon_id]
        exon_start = int(position[0])
        exon_end = int(position[1])
        exon_length = exon_end - exon_start
        plot_length = exon_length / gene_length * width
        plot_x = (exon_start - gene_start) / gene_length * width
        ax.add_patch(
            patches.Rectangle(
                (x + plot_x, y - height / 2 - 0.5 * scale),
                plot_length,
                height + 1 * scale,
                color="black",
                alpha=0.4,
                linewidth=0,
            ))

    # map the reads to the plot: one vertical tick per read, placed at that
    # read's own position (previously every tick used locus[0])
    tick_base = y + height / 2 + scale * 0.5
    for i, read_position in enumerate(locus):
        plot_x = (read_position - gene_start) / gene_length * width
        if frequency[i] < 0.0001:
            # very small frequencies get a fixed-length tick
            bar_length = 1
        else:
            bar_length = abs(math.log(frequency[i] / 0.01) * 2)
        ax.plot(
            (x + plot_x, x + plot_x),
            (tick_base, tick_base + bar_length),
            color="darkblue",
            alpha=0.5,
            linewidth=2,
        )

    # gene name at the left
    ax.text(x - 0.5, y, gene_name + '\n' + chr,
            horizontalalignment='right',
            verticalalignment='center',
            fontsize=25,
            fontweight="bold")
    # gene start
    ax.text(
        x,
        y - height / 2 - 0.8 * scale,
        gene_start,
        horizontalalignment='center',
        verticalalignment='top',
        fontsize=12,
    )
    # gene end
    ax.text(
        x + width,
        y - height / 2 - 0.8 * scale,
        gene_end,
        horizontalalignment='center',
        verticalalignment='top',
        fontsize=12,
    )
def plot_the_genes_mapping(lib, lib_char_y, ax, total_read, length=50.0,
                           x_default=10.0, frequency_parameter=0.05):
    """Draw every mapped gene / non-coding hit as a circle on its chromosome.

    Args:
        lib: Mapping keyed by a sequence whose first item is the chromosome
            name (without "chr") and whose second item is the gene name or
            "non-coding". Each value is a sequence of records whose item 1 is
            a read count and item 2 a position.
        lib_char_y: Mapping from "chr<name>" to the y coordinate of that
            chromosome.
        ax: Axes-like object providing ``add_patch`` and ``text``.
        total_read: Total read count used to turn counts into percentages.
        length: Drawn length of chromosome 1.
        x_default: Horizontal offset added to every x coordinate.
        frequency_parameter: Divisor inside the logarithm that sets the
            circle radius. The cut-off below which the radius is fixed at
            0.5 is the literal 0.05 and does not follow this parameter.

    Returns:
        The parsed entries as ``(chromosome, gene_name, frequencies,
        position)`` tuples. Each gene yields one entry with a list of
        positions; each non-coding record yields its own entry with
        ``gene_name`` None and a single int position.
    """
    # parse the data
    l_gene = []
    for key in lib:
        chrom = "chr" + key[0]
        gene_name = key[1]
        records = lib[key]
        if gene_name == "non-coding":
            # every non-coding record is drawn on its own
            for record in records:
                frequency = [float(record[1]) * 100 / total_read]
                # converted like the gene positions below so the coordinate
                # arithmetic also works when positions arrive as strings
                position = int(record[2])
                l_gene.append((chrom, None, frequency, position))
        else:
            frequency = [float(r[1]) * 100 / total_read for r in records]
            position = [int(r[2]) for r in records]
            l_gene.append((chrom, gene_name, frequency, position))

    # draw the genes
    genome = None  # created once, and only if a named gene has to be placed
    for chrom, gene_name, frequencies, position in l_gene:
        gene_y = lib_char_y[chrom]
        frequency = sum(frequencies)
        chr_length = chr_relative_size[chrom] * chr_relative_size["chr1_length"]

        # find the x coordinate of the gene in the corresponding chromosome
        if gene_name is not None:
            if genome is None:
                genome = EnsemblRelease(75)
            gene = genome.genes_by_name(gene_name)[0]
            gene_position_in_chr = (gene.start + gene.end) / 2
            gene_x = length * chr_relative_size[
                chrom] * gene_position_in_chr / chr_length
            color = "dodgerblue"
        else:
            gene_x = length * chr_relative_size[chrom] * position / chr_length
            color = "deeppink"

        # size of the circle stands for the frequency
        if frequency < 0.05:
            radius = 0.5
        else:
            radius = abs(math.log(frequency / frequency_parameter))

        centre = (gene_x + x_default, gene_y)
        # gene with frequency related radius
        ax.add_patch(
            patches.Circle(
                centre,
                radius,
                color=color,
                alpha=0.5,
                linewidth=0,
            ))
        # gene position
        ax.add_patch(
            patches.Circle(
                centre,
                0.5,
                color=color,
                linewidth=0.5,
                edgecolor="black",
            ))

        if gene_name is not None:
            ax.text(
                gene_x + x_default,
                gene_y - radius - 0.8,
                gene_name,
                horizontalalignment='center',
                verticalalignment='center',
                fontsize=18,
                fontweight="bold",
            )

        # frequency label above the circle
        if frequency < 0.0001:
            number = "%.2e" % float(frequency)
        else:
            number = "%.4f" % float(frequency)
        ax.text(
            gene_x + x_default,
            gene_y + radius + 0.5,
            number,
            horizontalalignment='center',
            verticalalignment='center',
            fontsize=12,
        )

    return l_gene
def genes_by_name(*args, **kwargs):
    """Forward all arguments to ``genes_by_name`` of the configured release.

    An ``EnsemblRelease`` is created for ``ENSEMBL_RELEASE_VERSION`` on each
    call and the result of its ``genes_by_name`` method is returned as is.
    """
    return EnsemblRelease(ENSEMBL_RELEASE_VERSION).genes_by_name(
        *args, **kwargs)