class ScrapeEnsembl:
    """Query an Ensembl release for gene and transcript annotation.

    An instance wraps one query string (a genomic position or range) and
    one ``EnsemblRelease`` object chosen from the requested genome build.
    """

    # Ensembl release number used for each supported genome build name.
    genome = {"hg19": 75, "hg38": 83}

    def __init__(self, query, hg_version):
        """Store the query and open the matching Ensembl release.

        Args:
            query: Locus string such as ``"chr17:7570000"`` or
                ``"17:7570000-7580000"``; every ``"chr"`` substring is
                removed before the query is stored.
            hg_version: Genome build name; must be a key of ``genome``.

        Raises:
            ValueError: If ``hg_version`` is not a supported build name.
        """
        release = ScrapeEnsembl.genome.get(hg_version)
        if release is None:
            # Fail here with a clear message instead of handing None to
            # EnsemblRelease and surfacing an obscure error later.
            raise ValueError(
                "Unsupported genome version {!r}; expected one of: {}".format(
                    hg_version, ", ".join(sorted(ScrapeEnsembl.genome))
                )
            )
        self.query = query.replace("chr", "")
        self.hg_version = release  # Ensembl release number
        self.hg = EnsemblRelease(release)  # Ensembl release object

    def get_gene_info(self):
        """Get the gene information at the queried position or range.

        The query must look like ``contig:position`` or
        ``contig:start-end``; the contig may be non-numeric (e.g. ``X``).

        Returns:
            ``(name, id, biotype, "contig:start-end")`` for the first gene
            reported at the locus; a human-readable message string when no
            gene overlaps the locus; ``None`` when the query is not a
            recognisable position or range.
        """
        locus = re.match(r"^([\w.]+):(\d+)(?:-(\d+))?$", self.query)
        if locus is None:
            return None

        contig, start, end = locus.groups()
        gene_names = self.hg.gene_names_at_locus(
            contig=contig,
            position=int(start),
            # end stays None for a single-position query
            end=int(end) if end is not None else None,
        )
        if not gene_names:
            return " ".join(("No gene found at", self.query,
                             "for genome version", str(self.hg_version)))

        gene = self.hg.genes_by_name(gene_names[0])[0]
        # Build the location from the gene's own coordinates rather than
        # parsing str(gene), whose layout is not a stable interface.
        location = "{}:{}-{}".format(gene.contig, gene.start, gene.end)
        return (gene.name, gene.id, gene.biotype, location)

    def get_canonical_transcript(self, gene_name):
        """Determine and return the canonical transcript of the given gene.

        "Canonical" here means the protein-coding transcript with the
        largest genomic span (``end - start``); ties are broken by the
        greatest transcript id.

        Args:
            gene_name: Gene name passed to ``transcript_ids_of_gene_name``.

        Returns:
            The id of the chosen transcript, or ``None`` when the gene has
            no protein-coding transcript.
        """
        candidates = []
        for transcript_id in self.hg.transcript_ids_of_gene_name(gene_name):
            transcript = self.hg.transcript_by_id(transcript_id)
            if transcript.biotype == "protein_coding":
                span = transcript.end - transcript.start
                candidates.append((span, transcript.id))

        if not candidates:
            return None
        # Largest (span, id) tuple wins.
        return max(candidates)[1]