def main(opts):
    """Write the protein sequence of every fusion-partner transcript to a TSV.

    Reads the tab-separated fusion table at ``opts['input']`` (columns
    ``5'_gene``, ``3'_gene``, ``5'_transcript`` and ``3'_transcript``), looks
    up the protein id and protein sequence of each transcript in Ensembl
    release 95, and writes one row per unique gene / transcript / protein
    combination to ``opts['output']``.

    Args:
        opts (dict): must contain the keys ``'input'`` (path of the fusion
            table) and ``'output'`` (path of the TSV to write).
    """
    # load ensembl db
    data = EnsemblRelease(95)

    # read in fusion file
    df = pd.read_csv(opts['input'], sep='\t')

    # The same transcript typically appears in many fusion rows; query the
    # Ensembl database once per transcript id instead of once per row.
    protein_by_tx = {}

    def lookup_protein(tx_id):
        """Return ``(protein_id, protein_sequence)`` for a transcript id."""
        if tx_id not in protein_by_tx:
            tx = data.transcript_by_id(tx_id)
            protein_by_tx[tx_id] = (tx.protein_id, tx.protein_sequence)
        return protein_by_tx[tx_id]

    output_list = []
    partners = zip(
        df["5'_gene"], df["5'_transcript"],
        df["3'_gene"], df["3'_transcript"],
    )
    for gene5, tx_id5, gene3, tx_id3 in partners:
        # fetch prot sequence (5' partner first, then 3' partner)
        prot5, prot_seq5 = lookup_protein(tx_id5)
        prot3, prot_seq3 = lookup_protein(tx_id3)

        # append output
        output_list.append([gene5, tx_id5, prot5, prot_seq5])
        output_list.append([gene3, tx_id3, prot3, prot_seq3])

    # save output
    output_df = pd.DataFrame(
        output_list,
        columns=['gene', 'transcript_id', 'protein_id', 'protein_sequence'],
    )
    unique_df = output_df.drop_duplicates(
        subset=['gene', 'transcript_id', 'protein_id'])
    unique_df.to_csv(opts['output'], sep='\t', index=False)
def random_variants(count,
                    ensembl_release=MAX_ENSEMBL_RELEASE,
                    deletions=True,
                    insertions=True,
                    random_seed=None):
    """
    Generate a VariantCollection with random variants that overlap
    at least one complete coding transcript.

    Parameters
    ----------
    count : int
        Number of variants to generate.

    ensembl_release : int
        Ensembl release whose transcripts are sampled.

    deletions : bool
        Allow the empty string as an alternate allele.

    insertions : bool
        Allow two-nucleotide alternate alleles.

    random_seed : int, optional
        Seed for the private random number generator.

    Raises ValueError if `count` variants could not be produced within
    `count * 100` sampling attempts.
    """
    rng = random.Random(random_seed)
    ensembl = EnsemblRelease(ensembl_release)

    if ensembl_release in _transcript_ids_cache:
        transcript_ids = _transcript_ids_cache[ensembl_release]
    else:
        transcript_ids = ensembl.transcript_ids()
        _transcript_ids_cache[ensembl_release] = transcript_ids

    # the two-nucleotide alternates do not depend on the sampled position
    nucleotide_pairs = [
        x + y
        for x in STANDARD_NUCLEOTIDES
        for y in STANDARD_NUCLEOTIDES
    ]

    variants = []
    # Every skipped transcript costs one attempt; cap the attempts so that a
    # problem with the annotation data cannot turn into an infinite loop.
    attempts_left = count * 100
    while len(variants) < count:
        if attempts_left <= 0:
            raise ValueError(
                ("Unable to generate %d random variants, "
                 "there may be a problem with PyEnsembl") % count)
        attempts_left -= 1

        transcript_id = rng.choice(transcript_ids)
        transcript = ensembl.transcript_by_id(transcript_id)

        if not transcript.complete:
            continue

        exon = rng.choice(transcript.exons)
        base1_genomic_position = rng.randint(exon.start, exon.end)
        transcript_offset = transcript.spliced_offset(base1_genomic_position)

        try:
            seq = transcript.sequence
        except ValueError as e:
            logging.warning(e)
            # can't get sequence for non-coding transcripts
            continue

        ref = str(seq[transcript_offset])
        if transcript.on_backward_strand:
            ref = reverse_complement(ref)

        alt_nucleotides = [x for x in STANDARD_NUCLEOTIDES if x != ref]
        if insertions:
            alt_nucleotides.extend(nucleotide_pairs)
        if deletions:
            alt_nucleotides.append("")
        alt = rng.choice(alt_nucleotides)

        variants.append(Variant(
            transcript.contig,
            base1_genomic_position,
            ref=ref,
            alt=alt,
            ensembl=ensembl))
    return VariantCollection(variants)
class ScrapeEnsembl():
    '''
    Look up gene and transcript annotation in a local Ensembl release.

    Args:
        query (str): genomic position ("chr1:12345") or range
            ("chr1:12345-12400"); any "chr" substring is removed
        hg_version (str): genome build, a key of ``ScrapeEnsembl.genome``
    '''

    # genome build name -> Ensembl release number
    genome = {"hg19": 75, "hg38": 83}

    def __init__(self, query, hg_version):
        self.query = query.replace("chr", "")
        # convert to ensembl release
        self.hg_version = ScrapeEnsembl.genome.get(hg_version)
        # convert to ensembl release object
        self.hg = EnsemblRelease(self.hg_version)

    def get_gene_info(self):
        ''' Get the gene information at a given genomic position

        Returns:
            tuple: (gene name, gene id, biotype, "contig:start-end") of the
                first gene overlapping the query;
            str: a "No gene found ..." message when nothing overlaps it;
            None: when the query is not a genomic position or range
        '''
        # check if the input is a genomic position or genomic range
        locus = re.match(r"^([^:\s]+):(\d+)(?:-(\d+))?$", self.query)
        if locus is None:
            return None

        contig, start, end = locus.groups()
        # numeric contigs are passed as int, named ones (X, Y, MT) as str
        chrom = int(contig) if contig.isdigit() else contig

        if end is None:
            gene_name = self.hg.gene_names_at_locus(
                contig=chrom, position=int(start))
        else:
            gene_name = self.hg.gene_names_at_locus(
                contig=chrom, position=int(start), end=int(end))

        if not gene_name:
            msg = " ".join(("No gene found at", self.query,
                            "for genome version", str(self.hg_version)))
            return msg

        gene = self.hg.genes_by_name(gene_name[0])[0]
        # build the location from the locus attributes rather than scraping
        # it out of repr(gene), whose layout is not a stable interface
        gene_location = "{}:{}-{}".format(gene.contig, gene.start, gene.end)
        return (gene.name, gene.id, gene.biotype, gene_location)

    def get_canonical_transcript(self, gene_name):
        ''' Determine and return the canonical transcript of the given gene

        The canonical transcript is taken to be the protein coding
        transcript spanning the most genomic bases (end - start); ties are
        resolved in favour of the greater transcript id.

        Returns:
            str: transcript id, or None if the gene has no protein coding
                transcript
        '''
        protein_coding_transcripts = []
        for transcript_id in self.hg.transcript_ids_of_gene_name(gene_name):
            transcript = self.hg.transcript_by_id(transcript_id)
            if transcript.biotype != "protein_coding":
                continue
            size = int(transcript.end) - int(transcript.start)
            protein_coding_transcripts.append((size, transcript_id))

        # return the largest protein coding transcript
        if protein_coding_transcripts:
            return max(protein_coding_transcripts)[1]
        return None
def random_variants(count,
                    ensembl_release=MAX_ENSEMBL_RELEASE,
                    deletions=True,
                    insertions=True,
                    random_seed=None):
    """
    Generate a VariantCollection with random variants that overlap
    at least one complete coding transcript.

    Parameters
    ----------
    count : int
        Number of variants to generate; zero yields an empty collection.

    ensembl_release : int
        Ensembl release whose transcripts are sampled.

    deletions : bool
        Allow the empty string as an alternate allele.

    insertions : bool
        Allow two-nucleotide alternate alleles.

    random_seed : int, optional
        Seed for the private random number generator.

    Raises ValueError if `count` variants could not be produced within
    `count * 100` sampling attempts.
    """
    rng = random.Random(random_seed)
    ensembl = EnsemblRelease(ensembl_release)

    if ensembl_release in _transcript_ids_cache:
        transcript_ids = _transcript_ids_cache[ensembl_release]
    else:
        transcript_ids = ensembl.transcript_ids()
        _transcript_ids_cache[ensembl_release] = transcript_ids

    # the two-nucleotide alternates do not depend on the sampled position
    nucleotide_pairs = [
        x + y
        for x in STANDARD_NUCLEOTIDES
        for y in STANDARD_NUCLEOTIDES
    ]

    variants = []
    # we should finish way before the attempts run out but just in case
    # something is wrong with PyEnsembl we want to avoid an infinite loop.
    # Completion is tested before the budget, so count == 0 and a final
    # variant produced on the very last attempt both return normally.
    attempts_left = count * 100
    while len(variants) < count:
        if attempts_left <= 0:
            raise ValueError(
                ("Unable to generate %d random variants, "
                 "there may be a problem with PyEnsembl") % count)
        attempts_left -= 1

        transcript_id = rng.choice(transcript_ids)
        transcript = ensembl.transcript_by_id(transcript_id)

        if not transcript.complete:
            continue

        exon = rng.choice(transcript.exons)
        base1_genomic_position = rng.randint(exon.start, exon.end)
        transcript_offset = transcript.spliced_offset(
            base1_genomic_position)
        seq = transcript.sequence

        ref = str(seq[transcript_offset])
        if transcript.on_backward_strand:
            ref = reverse_complement(ref)

        alt_nucleotides = [x for x in STANDARD_NUCLEOTIDES if x != ref]
        if insertions:
            alt_nucleotides.extend(nucleotide_pairs)
        if deletions:
            alt_nucleotides.append("")
        alt = rng.choice(alt_nucleotides)

        variants.append(Variant(
            transcript.contig,
            base1_genomic_position,
            ref=ref,
            alt=alt,
            ensembl=ensembl))
    return VariantCollection(variants)
class EnsemblAnnotation(object):
    """
    Class for building an annotation file for MAVIS in json format.

    Constructing an instance does all the work: it downloads and indexes
    the pyensembl cache, loads the domain cache, picks the best transcripts
    and builds the annotation dict (available as ``self.annotation``).

    Args:
        release (int): Ensembl release to use
        species (str): species of interest
        output (str): path to output file
        best_file (str): path to file of "best transcripts"
        alias_file (str): path to file with gene aliases
        custom_cache (str): path to directory to cache pyensembl files
    """

    def __init__(self, release, species, output, best_file=None,
                 alias_file=None, custom_cache=None):
        self.annotation = {}
        self.custom_cache = custom_cache
        self.cache_prefix = None
        self.gen_time = get_date()
        self.release = release
        self.species = species
        self.output = output
        self.best_file = best_file
        self.alias_file = alias_file

        if self.alias_file:
            self.alias = parse_alias_file(self.alias_file)
        else:
            self.alias = defaultdict(set)

        # Export the cache location before the release object exists so it
        # is in place no matter when pyensembl reads the variable.
        if self.custom_cache:
            os.environ["PYENSEMBL_CACHE_DIR"] = self.custom_cache

        self.data = EnsemblRelease(release, species)
        self.download_pyensembl_cache()
        self.get_domain_cache()

        if self.best_file:
            self.best = parse_best_file(self.best_file)
        else:
            self.best = self.choose_best_transcripts()

        self.build_json()

    def download_pyensembl_cache(self):
        """
        Method download the pyensembl cache files for this release if not
        already there, index them, and record the cache file prefix
        (the GTF path up to "gtf.gz") in ``self.cache_prefix``.
        """
        if self.custom_cache:
            os.environ["PYENSEMBL_CACHE_DIR"] = self.custom_cache
        self.data.download()
        self.data.index()
        self.cache_prefix = self.data.gtf_path.split("gtf.gz")[0]

    def get_domain_cache(self):
        """
        Method point the module-level DOMAIN_CACHE_PATH at this release's
        domain cache file and load any domains cached there.
        """
        global DOMAIN_CACHE_PATH
        DOMAIN_CACHE_PATH = self.cache_prefix + "domain.tsv"
        parse_cached_domains()

    def get_genes(self, eid):
        """
        Method parse gene info in the EnsemblRelease into json format.

        Args:
            eid (str): Ensembl gene ID
        Returns:
            dict: gene info formatted for json
        """
        gene = self.data.gene_by_id(eid)
        # .get() avoids a KeyError should the alias mapping be a plain dict,
        # and avoids inserting empty entries when it is a defaultdict
        extra_aliases = self.alias.get(gene.gene_id, ())
        result = {
            "name": str(gene.gene_id),
            "chr": str(gene.contig),
            "start": int(gene.start),
            "end": int(gene.end),
            "strand": str(gene.strand),
            "aliases": [str(gene.gene_name)] + list(extra_aliases),
            "transcripts": [],
        }
        return result

    def get_transcripts(self, eid):
        """
        Method parse transcript info in the EnsemblRelease into json format.
        Ignore non-coding transcripts.

        Args:
            eid (str): Ensembl transcript ID
        Returns:
            dict: transcript info formatted for json, or None when the
                transcript has no protein id
        """
        transcript = self.data.transcript_by_id(eid)
        protein_id = transcript.protein_id

        if not protein_id:
            return None

        result = {
            "name": str(transcript.transcript_id),
            "start": int(transcript.start),
            "end": int(transcript.end),
            "aliases": [str(transcript.transcript_name)],
            "is_best_transcript": str(transcript.transcript_id) in self.best,
            "protein_id": protein_id,
            "exons": [],
            "domains": [],
        }

        # start/end are absolute genomic positions, so calculate positions
        # relative to the mRNA start
        cpos = transcript.coding_sequence_position_ranges
        if transcript.strand in ("+", "1"):
            result["cdna_coding_start"] = transcript.spliced_offset(
                cpos[0][0]) + 1
            result["cdna_coding_end"] = transcript.spliced_offset(
                cpos[-1][1]) + 1
        elif transcript.strand in ("-", "-1"):
            result["cdna_coding_start"] = transcript.spliced_offset(
                cpos[0][1]) + 1
            result["cdna_coding_end"] = transcript.spliced_offset(
                cpos[-1][0]) + 1

        return result

    def get_exons(self, eid):
        """
        Method parse exon info in the EnsemblRelease into json format.

        Args:
            eid (str): Ensembl exon ID
        Returns:
            dict: exon info formatted for json
        """
        exon = self.data.exon_by_id(eid)
        result = {
            "name": str(exon.exon_id),
            "start": int(exon.start),
            "end": int(exon.end)
        }
        return result

    @cached_domains
    def get_domains(self, eid):
        """
        Method request domain info from Ensembl and parse into json format.

        Regions of domains sharing the same id are merged into one entry;
        domains with an empty description are skipped.

        Args:
            eid (str): Ensembl protein ID
        Returns:
            list: a list of domains formatted for json
        """
        temp = {}
        protein = request_ensembl_protein(eid)

        for domain in protein:
            name = str(domain["id"])
            # quotes causing errors when mavis loads json
            desc = str(domain["description"]).replace('"', "").replace("'", "")
            region = {"start": int(domain["start"]), "end": int(domain["end"])}

            if desc == "":
                continue

            if name in temp:
                temp[name]["regions"].append(region)
            else:
                temp[name] = {"name": name, "desc": desc, "regions": [region]}

        domain_list = list(temp.values())
        return domain_list

    def build_json(self):
        """
        Method compile a json object for MAVIS of all protein coding genes
        and associated info for the indicated species.

        Genes without any coding transcript are left out. Progress and a
        final summary are printed to stdout.

        Returns:
            dict: a json-formatted set of annotations for use with MAVIS
        """
        count = {"gene": 0, "transcript": 0, "non_coding": 0}
        self.annotation["script"] = SCRIPT
        self.annotation["script_version"] = VERSION
        self.annotation["gene_alias_file"] = self.alias_file
        self.annotation["best_transcript_file"] = self.best_file
        self.annotation["ensembl_version"] = self.release
        self.annotation["generation_time"] = self.gen_time
        self.annotation["genes"] = []

        gene_ids = self.data.gene_ids()
        total_genes = len(gene_ids)

        for index, gid in enumerate(gene_ids):
            print("{}/{} genes".format(index, total_genes))
            gened = self.get_genes(gid)
            count["gene"] += 1

            for tid in self.data.transcript_ids_of_gene_id(gid):
                transd = self.get_transcripts(tid)
                count["transcript"] += 1

                if not transd:
                    count["non_coding"] += 1
                    continue

                for eid in self.data.exon_ids_of_transcript_id(tid):
                    transd["exons"].append(self.get_exons(eid))

                transd["domains"] = self.get_domains(transd["protein_id"])
                gened["transcripts"].append(transd)

            if gened["transcripts"] != []:
                self.annotation["genes"].append(gened)

        print(
            "{gene:,} genes, {transcript:,} transcripts ({non_coding:,} non-coding transcripts, ignored)"
            .format(**count))

        return self.annotation

    def dump_json(self):
        """
        Method dump the annotations in json-format to the specified output
        file.
        """
        with open(self.output, "w") as fh:
            json.dump(self.annotation, fh)

    def delete_cache(self):
        """
        Method delete both the pyensembl and domain cache files.
        """
        for cache_file in glob(self.cache_prefix + "*"):
            print("Removing cache file", cache_file)
            os.remove(cache_file)

    def choose_best_transcripts(self):
        """
        Select a canonical transcript for each human gene using Ensembl rules.

        For human, the canonical transcript for a gene is set according to
        the following hierarchy:
        - 1. Longest CCDS translation with no stop codons.
        - 2. If no (1), choose the longest Ensembl/Havana merged translation
             with no stop codons.
        - 3. If no (2), choose the longest translation with no stop codons.
        - 4. If no translation, choose the longest non-protein-coding
             transcript.

        See: http://uswest.ensembl.org/Help/Glossary?id=346

        Returns:
            set: canonical Ensembl transcript IDs, at most one per gene;
                empty for any species other than homo_sapiens
        """

        def longest_ccds(transcripts):
            """Protein-coding transcript with the longest protein sequence."""
            longest = None
            longest_len = 0
            for t in transcripts:
                if not t.is_protein_coding:
                    continue
                protein = t.protein_sequence
                # guard against a missing (None) protein sequence
                if protein and len(protein) > longest_len:
                    longest = t
                    longest_len = len(protein)
            return longest

        def longest_translation(transcripts):
            """Transcript with the longest start-codon-to-stop genomic span.

            Transcripts without a start codon are skipped. When there is no
            stop codon the span runs to the transcript boundary on the far
            side of the start codon for the transcript's strand.
            """
            longest = None
            longest_len = 0
            for t in transcripts:
                if not t.contains_start_codon:
                    continue
                start_positions = list(t.start_codon_positions)
                if t.contains_stop_codon:
                    # min/max keeps the span positive on either strand
                    positions = start_positions + list(t.stop_codon_positions)
                    span = max(positions) - min(positions)
                elif t.strand in ("-", "-1"):
                    span = max(start_positions) - t.start
                else:
                    span = t.end - min(start_positions)
                if span > longest_len:
                    longest = t
                    longest_len = span
            return longest

        def longest_transcript(transcripts):
            """Longest transcript."""
            longest = None
            longest_len = 0
            for t in transcripts:
                if t.end - t.start > longest_len:
                    longest = t
                    longest_len = t.end - t.start
            return longest

        best = set()

        # Ensembl rules for canonical transcripts only apply to humans
        species = self.data.species.latin_name
        if species != "homo_sapiens":
            print(
                "Unable to choose canonical transcripts for {}. You can specify canonical transcripts with '--best-transcript-file'"
                .format(species))
            return best

        print("Selecting a canoncial transcript for each gene")
        for gene_id in self.data.gene_ids():
            transcripts = [
                self.data.transcript_by_id(transcript_id)
                for transcript_id in self.data.transcript_ids_of_gene_id(gene_id)
            ]
            selected = (longest_ccds(transcripts)
                        or longest_translation(transcripts)
                        or longest_transcript(transcripts))
            if selected:
                best.add(selected.transcript_id)

        return best
mstrg_list = [] mstrg_list.append(g_id_fresh) else: mstrg_list.append(g_id_fresh) ## Check if our gene id is already in the MSTRG tracker if g_id_fresh in mstrg_list: ## If subsequent entries for the MSTRG gene have an ensembl transcript, print out the correct info if "ENS" in t_id: mstrg = g_id_fresh.split("\"")[1] g_name = g_name.split("\"")[1] t_id = t_id.split("\"")[1] ensembl = data.transcript_by_id(t_id) ensembl_id = ensembl.gene_id ensembl_name = ensembl.transcript_name print(mstrg, ensembl_id, t_id, g_name, sep="\t") ## kick future genes with this id out of the loop g_id.append(g_id_fresh) ## and squash the mstrg list mstrg_list = [] ## if no ENS id is found, save the info for the next round elif "MSTRG" in t_id: line2 = line # if the id hasn't been found yet, and is NOT in the mstrg_list, then: else:
def random_variants(
        count,
        ensembl_release=MAX_ENSEMBL_RELEASE,
        deletions=True,
        insertions=True,
        random_seed=None):
    """
    Generate a VariantCollection with random variants that overlap
    at least one complete coding transcript.

    Parameters
    ----------
    count : int
        Number of variants to generate.

    ensembl_release : int
        Ensembl release whose transcripts are sampled.

    deletions : bool
        Allow the empty string as an alternate allele.

    insertions : bool
        Allow two-nucleotide alternate alleles.

    random_seed : int, optional
        Seed for the private random number generator.

    Raises ValueError if `count` variants could not be produced within
    `count * 100` sampling attempts.
    """
    rng = random.Random(random_seed)
    ensembl = EnsemblRelease(ensembl_release)

    if ensembl_release in _transcript_ids_cache:
        transcript_ids = _transcript_ids_cache[ensembl_release]
    else:
        transcript_ids = ensembl.transcript_ids()
        _transcript_ids_cache[ensembl_release] = transcript_ids

    # the two-nucleotide alternates do not depend on the sampled position
    nucleotide_pairs = [
        x + y
        for x in STANDARD_NUCLEOTIDES
        for y in STANDARD_NUCLEOTIDES
    ]

    variants = []
    # Every skipped transcript costs one attempt; cap the attempts so that a
    # problem with the annotation data cannot turn into an infinite loop.
    attempts_left = count * 100
    while len(variants) < count:
        if attempts_left <= 0:
            raise ValueError(
                ("Unable to generate %d random variants, "
                 "there may be a problem with PyEnsembl") % count)
        attempts_left -= 1

        transcript_id = rng.choice(transcript_ids)
        transcript = ensembl.transcript_by_id(transcript_id)

        if not transcript.complete:
            continue

        exon = rng.choice(transcript.exons)
        base1_genomic_position = rng.randint(exon.start, exon.end)
        transcript_offset = transcript.spliced_offset(base1_genomic_position)

        try:
            seq = transcript.sequence
        except ValueError as e:
            logging.warning(e)
            # can't get sequence for non-coding transcripts
            continue

        ref = str(seq[transcript_offset])
        if transcript.on_backward_strand:
            ref = reverse_complement(ref)

        alt_nucleotides = [x for x in STANDARD_NUCLEOTIDES if x != ref]
        if insertions:
            alt_nucleotides.extend(nucleotide_pairs)
        if deletions:
            alt_nucleotides.append("")
        alt = rng.choice(alt_nucleotides)

        variants.append(Variant(
            transcript.contig,
            base1_genomic_position,
            ref=ref,
            alt=alt,
            ensembl=ensembl))
    return VariantCollection(variants)
def random_variants(
        count,
        ensembl_release=MAX_ENSEMBL_RELEASE,
        deletions=True,
        insertions=True,
        random_seed=None):
    """
    Generate a VariantCollection with random variants that overlap
    at least one complete coding transcript.

    Parameters
    ----------
    count : int
        Number of variants to generate; zero yields an empty collection.

    ensembl_release : int
        Ensembl release whose transcripts are sampled.

    deletions : bool
        Allow the empty string as an alternate allele.

    insertions : bool
        Allow two-nucleotide alternate alleles.

    random_seed : int, optional
        Seed for the private random number generator.

    Raises ValueError if `count` variants could not be produced within
    `count * 100` sampling attempts.
    """
    rng = random.Random(random_seed)
    ensembl = EnsemblRelease(ensembl_release)

    if ensembl_release in _transcript_ids_cache:
        transcript_ids = _transcript_ids_cache[ensembl_release]
    else:
        transcript_ids = ensembl.transcript_ids()
        _transcript_ids_cache[ensembl_release] = transcript_ids

    # the two-nucleotide alternates do not depend on the sampled position
    nucleotide_pairs = [
        x + y
        for x in STANDARD_NUCLEOTIDES
        for y in STANDARD_NUCLEOTIDES
    ]

    variants = []
    # we should finish way before the attempts run out but just in case
    # something is wrong with PyEnsembl we want to avoid an infinite loop.
    # Completion is tested before the budget, so count == 0 and a final
    # variant produced on the very last attempt both return normally.
    attempts_left = count * 100
    while len(variants) < count:
        if attempts_left <= 0:
            raise ValueError(
                ("Unable to generate %d random variants, "
                 "there may be a problem with PyEnsembl") % count)
        attempts_left -= 1

        transcript_id = rng.choice(transcript_ids)
        transcript = ensembl.transcript_by_id(transcript_id)

        if not transcript.complete:
            continue

        exon = rng.choice(transcript.exons)
        base1_genomic_position = rng.randint(exon.start, exon.end)
        transcript_offset = transcript.spliced_offset(base1_genomic_position)
        seq = transcript.sequence

        ref = str(seq[transcript_offset])
        if transcript.on_backward_strand:
            ref = reverse_complement(ref)

        alt_nucleotides = [x for x in STANDARD_NUCLEOTIDES if x != ref]
        if insertions:
            alt_nucleotides.extend(nucleotide_pairs)
        if deletions:
            alt_nucleotides.append("")
        alt = rng.choice(alt_nucleotides)

        variants.append(Variant(
            transcript.contig,
            base1_genomic_position,
            ref=ref,
            alt=alt,
            ensembl=ensembl))
    return VariantCollection(variants)
class LlamaEnsembl(object):
    """ Ensembl tools

    Combines a local pyensembl database (``self.db``) with calls to the
    Ensembl REST service at ``self.rest_url``. ``genome='hg19'`` selects
    release 75 and the GRCh37 REST host; any other value selects release 77
    and the default REST host.
    """

    def __init__(self, genome='hg19'):
        if genome == 'hg19':
            self.version = 75
            self.rest_url = "http://grch37.rest.ensembl.org"
        else:
            self.version = 77
            self.rest_url = "http://rest.ensembl.org"
        self.db = EnsemblRelease(self.version)

    def rest_call(self, ext, data=None):
        """ POST `data` (or GET when there is none) to rest_url + ext and
        return the decoded JSON. Raises requests.exceptions.HTTPError on an
        error status. """
        if data:
            headers = {
                "Content-Type": "application/json",
                "Accept": "application/json"
            }
            r = requests.post(self.rest_url + ext, headers=headers, data=data)
        else:
            headers = {"Content-Type": "application/json"}
            r = requests.get(self.rest_url + ext, headers=headers)

        # raises for exactly the responses where ``r.ok`` is False, so no
        # separate check (or exit call) is needed
        r.raise_for_status()
        return r.json()

    def load_ensembl_ref(self, rid=None):
        """ Download, load, and index ensembl data

        Returns the transcript with id `rid`, or None when no id is given.
        """
        # No positional argument: download()'s first parameter is the
        # overwrite flag, so passing the release number there would force a
        # fresh download on every call.
        self.db.download()
        self.db.index()
        if rid is not None:
            return self.db.transcript_by_id(rid)
        return None

    def get_exon_numbers(self, gene):
        """ This creates exon areas from the biggest transcript

        "Biggest" means the transcript of the gene with the most exons (the
        first one wins a tie). Returns a DataFrame with the columns start,
        id, stop, transcript and a 1-based 'number'; it is empty when the
        gene has no transcript with exons.
        """
        dct = {'start': [], 'id': [], 'stop': [], 'transcript': []}
        gene_id = self.db.gene_ids_of_gene_name(gene)[0]

        most_exons = 0
        best_exon_ids = []
        best_transcript = None
        for trans in self.db.transcript_ids_of_gene_id(gene_id):
            exon_ids = self.db.exon_ids_of_transcript_id(trans)
            if len(exon_ids) > most_exons:
                most_exons = len(exon_ids)
                best_exon_ids = exon_ids
                best_transcript = trans

        for exid in best_exon_ids:
            exon = self.db.exon_by_id(exid)
            dct['start'].append(exon.start)
            dct['stop'].append(exon.end)
            dct['id'].append(exid)
            dct['transcript'].append(best_transcript)

        df = pd.DataFrame(dct)
        df['number'] = df.index + 1
        return df

    def get_genes(self, chrom, start, stop):
        """ Names of the genes overlapping chrom:start-stop """
        if isinstance(chrom, str):
            chrom = chrom.replace('chr', '')
        return [
            gobj.gene_name
            for gobj in self.db.genes_at_locus(chrom, start, stop)
        ]

    def get_gene_pos(self, gene):
        """ (contig, start, end) of the first gene id found for `gene` """
        gene_id = self.db.gene_ids_of_gene_name(gene)[0]
        result = self.db.gene_by_id(gene_id)
        return result.contig, result.start, result.end

    # Rest client calls
    def get_rsids(self, rsids):
        """ POST a list of variant ids to the variation endpoint """
        ext = "/variation/homo_sapiens"
        data = {"ids": rsids}
        return self.rest_call(ext, json.dumps(data))

    def get_cds_region(self, transcript, position):
        """ get location of variant to

        Returns (seq_region_name, start, end) of the first mapping, or
        three empty strings when the request fails or has no mapping.
        """
        ext = "/variation/human/{}:{}?".format(transcript, position)
        try:
            mappings = self.rest_call(ext)['mappings'][0]
        except (requests.exceptions.HTTPError, KeyError, IndexError):
            return '', '', ''
        return mappings['seq_region_name'], mappings['start'], mappings['end']

    def parse_ref_exons(self, chrom, start, stop, gene=None, tx_col=None):
        """ Return fasta reference with only the sequences needed

        Returns (transcript id, comma-separated exon numbers) for the exons
        overlapping the locus, numbered on the biggest transcript of `gene`
        (or of the first overlapping exon's gene when `gene` is not given).
        Returns ('', '') when no exon overlaps. `tx_col` is accepted but
        not used.
        """
        if isinstance(chrom, str):
            chrom = chrom.replace('chr', '')

        exons = self.db.exons_at_locus(chrom, start, stop)
        if not exons:
            return '', ''

        # honour the gene the caller selected; fall back to the first exon
        gene_name = gene or exons[0].gene_name
        exon_numbers = self.get_exon_numbers(gene_name)
        if exon_numbers.empty:
            return '', ''
        transcript = exon_numbers['transcript'].values[0]

        trx_exons = []
        for ex in exons:
            nrow = exon_numbers[exon_numbers['id'] == ex.exon_id]
            if nrow.shape[0] > 0:
                trx_exons.extend(nrow['number'].values)

        return transcript, ','.join([str(number) for number in trx_exons])

    # Annotate DataFrames
    def annotate_dataframe(self, df, chrom_col='CHROM', start_col='START',
                           end_col='END', gene_col=None, tx_col=None):
        """ Build a DataFrame (same index as `df`) with the columns genes,
        exons and transcript for the region described by each row. """
        genes = []
        exons = []
        transcripts = []
        for i, row in df.iterrows():
            genes_row = self.get_genes(row[chrom_col], row[start_col],
                                       row[end_col])
            if gene_col:
                if row[gene_col] in genes_row:
                    genes_row = [row[gene_col]]
                else:
                    print(
                        'Warning!! {} not found for {}:{}-{} in row {}'.format(
                            row[gene_col], row[chrom_col], row[start_col],
                            row[end_col], i))
            genes.append(','.join(genes_row))

            # genes_row[0] is only read when the list is non-empty
            if len(genes_row) == 1 or (tx_col and genes_row):
                # TODO - add fucntionality to choose gene and transcript
                trans_row, exons_row = self.parse_ref_exons(
                    row[chrom_col], row[start_col], row[end_col],
                    gene=genes_row[0], tx_col=tx_col)
            elif len(genes_row) == 0:
                trans_row, exons_row = self.parse_ref_exons(
                    row[chrom_col], row[start_col], row[end_col],
                    tx_col=tx_col)
            else:
                trans_row = ''
                exons_row = ''
            exons.append(exons_row)
            transcripts.append(trans_row)

        new_df = pd.DataFrame(
            {
                'genes': genes,
                'exons': exons,
                'transcript': transcripts
            },
            index=df.index)
        return new_df

    def annotate_variants(self, rsid_array, extra_cols=()):
        """ Get chom:start-end for a list of variants

        Variants missing from the response or without any mapping are
        skipped. Each name in `extra_cols` is copied from the variant's
        response entry into a column of the same name.
        """
        result = {
            'chrom': [],
            'start': [],
            'end': [],
            'rsid': [],
            'allele': [],
            'vartype': [],
            'consequence': []
        }
        for extra in extra_cols:
            result[extra] = []

        response = self.get_rsids(rsid_array)
        for var in rsid_array:
            if var not in response:
                continue
            entry = response[var]
            mappings = entry.get('mappings')
            if not mappings:
                continue
            mapping = mappings[0]
            result['chrom'].append(mapping['seq_region_name'])
            result['start'].append(mapping['start'])
            result['end'].append(mapping['end'])
            result['rsid'].append(var)
            result['allele'].append(mapping['allele_string'])
            result['vartype'].append(entry['var_class'])
            result['consequence'].append(entry['most_severe_consequence'])
            for extra in extra_cols:
                result[extra].append(entry[extra])
        return pd.DataFrame(result)

    def annotate_cds_regions(self, df, tx_col='NM', cds_col='MutationName'):
        """ Add chrom/start/end columns to `df` (modified in place and
        returned) from one get_cds_region lookup per row. """
        chroms = []
        starts = []
        ends = []
        for _, row in df.iterrows():
            chrom, start, end = self.get_cds_region(row[tx_col], row[cds_col])
            chroms.append(chrom)
            starts.append(start)
            ends.append(end)
        df['chrom'] = chroms
        df['start'] = starts
        df['end'] = ends
        return df