def test_mhc_predictor_error():
    """predict_epitopes must swallow per-prediction errors and return an empty result."""
    genome = EnsemblRelease(species="mouse")
    transcript = genome.transcripts_by_name("Wdr13-001")[0]
    fragment = MutantProteinFragment(
        variant=Variant('X', '8125624', 'C', 'A'),
        gene_name='Wdr13',
        amino_acids='KLQGHSAPVLDVIVNCDESLLASSD',
        mutant_amino_acid_start_offset=12,
        mutant_amino_acid_end_offset=13,
        n_overlapping_reads=71,
        n_alt_reads=25,
        n_ref_reads=46,
        n_alt_reads_supporting_protein_sequence=2,
        supporting_reference_transcripts=[transcript])

    class AlwaysFailingPredictor:
        # raises on every call; vaxrank should not crash because of it
        def predict_subsequences(self, _):
            raise ValueError('I throw an error in your general direction')

    predictions = predict_epitopes(
        mhc_predictor=AlwaysFailingPredictor(),
        protein_fragment=fragment,
        genome=genome)
    eq_(0, len(predictions))
def main(opts):
    """Extract protein IDs and sequences for both partners of each fusion."""
    genome = EnsemblRelease(95)  # Ensembl release 95 annotations
    fusions = pd.read_csv(opts['input'], sep='\t')

    records = []
    for _, row in fusions.iterrows():
        # one output record per fusion partner, 5' first then 3'
        for prefix in ("5'", "3'"):
            gene = row[prefix + '_gene']
            tx_id = row[prefix + '_transcript']
            tx = genome.transcript_by_id(tx_id)
            records.append([gene, tx_id, tx.protein_id, tx.protein_sequence])

    out = pd.DataFrame(
        records,
        columns=['gene', 'transcript_id', 'protein_id', 'protein_sequence'])
    # drop repeats of the same gene/transcript/protein triple before saving
    out.drop_duplicates(
        subset=['gene', 'transcript_id', 'protein_id']).to_csv(
            opts['output'], sep='\t', index=False)
def __init__(self, VCFFileObject, cytoband, mg, es):
    """Build an annotation context around a parsed VCF file.

    Args:
        VCFFileObject: parsed VCF wrapper exposing GenomeBuild, header, vcf,
            info_dict and format_dict attributes.
        cytoband: cytoband lookup structure.
        mg: presumably a mygene query client -- TODO confirm.
        es: presumably an Elasticsearch client for bulk indexing -- TODO confirm.
    """
    self.VCFFile = VCFFileObject
    self.cytobandDict = cytoband
    self.mg = mg
    self.es = es
    self.GeneToProteinDict = {}
    self.GenomeBuild = self.VCFFile.GenomeBuild
    # one pyensembl release per supported human genome build
    self.datahg19 = EnsemblRelease(75)  # GRCh37 / hg19
    self.datahg38 = EnsemblRelease(87)  # GRCh38 / hg38
    self.datahg18 = EnsemblRelease(54)  # NCBI36 / hg18
    self.header = self.VCFFile.header
    self.vcf = self.VCFFile.vcf
    self.info_dict = self.VCFFile.info_dict
    self.format_dict = self.VCFFile.format_dict
    self.organism = "H**o Sapiens"
    self.snpLink = "None"
    # default reference build until detected otherwise
    self.referenceBuild = 75
    self.server = 'http://rest.ensembl.org'
    # per-variant annotation fields, reset to the string "None" (not NoneType)
    self.geneName = "None"
    self.ENSG = "None"
    self.geneLink = "None"
    self.cytoband = "None"
    self.mapping_url = 'http://www.uniprot.org/mapping/'
    self.uniprot_url = 'http://www.uniprot.org/uniprot/'
    self.proteinName = "None"
    self.UniProtLink = "None"
    self.genotype_format = {}
    self.vcf_info = {}
    # accumulator for Elasticsearch bulk operations
    self.bulk_action = []
def ENSEMBLID_to_geneSymbol(ENSEMBL, Ensembl_Release=75):
    """Map Ensembl gene ID(s) to gene symbol(s).

    Args:
        ENSEMBL: a single Ensembl gene ID (str) or a list of IDs.
        Ensembl_Release: Ensembl release number to query (default 75 / GRCh37).

    Returns:
        A gene symbol for a single ID, or a list of symbols for a list of IDs.
    """
    data = EnsemblRelease(Ensembl_Release)
    # isinstance (not `type(...) is list`) also accepts list subclasses
    if isinstance(ENSEMBL, list):
        return list(map(data.gene_name_of_gene_id, ENSEMBL))
    return data.gene_name_of_gene_id(ENSEMBL)
def random_variants(count,
                    ensembl_release=MAX_ENSEMBL_RELEASE,
                    deletions=True,
                    insertions=True,
                    random_seed=None):
    """
    Generate a VariantCollection with random variants that overlap
    at least one complete coding transcript.

    Parameters
    ----------
    count : int
        Number of variants to generate.
    ensembl_release : int
        Ensembl release whose transcripts are sampled.
    deletions : bool
        Allow single-base deletions (empty alt allele).
    insertions : bool
        Allow two-base alt alleles (insertions).
    random_seed : int or None
        Seed for reproducible sampling.
    """
    rng = random.Random(random_seed)
    ensembl = EnsemblRelease(ensembl_release)
    # transcript id lists are expensive to fetch, so cache them per release
    if ensembl_release in _transcript_ids_cache:
        transcript_ids = _transcript_ids_cache[ensembl_release]
    else:
        transcript_ids = ensembl.transcript_ids()
        _transcript_ids_cache[ensembl_release] = transcript_ids
    variants = []
    while len(variants) < count:
        transcript_id = rng.choice(transcript_ids)
        transcript = ensembl.transcript_by_id(transcript_id)
        if not transcript.complete:
            continue
        exon = rng.choice(transcript.exons)
        base1_genomic_position = rng.randint(exon.start, exon.end)
        transcript_offset = transcript.spliced_offset(base1_genomic_position)
        try:
            seq = transcript.sequence
        except ValueError as e:
            # can't get sequence for non-coding transcripts;
            # logging.warn is deprecated -> use logging.warning
            logging.warning(e)
            continue
        ref = str(seq[transcript_offset])
        if transcript.on_backward_strand:
            ref = reverse_complement(ref)
        # any single base other than the reference makes a substitution
        alt_nucleotides = [x for x in STANDARD_NUCLEOTIDES if x != ref]
        if insertions:
            nucleotide_pairs = [
                x + y
                for x in STANDARD_NUCLEOTIDES
                for y in STANDARD_NUCLEOTIDES
            ]
            alt_nucleotides.extend(nucleotide_pairs)
        if deletions:
            alt_nucleotides.append("")
        alt = rng.choice(alt_nucleotides)
        variant = Variant(transcript.contig,
                          base1_genomic_position,
                          ref=ref,
                          alt=alt,
                          ensembl=ensembl)
        variants.append(variant)
    return VariantCollection(variants)
def __init__(self, release, species, output, best_file=None, alias_file=None, custom_cache=None):
    """Build an annotation cache for a given Ensembl release and species.

    Args:
        release: Ensembl release number.
        species: species name understood by pyensembl.
        output: output destination for the generated annotation
            (presumably a path/prefix -- TODO confirm against callers).
        best_file: optional path to a precomputed best-transcript file.
        alias_file: optional path to a gene-alias file.
        custom_cache: optional custom cache location -- TODO confirm semantics.
    """
    self.annotation = {}
    self.custom_cache = custom_cache
    self.cache_prefix = None
    self.gen_time = get_date()
    self.release = release
    self.species = species
    self.output = output
    self.best_file = best_file
    self.alias_file = alias_file
    # aliases default to an empty mapping when no alias file was supplied
    if self.alias_file:
        self.alias = parse_alias_file(self.alias_file)
    else:
        self.alias = defaultdict(set)
    self.data = EnsemblRelease(release, species)
    self.download_pyensembl_cache()
    self.get_domain_cache()
    # use the provided best-transcript list, otherwise derive one
    if self.best_file:
        self.best = parse_best_file(self.best_file)
    else:
        self.best = self.choose_best_transcripts()
    self.build_json()
def main():
    """Annotate each row of a CR-delimited locus file with gene names.

    Reads sys.argv[1], appends comma-separated gene names found at each
    row's locus, and writes the result (plus a footer) to sys.argv[2].
    """
    # fetch before running program using command:
    # pyensembl install --release <list of Ensembl release numbers> --species <species-name>
    ensembl_num = 81
    gen_ref = EnsemblRelease(ensembl_num)
    # context managers so both files are closed even on error
    # (the original leaked the input handle entirely)
    with open(sys.argv[1]) as in_handle:
        input_file = in_handle.read().split("\r")  # input uses CR line endings
    with open(sys.argv[2], "w") as output_file:
        firstline = True
        for line in input_file:
            if firstline:
                # pass the header row through unchanged
                output_file.write(line + "\r")
                firstline = False
                continue
            parameters = strip_values(line)
            if parameters == "nothing":
                # unparseable row: copy through unchanged
                output_file.write(line + "\r")
            else:
                gene_name = gen_ref.gene_names_at_locus(
                    contig=parameters[0],
                    position=parameters[1],
                    end=parameters[2])
                # removed redundant `[0:len(gene_name)]` slice
                for gene in gene_name:
                    line = line + "," + gene
                output_file.write(line + "\r")
        output_file.write("Generated on " + time.strftime("%m/%d/%Y") +
                          " with Ensemble Release " + str(ensembl_num) +
                          " and locus_serach.py v1.")
def test_mhc_predictor_error():
    """Errors raised by the MHC predictor must not propagate out of predict_epitopes."""
    genome = EnsemblRelease(species="mouse")
    supporting_tx = genome.transcripts_by_name("Wdr13-001")[0]

    class BrokenPredictor:
        # fails on every prediction request
        def predict_subsequences(self, x):
            raise ValueError('I throw an error in your general direction')

    mutant_fragment = MutantProteinFragment(
        variant=Variant('X', '8125624', 'C', 'A'),
        gene_name='Wdr13',
        amino_acids='KLQGHSAPVLDVIVNCDESLLASSD',
        mutant_amino_acid_start_offset=12,
        mutant_amino_acid_end_offset=13,
        n_overlapping_reads=71,
        n_alt_reads=25,
        n_ref_reads=46,
        n_alt_reads_supporting_protein_sequence=2,
        supporting_reference_transcripts=[supporting_tx])

    result = predict_epitopes(
        mhc_predictor=BrokenPredictor(),
        protein_fragment=mutant_fragment,
        genome=genome)
    eq_(0, len(result))
def __init__(self, genome='hg19'):
    """Select the Ensembl release and REST endpoint matching the genome build."""
    if genome == 'hg19':
        # GRCh37 lives on a dedicated archive endpoint
        version, rest = 75, "http://grch37.rest.ensembl.org"
    else:
        version, rest = 77, "http://rest.ensembl.org"
    self.version = version
    self.rest_url = rest
    self.db = EnsemblRelease(self.version)
class ScrapeEnsembl():
    '''Look up gene and canonical-transcript information for a genomic
    position using pyensembl.
    '''
    def __init__(self, query, hg_version):
        # strip any "chr" prefix from the query string
        self.query = query.replace("chr","")
        self.hg_version = ScrapeEnsembl.genome.get(hg_version)  # convert to ensembl release
        self.hg = EnsemblRelease(self.hg_version)  # convert to ensembl release object

    # human genome build name -> Ensembl release number
    genome = {"hg19": 75, "hg38": 83}

    def get_gene_info(self):
        ''' Get the gene information at a given genomic position.

        Returns a tuple (name, id, biotype, location) for the first gene
        at the locus, or an error-message string when none is found.
        Returns None when the query does not look like chrom:pos.
        '''
        # check if the input is a genomic position or genomic range
        if re.search(r"[-:]", self.query) and self.query.replace(":","").isdigit():
            chrom = int(self.query.split(":")[0])
            pos = int(self.query.split(":")[1])
            gene_name = self.hg.gene_names_at_locus(contig=chrom, position=pos)
            if not gene_name:
                msg = " ".join(("No gene found at",self.query,"for genome version",
                                str(self.hg_version)))
                return msg
            gene_info = self.hg.genes_by_name(gene_name[0])
            # gene_info[0].loaction doesn't work, hence the mess below:
            # the location is scraped out of the object's repr string
            gene_location = str(gene_info[0]).split(",")[-1][:-1].split("=")[1]
            gene_info = (gene_info[0].name, gene_info[0].id,
                         gene_info[0].biotype, gene_location)
            return(gene_info)

    def get_canonical_transcript(self, gene_name):
        ''' Determine and return the canonical transcript of the given gene.

        Canonical = the largest protein-coding transcript by genomic span,
        parsed out of each transcript object's repr string.
        Returns None when the gene has no protein-coding transcript.
        '''
        all_transcripts = self.hg.transcript_ids_of_gene_name(gene_name)
        all_transcript_details = [self.hg.transcript_by_id(x)
                                  for x in all_transcripts]
        protein_coding_transcripts = []
        for x in all_transcript_details:
            # NOTE(review): fixed field positions in the repr string are
            # fragile across pyensembl versions -- confirm indices 1 and 9
            split_transcript_info = re.split(r"[=,]",str(x))
            transcript = split_transcript_info[1]
            transcript_type = split_transcript_info[9]
            location = split_transcript_info[-1][:-1]
            start = re.split(r"[:-]", location)[1]
            stop = re.split(r"[:-]", location)[2]
            size = int(stop) - int(start)
            if transcript_type == "protein_coding":
                protein_coding_transcripts.append((size,transcript,transcript_type))
        # sort by size and return the largest protein coding transcript
        if protein_coding_transcripts:
            canonical_transcript = sorted(protein_coding_transcripts)[-1][1]
            return canonical_transcript
def mapping_id(self):
    """Attach Ensembl gene IDs for both protein interaction partners."""
    esb = EnsemblRelease(77)
    lookup = esb.gene_id_of_protein_id
    self.data_filtered['ensembl_id_1'] = [
        lookup(p) for p in self.data_filtered.protein1
    ]
    self.data_filtered['ensembl_id_2'] = [
        lookup(p) for p in self.data_filtered.protein2
    ]
def _test_db_index(mock_index, db_exists):
    """
    Return True if the GTF database gets created, which should be different
    depending on whether the database already existed.

    Note: we need to mock the reference transcript indexing, as we're testing
    GTF indexing.
    """
    genome = EnsemblRelease(54)
    # pretend the on-disk database does/doesn't exist
    genome.db._connect_if_exists = Mock(return_value=db_exists)
    genome.db._create_database = Mock()
    genome.index(force=False)
    return genome.db._create_database.called
def main():
    """Index nanopolish eventalign output and build per-position JSON data."""
    args = get_args()
    # was `# n_processes = args.n_processes` -- but n_processes is used
    # below, which raised NameError at runtime; bind it explicitly
    n_processes = args.n_processes
    eventalign_filepath = args.eventalign
    summary_filepath = args.summary
    chunk_size = args.chunk_size
    out_dir = args.out_dir
    ensembl_version = args.ensembl
    ensembl_species = args.species
    readcount_min = args.readcount_min
    readcount_max = args.readcount_max
    resume = args.resume
    genome = args.genome
    customised_genome = args.customised_genome
    if customised_genome and (None in [
            args.reference_name, args.annotation_name, args.gtf_path_or_url,
            args.transcript_fasta_paths_or_urls]):
        print('If you have your own customised genome not in Ensembl, please provide the following')
        print('- reference_name')
        print('- annotation_name')
        print('- gtf_path_or_url')
        print('- transcript_fasta_paths_or_urls')
        # abort: continuing would crash below on the missing names
        return
    reference_name = args.reference_name
    annotation_name = args.annotation_name
    gtf_path_or_url = args.gtf_path_or_url
    transcript_fasta_paths_or_urls = args.transcript_fasta_paths_or_urls

    misc.makedirs(out_dir)  # todo: check every level.

    # (1) For each read, combine multiple events aligned to the same positions,
    # the results from nanopolish eventalign, into a single event per position.
    if not args.skip_eventalign_indexing:
        parallel_index(eventalign_filepath, summary_filepath, chunk_size,
                       out_dir, n_processes, resume)

    # (2) Create a .json file, where the info of all reads are stored per
    # position, for modelling.
    if genome:
        if customised_genome:
            db = Genome(
                reference_name=reference_name,
                annotation_name=annotation_name,
                gtf_path_or_url=gtf_path_or_url,
                transcript_fasta_paths_or_urls=transcript_fasta_paths_or_urls)
            # parse GTF and construct database of genomic features
            db.index()
        else:
            # Default: human reference genome GRCh38 release 91 used in the ont mapping.
            db = EnsemblRelease(ensembl_version, ensembl_species)
        parallel_preprocess_gene(eventalign_filepath, db, out_dir, n_processes,
                                 readcount_min, readcount_max, resume)
    else:
        parallel_preprocess_tx(eventalign_filepath, out_dir, n_processes,
                               readcount_min, readcount_max, resume)
def main():
    """Combine eventalign events per position and build JSON for modelling."""
    args = get_args()
    # was `# n_processes = args.n_processes` -- but n_processes is used
    # below, which raised NameError at runtime; bind it explicitly
    n_processes = args.n_processes
    eventalign_filepath = args.eventalign
    summary_filepath = args.summary
    out_dir = args.out_dir
    ensembl_version = args.ensembl
    ensembl_species = args.species
    readcount_min = args.readcount_min
    readcount_max = args.readcount_max
    resume = args.resume
    genome = args.genome

    misc.makedirs(out_dir)  # todo: check every level.

    # (1) For each read, combine multiple events aligned to the same positions,
    # the results from nanopolish eventalign, into a single event per position.
    eventalign_log_filepath = os.path.join(out_dir, 'eventalign.log')
    # skip the combine step when a previous run already finished it
    if not helper.is_successful(eventalign_log_filepath):
        parallel_combine(eventalign_filepath, summary_filepath, out_dir,
                         n_processes, resume)

    # (2) Create a .json file, where the info of all reads are stored per
    # position, for modelling.
    if genome:
        # Default: human reference genome GRCh38 release 91 used in the ont mapping.
        ensembl = EnsemblRelease(ensembl_version, ensembl_species)
        parallel_preprocess_gene(ensembl, out_dir, n_processes,
                                 readcount_min, readcount_max, resume)
    else:
        parallel_preprocess_tx(out_dir, n_processes, readcount_min,
                               readcount_max, resume)
def test_drop_duplicates():
    """Identical variants collapse to a single entry in a VariantCollection."""
    ensembl = EnsemblRelease(78)
    first = Variant("1", 3000, "A", "G", ensembl=ensembl)
    duplicate = Variant("1", 3000, "A", "G", ensembl=ensembl)
    other = Variant("2", 10, "G", "T", ensembl=ensembl)
    # same object twice plus an equal copy should still count once
    deduped = VariantCollection(variants=[first, first, duplicate, other])
    assert len(deduped) == 2
def _get_annotation(adata, retries=3):
    """Insert gene coordinate meta data into ``adata.var``.

    Downloads/indexes the pyensembl annotation named in ``adata.uns`` and
    adds chr/start/end/strand columns to ``adata.var`` (NaN for unknown IDs).
    """
    from pyensembl import EnsemblRelease
    data = EnsemblRelease(
        adata.uns["release"],
        adata.uns["species"],
    )
    # best-effort download/index with retry on network timeouts
    for _ in range(retries):
        try:
            with patch_datacache():
                data.download(overwrite=False)
                data.index(overwrite=False)
            break
        except TimeoutError:
            # NOTE(review): after `retries` timeouts this proceeds silently
            # with a possibly missing index -- confirm that is intended
            pass
    # get ensemble gene coordinate
    genes = []
    # var index entries look like "ENSGxxxx.N" -- strip the version suffix
    for i in adata.var.index.map(lambda x: x.split(".")[0]):
        try:
            gene = data.gene_by_id(i)
            genes.append([
                "chr%s" % gene.contig,
                gene.start,
                gene.end,
                gene.strand,
            ])
        except ValueError:
            # unknown gene id: keep row alignment with NaN placeholders
            genes.append([np.nan, np.nan, np.nan, np.nan])
    old_col = adata.var.columns.values
    adata.var = pd.concat(
        [adata.var, pd.DataFrame(genes, index=adata.var_names)], axis=1)
    # restore original column names plus the four new coordinate columns
    adata.var.columns = np.hstack(
        [old_col, np.array(["chr", "start", "end", "strand"])])
def main(opts):
    """Replace fusion gene symbols with a chosen canonical transcript ID.

    Reads breakpoint pairs, merges in MANE-style and custom transcript
    tables, picks a transcript per partner, and writes a 4-column TSV.
    """
    # read in data
    df = pd.read_csv(opts['input'], sep='\t', header=None,
                     names=['Gene1', 'Break1', 'Gene2', 'Break2'])
    tx = pd.read_csv(opts['transcript'], sep='\t')

    # merge in transcript (one left-join per fusion partner)
    rename_dict = {'symbol': 'Gene1', 'Ensembl_nuc': 'Transcript1'}
    df = pd.merge(df, tx.rename(columns=rename_dict)[['Gene1', 'Transcript1']],
                  on='Gene1', how='left')
    rename_dict = {'symbol': 'Gene2', 'Ensembl_nuc': 'Transcript2'}
    df = pd.merge(df, tx.rename(columns=rename_dict)[['Gene2', 'Transcript2']],
                  on='Gene2', how='left')

    # remove the transcript version number (keep text before the first '.')
    tmp1 = df['Transcript1'].str.split('.', expand=True)[0]
    tmp2 = df['Transcript2'].str.split('.', expand=True)[0]
    df['Transcript1'] = tmp1
    df['Transcript2'] = tmp2

    # merge in custom transcript
    custom_tx = pd.read_csv(opts['custom'], sep='\t')
    if len(custom_tx):
        # merge custom tx
        rename_dict = {'gene': 'Gene1', 'transcript_id': 'custom_transcript1'}
        df = pd.merge(df,
                      custom_tx.rename(columns=rename_dict)[['Gene1', 'custom_transcript1']],
                      on='Gene1', how='left')
        rename_dict = {'gene': 'Gene2', 'transcript_id': 'custom_transcript2'}
        df = pd.merge(df,
                      custom_tx.rename(columns=rename_dict)[['Gene2', 'custom_transcript2']],
                      on='Gene2', how='left')

        # replace mane tx with custom tx wherever a custom one exists
        is_not_null = ~df['custom_transcript1'].isnull()
        df.loc[is_not_null, 'Transcript1'] = df.loc[is_not_null, 'custom_transcript1']
        is_not_null = ~df['custom_transcript2'].isnull()
        df.loc[is_not_null, 'Transcript2'] = df.loc[is_not_null, 'custom_transcript2']

    # Check transcripts against Ensembl release 95
    data = EnsemblRelease(95)

    # figure out the appropriate canonical tx
    output_list = []
    for ix, row in df.iterrows():
        # get transcript
        mytx1 = pick_transcript(data, row['Gene1'], row['Break1'], row['Transcript1'])
        mytx2 = pick_transcript(data, row['Gene2'], row['Break2'], row['Transcript2'])
        # fill in whether to replace gene ID with transcript ID
        mygene1 = mytx1 if mytx1 else row['Gene1']
        mygene2 = mytx2 if mytx2 else row['Gene2']
        # append output
        output_list.append([mygene1, row['Break1'], mygene2, row['Break2']])

    # save output file (tab-separated, no header)
    with open(opts['output'], 'w') as whandle:
        mywriter = csv.writer(whandle, delimiter='\t', lineterminator='\n')
        mywriter.writerows(output_list)
def ensembl_to_sym2(df):
    """Re-index an Ensembl-ID-indexed DataFrame by gene symbol.

    IDs without an Ensembl record are dropped, duplicate symbols keep the
    first occurrence, and values are coerced to numeric (NaN on failure).
    """
    import pandas as pd
    from pyensembl import EnsemblRelease
    data = EnsemblRelease(87)
    ensml_ids = df.index.tolist()
    sym_gene_list = []
    ensembl_to_sym_dict = {}
    no_ensembl_list = []
    for e in ensml_ids:
        try:
            g = data.gene_by_id(e)
        except ValueError:
            no_ensembl_list.append(e)
            # bug fix: the original fell through here and reused the stale
            # `g` from the previous iteration (NameError on the first)
            continue
        name = g.gene_name.strip()
        if name not in sym_gene_list:
            ensembl_to_sym_dict[e] = name
            sym_gene_list.append(name)
    df.drop(no_ensembl_list, inplace=True)
    renamed_df = df.rename(index=ensembl_to_sym_dict)
    renamed_df_grouped = renamed_df.groupby(renamed_df.index).first()
    # DataFrame.convert_objects was removed from pandas; to_numeric with
    # errors='coerce' matches convert_numeric=True (non-numbers -> NaN)
    numeric_df = renamed_df_grouped.apply(pd.to_numeric, errors='coerce')
    return(numeric_df)
def __init__(self, bounds, orients, res=10000, default_chrom='chrN', species='human', release=97, filter_=None):
    """Map genes onto assembled blocks and expose them as a text stream.

    Args:
        bounds: block boundary definitions passed through to get_blocks.
        orients: block orientations passed through to get_blocks.
        res: resolution in bp (default 10000).
        default_chrom: chromosome name used for the assembled product.
        species: Ensembl species name (default 'human').
        release: Ensembl release number (default 97).
        filter_: optional iterable of gene names to restrict the mapping to.
    """
    ref = EnsemblRelease(release, species=species)
    ref.download()  # no-op when the release is already cached
    ref.index()
    self.ref = ref
    # idiomatic form of the original `not filter_ is None`
    if filter_ is not None:
        # a pre-defined gene list, such as cancer-related genes
        self.filter_list = {g.upper() for g in filter_}
    else:
        self.filter_list = None
    self.blocks = get_blocks(bounds, orients, res)
    self.chrom_name = default_chrom
    self.map_genes()
    # serialize the mapped genes as TSV text for downstream readers
    self.file_handler = io.StringIO('\n'.join(
        ['\t'.join(list(map(str, g))) for g in self.genes]))
def get_ensembl_db( sp, annotation_version ):
    """Download and index the pyensembl database for a species and release.

    Raises DenCellORFException when either step fails.
    """
    logger = Logger.get_instance()
    release_str = str( annotation_version )
    logger.debug( 'EnsemblUtil.get_ensembl_db(): Downloading and indexing the Ensembl' +
                  ' database release ' + release_str + ' for ' + sp + '.' )
    ensembl_db = EnsemblRelease( release = annotation_version, species = sp )
    # Download and index the database if not yet in the temporary folder
    logger.debug( 'EnsemblUtil.get_ensembl_db(): Downloading the Ensembl' +
                  ' database release ' + release_str + ' for ' + sp + '.' )
    try:
        ensembl_db.download()
    except Exception as e:
        raise DenCellORFException( 'EnsemblUtil.get_ensembl_db(): An error occurred trying to' +
                                   ' download the Ensembl database using pyensembl.', e )
    logger.debug( 'EnsemblUtil.get_ensembl_db(): Indexing the Ensembl' +
                  ' database release ' + release_str + ' for ' + sp + '.' )
    try:
        ensembl_db.index()
    except Exception as e:
        raise DenCellORFException( 'EnsemblUtil.get_ensembl_db(): An error occurred trying to' +
                                   ' index the Ensembl database using pyensembl.', e )
    return ensembl_db
def main(opts):
    """Collect protein/breakpoint info from a directory of fusion results.

    Walks each result directory, reads the protein fasta and fusion CSV,
    derives codon positions and chromosomes, and writes one merged TSV.
    """
    # load ensembl db
    data = EnsemblRelease(95)

    output_list = []
    column_list = None
    pattern = os.path.join(opts['input_dir'], '*')
    for d in glob.glob(pattern):
        # figure out break points from the directory name
        # (assumed form: <...>-<break1>_<...>-<break2> -- TODO confirm)
        mybase = os.path.basename(d)
        break1 = mybase.split('_')[0].split('-')[-1]
        break2 = mybase.split('_')[1].split('-')[-1]
        # read fasta
        prot_fa_paths = glob.glob(os.path.join(d, '*_protein.fa'))
        if prot_fa_paths:
            prot_fa_path = prot_fa_paths[0]
            p_id, tx_id, seq = read_fasta(prot_fa_path)
        else:
            # no protein fasta produced for this fusion
            p_id, tx_id, seq = '', '', ''
        # read in fusion info (header row + first data row only)
        fus_info_path = glob.glob(os.path.join(d, '*fusion_transcripts.csv'))[0]
        with open(fus_info_path) as handle:
            myreader = csv.reader(handle, delimiter=',')
            tmp_columns = next(myreader)
            if column_list is None:
                column_list = tmp_columns
            tmp_list = next(myreader)
        # figure out the pos of the break
        codon_pos1, relative_pos1, prot_len1 = get_cds_pos(data, tmp_list[2], int(break1))
        codon_pos2, relative_pos2, prot_len2 = get_cds_pos(data, tmp_list[3], int(break2))
        # figure out the chromosome
        chrom1 = get_chrom(data, tmp_list[2])
        chrom2 = get_chrom(data, tmp_list[3])
        # append results
        output_list.append(tmp_list + [p_id+':'+break1+"-"+break2, tx_id, seq,
                                       chrom1, break1, chrom2, break2,
                                       codon_pos1, codon_pos2,
                                       relative_pos1, relative_pos2,
                                       prot_len1, prot_len2])
    # merge results
    mycols = column_list+['ID', 'TX_ID', 'protein_sequence',
                          'chrom1', 'Break1', 'chrom2', 'Break2',
                          'CodonPos1', 'CodonPos2',
                          'RelativePos1', 'RelativePos2',
                          'ProtLen1', 'ProtLen2']
    output_df = pd.DataFrame(output_list, columns=mycols)

    # add gene ID
    output_df['GENE_ID'] = output_df["5'_gene"] + '--' + output_df["3'_gene"]

    # save results
    output_df.to_csv(opts['output'], sep='\t', index=False)
def test_reference_peptide_logic():
    """Epitopes found in the reference proteome count toward the wildtype
    score; all others count toward the mutant score."""
    genome = EnsemblRelease(species="mouse")
    reference_transcript = genome.transcripts_by_name("Wdr13-001")[0]
    fragment = MutantProteinFragment(
        variant=Variant('X', '8125624', 'C', 'A'),
        gene_name='Wdr13',
        amino_acids='KLQGHSAPVLDVIVNCDESLLASSD',
        mutant_amino_acid_start_offset=12,
        mutant_amino_acid_end_offset=13,
        n_overlapping_reads=71,
        n_alt_reads=25,
        n_ref_reads=46,
        n_alt_reads_supporting_protein_sequence=2,
        supporting_reference_transcripts=[reference_transcript])
    predictions = predict_epitopes(
        mhc_predictor=RandomBindingPredictor(["H-2-Kb"]),
        protein_fragment=fragment,
        genome=genome)

    # 'NCDESLLAS' occurs in protein ENSMUSP00000033506; 'LDVIVNCDE' does not
    in_reference = predictions[('NCDESLLAS', 'H-2-Kb')]
    not_in_reference = predictions[('LDVIVNCDE', 'H-2-Kb')]
    ok_(in_reference.occurs_in_reference)
    ok_(not not_in_reference.occurs_in_reference)

    # a vaccine peptide built from exactly one reference and one novel
    # epitope makes the WT/mutant score attribution easy to verify
    peptide = VaccinePeptide(fragment, [in_reference, not_in_reference])
    eq_(in_reference.logistic_epitope_score(), peptide.wildtype_epitope_score)
    eq_(not_in_reference.logistic_epitope_score(), peptide.mutant_epitope_score)
def fetch_ensembl_release(path=None, release='75'):
    """Get pyensembl genome files.

    Args:
        path: optional cache directory; when given it must be set via the
            PYENSEMBL_CACHE_DIR environment variable BEFORE constructing
            the release -- assigning genome.cache_directory_path after
            download (as the original did) has no effect on where files go.
        release: Ensembl release number as a string (default '75').
    """
    import os
    from pyensembl import Genome, EnsemblRelease
    if path is not None:
        os.environ['PYENSEMBL_CACHE_DIR'] = path
    # this call should download the files
    genome = EnsemblRelease(release, species='human')
    genome.download(overwrite=False)
    genome.index(overwrite=False)
    print('pyensembl genome files cached in %s' % genome.cache_directory_path)
    # run_pyensembl_install()
    return
class Ensembl(object):
    """Thin wrapper around pyensembl gene-name lookups."""

    def __init__(self):
        # Ensembl release 75 (GRCh37)
        self.db = EnsemblRelease(75)

    def annotate_one_gene(self, location):
        """Return gene names overlapping a 'chrom:start-stop' location string."""
        chrom, start, stop = self.parse_location(location)
        return self.db.gene_names_at_locus(chrom, start, stop)

    @staticmethod
    def parse_location(loc):
        """Split 'chrom:start-stop' into (chrom, int(start), int(stop))."""
        start_part, stop_str = loc.split('-')
        chrom, start_str = start_part.split(':')
        return chrom, int(start_str), int(stop_str)
def test_reference_peptide_logic():
    """Reference-proteome membership should route epitope scores to WT vs mutant."""
    mouse_genome = EnsemblRelease(species="mouse")
    tx = mouse_genome.transcripts_by_name("Wdr13-001")[0]
    frag = MutantProteinFragment(
        variant=Variant('X', '8125624', 'C', 'A'),
        gene_name='Wdr13',
        amino_acids='KLQGHSAPVLDVIVNCDESLLASSD',
        mutant_amino_acid_start_offset=12,
        mutant_amino_acid_end_offset=13,
        n_overlapping_reads=71,
        n_alt_reads=25,
        n_ref_reads=46,
        n_alt_reads_supporting_protein_sequence=2,
        supporting_reference_transcripts=[tx])
    preds = predict_epitopes(
        mhc_predictor=RandomBindingPredictor(["H-2-Kb"]),
        protein_fragment=frag,
        genome=mouse_genome)

    allele = 'H-2-Kb'
    # 'NCDESLLAS' occurs in protein ENSMUSP00000033506; 'LDVIVNCDE' does not
    ref_pred = preds[('NCDESLLAS', allele)]
    novel_pred = preds[('LDVIVNCDE', allele)]
    ok_(ref_pred.occurs_in_reference)
    ok_(not novel_pred.occurs_in_reference)

    # build a minimal vaccine peptide so each score has a single contributor
    vp = VaccinePeptide(frag, [ref_pred, novel_pred])
    eq_(ref_pred.logistic_epitope_score(), vp.wildtype_epitope_score)
    eq_(novel_pred.logistic_epitope_score(), vp.mutant_epitope_score)
def parse_ref_exons(self):
    """Return fasta reference with only the sequences needed.

    Builds a 0/1 mask over [self.start, self.stop) marking exonic bases,
    plus the exon-number table restricted to the histogram window.
    """
    ens_db = EnsemblRelease(75)
    try:
        exons = ens_db.exons_at_locus(self.chrom, self.start, self.stop)
    except ValueError as e:
        # Load pyensembl db
        raise e
    region_len = self.stop - self.start
    exon_array = np.zeros(region_len)
    exon_numbers = self.get_exon_numbers(ens_db, exons[0].gene_name)
    for exobj in exons:
        # clamp to the queried window: an exon can extend past
        # [start, stop), and the original per-base while loop then
        # indexed out of bounds (or from the end, for negative offsets)
        lo = max(exobj.start - self.start, 0)
        hi = min(exobj.end - self.start, region_len)
        exon_array[lo:hi] = 1  # vectorized replacement for the while loop
    # 2:29,448,326-29,448,432 exon 19
    # exon 22 start: 29445210
    # exon 18 end: 29449940
    # intron 19: 29446395-29448326
    # ATI initiation 29446768-29448326
    return exon_array, exon_numbers[(exon_numbers['start'] > self.hist[0][0]) &
                                    (exon_numbers['start'] < self.hist[0][-1])]
def gene_Length(list_of_genes, ensembl_id=True, Ensembl_Release=75):
    """Return a DataFrame with each gene's length in kilobases.

    Genes may be given as Ensembl IDs (default) or as gene names.
    """
    gn = EnsemblRelease(Ensembl_Release)
    starts = []
    ends = []
    for gene in list_of_genes:
        # IDs resolve to a single locus; names may have several (take first)
        if ensembl_id:
            locus = gn.locus_of_gene_id(gene)
        else:
            locus = gn.loci_of_gene_names(gene)[0]
        starts.append(locus.start)
        ends.append(locus.end)
    gene_len = np.array(ends) - np.array(starts)
    return pa.DataFrame({
        'gene_names': list_of_genes,
        "gene_length": gene_len / 1000
    })
def fetch_ensembl_release(path=None, release='75'):
    """get pyensembl genome files"""
    from pyensembl import Genome, EnsemblRelease
    # redirect the pyensembl cache before any download happens
    if path is not None:
        os.environ['PYENSEMBL_CACHE_DIR'] = path
    # constructing + downloading + indexing fetches files only when missing
    ensembl = EnsemblRelease(release, species='human')
    ensembl.download()
    ensembl.index()
    return
def test_version_too_old_47():
    # release 47 predates the oldest supported Ensembl release; constructing
    # it is expected to fail (the assertion presumably lives in a raises-style
    # decorator outside this view -- TODO confirm)
    EnsemblRelease(47)
def random_variants(count, ensembl_release=MAX_ENSEMBL_RELEASE, deletions=True, insertions=True, random_seed=None):
    """
    Generate a VariantCollection with random variants that overlap
    at least one complete coding transcript.

    Parameters
    ----------
    count : int
        Number of variants to generate.
    ensembl_release : int
        Ensembl release whose transcripts are sampled.
    deletions : bool
        Allow single-base deletions (empty alt allele).
    insertions : bool
        Allow two-base alt alleles (insertions).
    random_seed : int or None
        Seed for reproducible sampling.
    """
    import logging
    rng = random.Random(random_seed)
    ensembl = EnsemblRelease(ensembl_release)
    # transcript id lists are expensive to fetch, so cache them per release
    if ensembl_release in _transcript_ids_cache:
        transcript_ids = _transcript_ids_cache[ensembl_release]
    else:
        transcript_ids = ensembl.transcript_ids()
        _transcript_ids_cache[ensembl_release] = transcript_ids
    variants = []
    # we should finish way before this loop is over but just in case
    # something is wrong with PyEnsembl we want to avoid an infinite loop
    for _ in range(count * 100):
        if len(variants) < count:
            transcript_id = rng.choice(transcript_ids)
            transcript = ensembl.transcript_by_id(transcript_id)
            if not transcript.complete:
                continue
            exon = rng.choice(transcript.exons)
            base1_genomic_position = rng.randint(exon.start, exon.end)
            transcript_offset = transcript.spliced_offset(
                base1_genomic_position)
            try:
                seq = transcript.sequence
            except ValueError as e:
                # can't get sequence for non-coding transcripts; the
                # unguarded access previously crashed the whole sampler
                logging.warning(e)
                continue
            ref = str(seq[transcript_offset])
            if transcript.on_backward_strand:
                ref = reverse_complement(ref)
            # any single base other than the reference makes a substitution
            alt_nucleotides = [x for x in STANDARD_NUCLEOTIDES if x != ref]
            if insertions:
                nucleotide_pairs = [
                    x + y
                    for x in STANDARD_NUCLEOTIDES
                    for y in STANDARD_NUCLEOTIDES
                ]
                alt_nucleotides.extend(nucleotide_pairs)
            if deletions:
                alt_nucleotides.append("")
            alt = rng.choice(alt_nucleotides)
            variant = Variant(transcript.contig,
                              base1_genomic_position,
                              ref=ref,
                              alt=alt,
                              ensembl=ensembl)
            variants.append(variant)
        else:
            return VariantCollection(variants)
    raise ValueError(("Unable to generate %d random variants, "
                      "there may be a problem with PyEnsembl") % count)
def random_variants(
        count,
        ensembl_release=MAX_ENSEMBL_RELEASE,
        deletions=True,
        insertions=True,
        random_seed=None):
    """
    Generate a VariantCollection with random variants that overlap
    at least one complete coding transcript.

    `random_seed` makes the sampled variants reproducible: the exact
    sequence of rng calls below determines the output.
    """
    rng = random.Random(random_seed)
    ensembl = EnsemblRelease(ensembl_release)
    # transcript id lists are expensive to fetch, so cache them per release
    if ensembl_release in _transcript_ids_cache:
        transcript_ids = _transcript_ids_cache[ensembl_release]
    else:
        transcript_ids = ensembl.transcript_ids()
        _transcript_ids_cache[ensembl_release] = transcript_ids
    variants = []
    # we should finish way before this loop is over but just in case
    # something is wrong with PyEnsembl we want to avoid an infinite loop
    for _ in range(count * 100):
        if len(variants) < count:
            transcript_id = rng.choice(transcript_ids)
            transcript = ensembl.transcript_by_id(transcript_id)
            if not transcript.complete:
                continue
            exon = rng.choice(transcript.exons)
            base1_genomic_position = rng.randint(exon.start, exon.end)
            transcript_offset = transcript.spliced_offset(base1_genomic_position)
            # NOTE(review): transcript.sequence can raise ValueError for
            # transcripts without a sequence; other variants of this
            # function in this file guard it -- confirm intent
            seq = transcript.sequence
            ref = str(seq[transcript_offset])
            if transcript.on_backward_strand:
                ref = reverse_complement(ref)
            # any single base other than the reference makes a substitution
            alt_nucleotides = [x for x in STANDARD_NUCLEOTIDES if x != ref]
            if insertions:
                nucleotide_pairs = [
                    x + y
                    for x in STANDARD_NUCLEOTIDES
                    for y in STANDARD_NUCLEOTIDES
                ]
                alt_nucleotides.extend(nucleotide_pairs)
            if deletions:
                # empty alt allele encodes a single-base deletion
                alt_nucleotides.append("")
            alt = rng.choice(alt_nucleotides)
            variant = Variant(
                transcript.contig,
                base1_genomic_position,
                ref=ref,
                alt=alt,
                ensembl=ensembl)
            variants.append(variant)
        else:
            return VariantCollection(variants)
    raise ValueError(
        ("Unable to generate %d random variants, "
         "there may be a problem with PyEnsembl") % count)
def test_str_version():
    # EnsemblRelease should accept release numbers given as strings
    # NOTE(review): range() excludes MAX_ENSEMBL_RELEASE itself -- confirm
    # whether skipping the newest release is intentional
    for version in range(54, MAX_ENSEMBL_RELEASE):
        EnsemblRelease(str(version))
def random_variants(
        count,
        ensembl_release=MAX_ENSEMBL_RELEASE,
        deletions=True,
        insertions=True,
        random_seed=None):
    """
    Generate a VariantCollection with random variants that overlap
    at least one complete coding transcript.

    `random_seed` makes the output reproducible; the exact ordering of
    rng calls below determines which variants are produced.
    """
    rng = random.Random(random_seed)
    ensembl = EnsemblRelease(ensembl_release)
    # transcript id lists are expensive to fetch, so cache them per release
    if ensembl_release in _transcript_ids_cache:
        transcript_ids = _transcript_ids_cache[ensembl_release]
    else:
        transcript_ids = ensembl.transcript_ids()
        _transcript_ids_cache[ensembl_release] = transcript_ids
    variants = []
    # NOTE(review): loops until `count` usable variants are found; there is
    # no iteration bound, unlike other variants of this function in the file
    while len(variants) < count:
        transcript_id = rng.choice(transcript_ids)
        transcript = ensembl.transcript_by_id(transcript_id)
        if not transcript.complete:
            continue
        exon = rng.choice(transcript.exons)
        base1_genomic_position = rng.randint(exon.start, exon.end)
        transcript_offset = transcript.spliced_offset(base1_genomic_position)
        try:
            seq = transcript.sequence
        except ValueError as e:
            # NOTE(review): logging.warn is deprecated; prefer logging.warning
            logging.warn(e)
            # can't get sequence for non-coding transcripts
            continue
        ref = str(seq[transcript_offset])
        if transcript.on_backward_strand:
            ref = reverse_complement(ref)
        # any single base other than the reference makes a substitution
        alt_nucleotides = [x for x in STANDARD_NUCLEOTIDES if x != ref]
        if insertions:
            nucleotide_pairs = [
                x + y
                for x in STANDARD_NUCLEOTIDES
                for y in STANDARD_NUCLEOTIDES
            ]
            alt_nucleotides.extend(nucleotide_pairs)
        if deletions:
            # empty alt allele encodes a single-base deletion
            alt_nucleotides.append("")
        alt = rng.choice(alt_nucleotides)
        variant = Variant(
            transcript.contig,
            base1_genomic_position,
            ref=ref,
            alt=alt,
            ensembl=ensembl)
        variants.append(variant)
    return VariantCollection(variants)
# Exploratory/notebook-style fragment (Python 2 print statements):
# compares blood vs saliva gene sets, draws a Venn diagram, and annotates
# exome coverage rows with gene names via pyensembl.
genes_20_saliva.union(mygenes)  # NOTE(review): result discarded -- confirm intent
len(mygenes)
print mygenes
print len(genes_20_blood.intersection(genes_20_saliva))
print len(genes_20_blood.union(genes_20_saliva))
print list(genes_20_blood) + list(genes_20_saliva)
from matplotlib_venn import venn2
venn2([genes_20_blood, genes_20_saliva],
      set_labels=('genes_20_blood', 'genes_20_saliva') )
plt.savefig("/mnt/xfs1/home/asalomatov/Blood_vs_Saliva_genes_lt_20.png")
plt.close()
### annotate with genes
from pyensembl import EnsemblRelease
data = EnsemblRelease(75)  # Ensembl release 75 (GRCh37)
data.gene_names_at_locus(contig=1, position=100000)
# NOTE(review): this apply lacks axis=1, so the lambda receives a column,
# not a row -- presumably a bug; the line after it shows the corrected form
df_exome['gene_name'] = df_exome.apply(lambda x: data.gene_names_at_locus(contig=x['chr'], position=x['start']))
# annotate using the interval midpoint instead of the start
x = df_exome.apply(lambda x: data.gene_names_at_locus(contig=x['chr'], position=(x['start'] + x['end'])/2), axis=1)
x
df_exome.head()
df_exome.tail()
df_exome.shape
df_exome.bin.value_counts()
df_exome.isnull().sum()
# NOTE(review): axis labels below look copied from an unrelated sales plot
my_plot.set_xlabel("Customers")
my_plot.set_ylabel("Sales ($)")
# Exploratory script fragment (Python 2 print statements): load the
# blood-vs-saliva sample description table and per-sample exome coverage.
genes_20_blood = set()
genes_20_saliva = set()
import pandas as pd
import numpy as np
import os
from pyensembl import EnsemblRelease
data = EnsemblRelease(75)  # Ensembl release 75 (GRCh37)
df_descr = pd.read_csv("/mnt/xfs1/scratch/asalomatov/BloodVsSaliva/BloodVsSaliva_Descr.csv")
df_descr.head()
# strip surrounding whitespace from every column
for c in df_descr.columns:
    df_descr[c] = df_descr[c].map(str.strip)
# fam_id looks like "<family>-<protocol>"
df_descr['fam'] = df_descr['fam_id'].apply(lambda x: x.split("-")[0])
df_descr['protocol'] = df_descr['fam_id'].apply(lambda x: x.split("-")[1])
for smpl in df_descr['smpl_id']:
    print smpl
    # fam = df_descr['fam_id'][df_descr.smpl_id.isin([smpl])].iloc[0]
    # rel = df_descr['relationship'][df_descr.smpl_id.isin([smpl])].iloc[0]
    prot = df_descr['protocol'][df_descr.smpl_id.isin([smpl])].iloc[0]
    print prot
    # desc = '_'.join([fam, rel])
    # print desc
    fname='/mnt/xfs1/scratch/asalomatov/BloodVsSaliva/output/'+smpl+'-D1.final-exome-cvrg.txt'
    # skip samples whose coverage file is missing
    if not os.path.isfile(fname):
        print fname
        continue
    df_exome = pd.read_csv(fname, sep="\t", header=None)
    # drop the "all" summary row
    df_exome = df_exome[df_exome[0] != 'all']
    df_exome.columns = ['chr', 'start', 'end', 'bin', 'length', 'ex_length', 'perc']
def test_version_is_not_numeric():
    # a non-numeric release string should be rejected (the expected-raise
    # assertion presumably lives in a decorator outside this view -- TODO confirm)
    EnsemblRelease("wuzzle")
# Imports and experiment-runner setup for a cluster-run analysis script.
import os
import sys
import pickle
import matplotlib.pyplot as plt
import numpy as np
import h5py
import argparse
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from matplotlib.colors import Normalize
# make project-local modules importable when run from the repo root
sys.path.insert(0, os.getcwd())
from src.utils.helpers import *
import statsmodels.stats.multitest as smm
from gprofiler import GProfiler
from pyensembl import EnsemblRelease
data = EnsemblRelease(77)  # Ensembl release 77 annotations, used module-wide
import multiprocess as mp
from tqdm import tqdm
import time
from pebble import ProcessPool, ProcessExpired
from concurrent.futures import TimeoutError

# root directory for GTEx data on the cluster filesystem
GTEx_directory = '/hps/nobackup/research/stegle/users/willj/GTEx'

parser = argparse.ArgumentParser(description='Collection of experiments. Runs on the cluster.')
parser.add_argument('-g', '--group', help='Experiment group', required=True)
parser.add_argument('-n', '--name', help='Experiment name', required=True)
parser.add_argument('-p', '--params', help='Parameters')
args = vars(parser.parse_args())
group = args['group']
name = args['name']
def test_version_is_none():
    # None is not a valid release; construction should fail (the
    # expected-raise assertion presumably lives in a decorator -- TODO confirm)
    EnsemblRelease(None)
    # (continuation of a chromosome-name helper whose start is outside
    # this view)
    else:
        return int(chr_str) - 1


def find_chromosome(bp):
    # map an absolute base-pair offset to a chromosome name by walking
    # cumulative chromosome sizes; returns None when bp is past the end
    curr = 0
    idx = None
    for ch, sz in enumerate(CHROMOSOME_SIZES):
        curr += sz
        if curr >= bp:
            idx = ch
            break
    if idx != None:  # NOTE(review): `is not None` is the idiomatic test
        return idx_to_chromosome(idx)


# release 76 uses human reference genome GRCh38
ENSEMBL_DATA = EnsemblRelease(76)


def getMockAnnotations():
    # load canned annotation fixtures from disk
    # NOTE(review): the file handle is never closed -- consider `with open`
    return json.loads(open("mockAnnotations.json", "r").read())


def getMockData():
    # load canned data fixtures from disk
    return json.loads(open("mockData.json", "r").read())


app = Flask(__name__)
CORS(app)  # allow cross-origin requests from the frontend


@app.route("/graphs")
def graphs():
    # list of available graph types
    return json.dumps(["ld_score"])


@app.route("/track_info")
def test_version_too_old_1():
    # release 1 is far below the oldest supported Ensembl release;
    # construction is expected to fail (assertion presumably in a
    # raises-style decorator outside this view -- TODO confirm)
    EnsemblRelease(1)
def __init__(self, query, hg_version):
    """Store the query and resolve the genome build to a pyensembl release."""
    # strip any "chr" prefix from the query string
    self.query = query.replace("chr","")
    self.hg_version = ScrapeEnsembl.genome.get(hg_version)  # convert to ensembl release
    self.hg = EnsemblRelease(self.hg_version)  # convert to ensembl release object
# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from pyensembl import EnsemblRelease from varcode import (ExonicSpliceSite, Substitution, Variant, TranscriptMutationEffect) import pandas as pd from . import data_path ensembl = EnsemblRelease(75) def validate_transcript_mutation(ensembl_transcript_id, chrom, dna_position, dna_ref, dna_alt, aa_pos, aa_alt): variant = Variant(chrom, dna_position, dna_ref, dna_alt, ensembl) effects = variant.effects() transcript_id_dict = { effect.transcript.id: effect for effect in effects if isinstance(effect, TranscriptMutationEffect) } assert ensembl_transcript_id in transcript_id_dict, \ "%s not found in %s" % (ensembl_transcript_id, transcript_id_dict) effect = transcript_id_dict[ensembl_transcript_id] if isinstance(effect, ExonicSpliceSite):
def __init__(self):
    # Ensembl release 75 corresponds to GRCh37 / hg19
    self.db = EnsemblRelease(75)