# Assumes module-level context: `import os`, `import apybiomart as apy`,
# and a DATADIR constant pointing at the pickle directory.
def create_queries():
    """Create and store the pickled queries dataframes."""
    # One pickled dataframe per chromosome (1-3), queried from Ensembl Biomart.
    for chrom in ("1", "2", "3"):
        df = apy.query(attributes=["ensembl_gene_id", "external_gene_name"],
                       filters={"chromosome_name": chrom},
                       dataset="hsapiens_gene_ensembl")
        df.to_pickle(os.path.join(DATADIR,
                                  f"query_hsapiens_gene_chrom_{chrom}.pkl"))
def getBiomart(fname):
    # Exploration helpers, kept for reference:
    # marts = find_marts()  # ENSEMBL_MART_ENSEMBL == Ensembl Genes 101
    # print(marts)
    # ds = find_datasets(mart="ENSEMBL_MART_ENSEMBL")
    # print(ds)
    # qry = ds["Dataset_name"].str.contains('[Hh][Uu][Mm][Aa][Nn]')
    # print(ds[qry])
    # attrs = find_attributes(dataset="hsapiens_gene_ensembl")
    # print(attrs)
    if not os.path.isfile(fname):
        print("Downloading Biomart ...")
        attrs = ["ensembl_gene_id", "chromosome_name", "start_position",
                 "end_position", "strand", "band",
                 "percentage_gene_gc_content", "gene_biotype",
                 "external_gene_name"]
        # Autosomes 1-22 plus the sex chromosomes. The range end is exclusive,
        # so 23 is needed to include chromosome 22 (range(1, 22) would
        # silently drop it).
        chrs = [str(i) for i in range(1, 23)] + ['X', 'Y']
        bm = query(attributes=attrs,
                   filters={"chromosome_name": chrs},
                   dataset="hsapiens_gene_ensembl")
        # print(bm["Chromosome/scaffold name"].value_counts())
        bm = bm[bm["Gene type"] == "protein_coding"]
        bm.columns = ['stableID', 'chromName', 'gStart', 'gEnd', 'strand',
                      'band', 'gcCont', 'gBiotype', 'gName']
        bm.to_csv(fname, sep="\t", index=False)
    else:
        print("Reading Biomart ...")
        bm = pd.read_csv(fname, sep="\t")
    print("Biomart genes: ", bm.shape[0])
    return bm
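# Usage sketch for getBiomart (the filename below is hypothetical; the first
# call downloads the table from Biomart and caches it as a TSV, later calls
# just read the cached file back):
if __name__ == "__main__":
    genes = getBiomart("biomart_protein_coding.tsv")  # assumed cache path
    print(genes[["stableID", "gName", "chromName"]].head())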
def test_query_ensembl(df_query_ensembl_hsapiens_gene_chrom_2):
    """Test the query results for the hsapiens_gene_ensembl dataset."""
    expect = (df_query_ensembl_hsapiens_gene_chrom_2
              .reset_index(drop=True))
    result = (query(attributes=["ensembl_gene_id", "external_gene_name"],
                    filters={"chromosome_name": "2"},
                    dataset="hsapiens_gene_ensembl")
              .reset_index(drop=True))
    assert_frame_equal(result, expect)
def test_query_default_int(df_query_ensembl_hsapiens_gene_chrom_2):
    """Test the query results for the default dataset (hsapiens_gene_ensembl)
    with int filters parameter."""
    expect = (df_query_ensembl_hsapiens_gene_chrom_2
              .reset_index(drop=True))
    result = (query(attributes=["ensembl_gene_id", "external_gene_name"],
                    filters={"chromosome_name": 2})
              .reset_index(drop=True))
    assert_frame_equal(result, expect)
def test_query_save(df_query_ensembl_hsapiens_gene_chrom_2):
    """Test the saved query results for the default dataset
    (hsapiens_gene_ensembl)."""
    expect = df_query_ensembl_hsapiens_gene_chrom_2.reset_index(drop=True)
    _ = query(attributes=["ensembl_gene_id", "external_gene_name"],
              filters={"chromosome_name": "2"},
              save=True)
    saved = pd.read_csv("apybiomart_query.csv")
    # The CSV round-trip turns empty strings into NaN; restore them before
    # comparing against the fixture.
    result = saved.replace(np.nan, "").reset_index(drop=True)
    try:
        assert_frame_equal(result, expect)
    finally:
        os.remove("apybiomart_query.csv")
def response(self):
    """Retrieve the related information for the given variant.

    Results are converted to a list of records by the .to_dict() method
    of the pandas dataframe returned. If no results are retrieved, an
    empty list is returned.
    """
    resp = apy.query(attributes=["allele",
                                 "ensembl_gene_stable_id",
                                 "refsnp_id",
                                 "consequence_allele_string",
                                 "consequence_type_tv"],
                     filters={"chr_name": "MT",
                              "start": str(self.position),
                              "end": str(self.position)},
                     dataset=f"{self.species}_snp")
    resp.drop_duplicates("Variant alleles", inplace=True)
    return resp.to_dict(orient="records")
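# Invocation sketch: `response` above is written as a method, so it needs an
# object carrying `species` and `position`. The MitoVariant class below is a
# hypothetical stand-in for whatever class hosts the method in the real code.
class MitoVariant:
    def __init__(self, species: str, position: int):
        self.species = species    # Biomart dataset prefix, e.g. "hsapiens"
        self.position = position  # position on the MT chromosome

    response = response  # attach the method defined above

records = MitoVariant("hsapiens", 3243).response()
# `records` is a list of dicts keyed by attribute display names
# (e.g. "Variant alleles"); it is empty when Biomart returns no rows.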
def pull_ensembl(complete_file):
    f = find_datasets()
    cols = {"ensembl_gene_id", "ensembl_peptide_id", "description",
            "external_gene_name", "external_gene_source", "external_synonym",
            "chromosome_name", "source", "gene_biotype", "entrezgene_id",
            "zfin_id_id", "mgi_id", "rgd_id", "flybase_gene_id",
            "sgd_gene", "wormbase_gene"}
    for ds in f['Dataset_ID']:
        print(ds)
        outfile = make_local_name('BioMart.tsv', subpath=f'ENSEMBL/{ds}')
        # Really, we should let snakemake handle this, but then we would need
        # to put a list of all the 200+ sets in our config, and keep it up to
        # date. Maybe you could have a job that gets the datasets and writes
        # a dataset file, but then updates the config? That sounds bogus.
        if os.path.exists(outfile):
            continue
        atts = find_attributes(ds)
        existingatts = set(atts['Attribute_ID'].to_list())
        attsIcanGet = cols.intersection(existingatts)
        df = query(attributes=attsIcanGet, filters={}, dataset=ds)
        df.to_csv(outfile, index=False, sep='\t')
    with open(complete_file, 'w') as outf:
        outf.write(f'Downloaded gene sets for {len(f)} data sets.')
# Define input/output files
infile = sys.argv[1]
out01 = sys.argv[2]
out03 = sys.argv[3]

# Read in the APA-Scan .tsv file
apa = pd.read_csv(infile, sep="\t")

# Format the APA-Scan df so that the gene names are uppercase. Need to do
# this as APA-Scan outputs gene names lowercase. Not actually sure what
# naming convention that is, but uppercasing means they should work for the
# apybiomart queries.
apa["Gene Name"] = apa["Gene Name"].str.upper()

# Run apybiomart query
bmart = query(attributes=["ensembl_gene_id", "hgnc_symbol"],
              filters={},
              dataset="hsapiens_gene_ensembl")

# Merge dataframes to get common genes
outdf = pd.merge(bmart, apa, left_on="HGNC symbol", right_on="Gene Name")

# Rearrange to BED format (Format 01)/Format 03 and remove any duplicated
# rows (PolyA sites)
outbed = outdf[["Chrom", "Start", "End", "Gene stable ID",
                "p-value", "Strand"]].drop_duplicates()
# Set p-value field to "." so that it matches the BED format (Format 01)
# specification from the Execution Workflows README:
# https://github.com/iRNA-COSI/APAeval/tree/main/execution_workflows
outbed["p-value"] = "."
# Use a separate name for the Format 03 dataframe so the out03 output path
# (from sys.argv) is not clobbered.
out03_df = outdf[["Gene stable ID", "p-value"]].drop_duplicates()

# Remove existing files (else pandas will append when saving)
if os.path.exists(out01):
    os.remove(out01)
if os.path.exists(out03):
    os.remove(out03)