def melodi_gwas():
    # example subsets used during development:
    # gwasInfo = {'1': 'leptin', '2': 'adiponectin'}
    # gwasInfo = {'1': 'pcsk9', '2': 'adiponectin'}
    # gwasInfo = {
    #     "1": "Operative procedures - main OPCS: E03.5 Incision of septum of nose",
    #     "2": "Type of cancer: ICD10: C83.7 Burkitt's tumour",
    #     "3": "Type of cancer: ICD10: C92.0 Acute myeloid leukaemia",
    # }
    logger.info("getting gwas info")
    gwasInfo = get_gwas_data()

    # create a small test set (uncomment the reassignment below to use it)
    gwasInfoTest = {k: gwasInfo[k] for k in list(gwasInfo)[:10]}
    logger.info(len(gwasInfoTest))
    # gwasInfo = gwasInfoTest

    # enrich in parallel
    gwasChunks = chunks(gwasInfo, 10)
    pool = mp.Pool(processes=10)
    results = pool.starmap(enrich, [(gwasData, gwasInfo) for gwasData in gwasChunks])
    pool.close()

    # concatenate the per-chunk outputs (minus their headers) into a single gzipped file
    filename = f"gwas-melodi-enrich-{today}.tsv.gz"
    com = f"for i in melodi/*; do tail -n +2 $i; done | gzip > {filename}"
    subprocess.call(com, shell=True)
    copy_source_data(data_name=data_name, filename=filename)
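# `chunks` and `enrich` are helpers defined elsewhere in this workflow. A
# minimal sketch of the chunking behaviour melodi_gwas() assumes -- splitting
# the gwasInfo dict into sub-dicts of at most `n` entries each (the real
# helper may chunk differently):
def chunks(data, n):
    items = list(data.items())
    for i in range(0, len(items), n):
        yield dict(items[i:i + n])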
def pathways():
    # complete list of pathways
    url = "https://reactome.org/download/current/ReactomePathways.txt"
    logger.info(url)
    # the Reactome dump has no header row
    df1 = pd.read_csv(url, sep="\t", header=None)
    df1.columns = ["reactome_id", "name", "species"]
    df1 = df1[df1["species"] == "Homo sapiens"]
    logger.info(df1.head())
    filename = f"/tmp/ReactomePathways_human_{today}.csv"
    df1.to_csv(filename, index=False)
    copy_source_data(data_name=data_name, filename=filename)

    # pathway hierarchy
    url = "https://reactome.org/download/current/ReactomePathwaysRelation.txt"
    logger.info(url)
    df2 = pd.read_csv(url, sep="\t", header=None)
    df2.columns = ["parent", "child"]
    logger.info(df2.head())
    logger.info(df2.shape)
    # keep only relations whose parent is a human pathway
    df2 = df2[df2["parent"].isin(df1["reactome_id"])]
    logger.info(df2.shape)
    filename = f"/tmp/ReactomePathwaysRelation_human_{today}.csv"
    df2.to_csv(filename, index=False)
    copy_source_data(data_name=data_name, filename=filename)
def process_variants(variant_file):
    df = pd.read_csv(variant_file, low_memory=False)
    df = df["rsid"]
    df.drop_duplicates(inplace=True)
    logger.info(df.head())
    # in this example, only run 100 variants as VEP can be quite slow
    filename = f"{vep_data_dir}/variants-{today}.txt"
    df.head(n=100).to_csv(filename, index=False, header=False)
    copy_source_data(data_name=data_name, filename=filename)
def get_ebi_gwas_data():
    # retrieve EBI GWAS Catalog data
    ebi_gwas_api_url = "https://www.ebi.ac.uk/gwas/api/search/downloads/studies_alternative"
    print("Getting GWAS data from EBI GWAS Catalog", ebi_gwas_api_url)
    ebi_gwas = requests.get(ebi_gwas_api_url)
    # save the full dataset
    with open(ebi_gwas_data_file, 'wb') as tsvfile:
        tsvfile.write(ebi_gwas.content)
    copy_source_data(data_name=data_name, filename=ebi_gwas_data_file)
def biomart_to_file(atts, filename, type):
    logger.info("attributes: {} filename: {}", atts, filename)
    # latest build
    # server = BiomartServer("http://www.ensembl.org/biomart")
    # build 37
    server = biomart.BiomartServer("http://grch37.ensembl.org/biomart")
    hge = server.datasets["hsapiens_gene_ensembl"]
    # print(hge.show_attributes())
    s = hge.search({"attributes": atts}, header=1)
    o = gzip.open(filename, "w")
    c = 0
    # autosomes plus X and Y
    chromosomes = [str(i) for i in range(1, 23)] + ["X", "Y"]
    for l in s.iter_lines():
        # skip the header row
        if c > 0:
            chr = l.decode("utf-8").split("\t")[0]
            if chr in chromosomes:
                # write binary b"\n" for python 3
                if type == "protein":
                    # only keep rows that have a protein ID
                    chr, gene, protein = l.decode("utf-8").split("\t")
                    if len(protein) > 1:
                        o.write(l + b"\n")
                else:
                    o.write(l + b"\n")
        c += 1
    # copy to data directory
    copy_source_data(data_name, filename)
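# Example call, as a sketch only: these three attribute names are standard
# Ensembl BioMart attributes, but the exact attribute set and output path used
# by the workflow are assumptions here.
# biomart_to_file(
#     atts=["chromosome_name", "ensembl_gene_id", "uniprotswissprot"],
#     filename=f"/tmp/gene-protein-{today}.tsv.gz",
#     type="protein",
# )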
def get_top_hits():
    df = pd.read_csv(gwas_data_file, low_memory=False)
    gwas_ids = list(df.id)
    logger.info(gwas_ids[0:10])
    gwas_api_url = "http://gwasapi.mrcieu.ac.uk/tophits"
    payload = {"id": gwas_ids, "preclumped": 1}
    response = requests.post(gwas_api_url, json=payload)
    res = response.json()
    th_df = pd.json_normalize(res)
    th_df.to_csv(gwas_tophits, index=False)
    copy_source_data(data_name=data_name, filename=gwas_tophits)
def create_clean_protein(protein_data):
    filename = f"/tmp/protein-only-{today}.txt"
    o = open(filename, "w")
    # track UniProt IDs already written, to deduplicate
    pCheck = {}
    with gzip.open(protein_data) as f:
        for line in f:
            chr, gene, uni = line.decode("utf-8").split("\t")
            if uni not in pCheck:
                # uni retains its trailing newline from the source file
                o.write(uni)
                pCheck[uni] = ""
    o.close()
    copy_source_data(data_name, filename)
def select_ebi_gwas_efo_mapping():
    # keep only the required columns: GWAS ID and EFO
    df = pd.read_csv(ebi_gwas_data_file, sep='\t')
    df["GWAS_ID"] = "ebi-a-" + df["STUDY ACCESSION"]
    df = df[["GWAS_ID", "MAPPED_TRAIT_URI"]].drop_duplicates()
    df.columns = ["gwas.id", "efo.id"]
    print(df.head())
    print(df.shape)
    # subset the full dataset to GWAS that are present in OpenGWAS
    df = subset_to_available_gwas(df)
    print(df.shape)
    df.to_csv(ebi_gwas_efo_mapping, sep="\t", index=False)
    copy_source_data(data_name=data_name, filename=ebi_gwas_efo_mapping)
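# `subset_to_available_gwas` is defined elsewhere; a plausible sketch, assuming
# OpenGWAS study IDs were previously saved to gwas_data_file by get_gwas_data()
# (the real implementation may source the ID list differently):
def subset_to_available_gwas(df):
    available = pd.read_csv(gwas_data_file, low_memory=False)
    return df[df["gwas.id"].isin(available["id"])]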
def get_gwas_data():
    # create the data
    gwas_api_url = "http://gwasapi.mrcieu.ac.uk/gwasinfo"
    logger.info("Getting gwas data from {}", gwas_api_url)
    gwas_res = requests.get(gwas_api_url).json()
    outData = open(gwas_data_file, "w")
    df = pd.DataFrame(gwas_res)
    df = df.T.fillna("")
    logger.info(df.head())
    logger.info(df["year"].describe())
    df.to_csv(outData, index=False)
    outData.close()
    copy_source_data(data_name=data_name, filename=gwas_data_file)
def main(oFile) -> None:
    gene_id_list = get_ensembl_id()
    with Pool(N_PROCS) as pool:
        nested_list = [
            gene_id_list[i:(i + N_PER_CHUNK)]
            for i in range(0, len(gene_id_list), N_PER_CHUNK)
        ]
        map_res = pool.map(get_ot_data, nested_list)
    ot_df = pd.concat(map_res, ignore_index=True)
    OPENTARGETS_DIR.mkdir(parents=True, exist_ok=True)
    ot_df.to_csv(oFile, index=False)
    copy_source_data(data_name=data_name, filename=oFile)
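# `get_ensembl_id` and `get_ot_data` are defined elsewhere. A minimal sketch of
# the shape get_ot_data is assumed to take here: one chunk of Ensembl gene IDs
# in, one DataFrame out. The endpoint and fields are from the public Open
# Targets Platform GraphQL API; the query used by the real workflow may differ.
def get_ot_data(gene_ids):
    url = "https://api.platform.opentargets.org/api/v4/graphql"
    query = """
    query target($ensemblId: String!) {
      target(ensemblId: $ensemblId) { id approvedSymbol }
    }
    """
    rows = []
    for gene_id in gene_ids:
        r = requests.post(url, json={"query": query, "variables": {"ensemblId": gene_id}})
        target = r.json().get("data", {}).get("target")
        if target:
            rows.append(target)
    return pd.DataFrame(rows)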
def get_variants_from_graph():
    # connect to epigraph
    driver = neo4j_connect()
    session = driver.session()
    # query
    query = """
        match (v:Variant) return distinct(v._id) as id limit 100
    """
    logger.info(query)
    query_data = session.run(query).data()
    df = pd.json_normalize(query_data)
    df.to_csv(variant_data, index=False)
    copy_source_data(data_name=data_name, filename=variant_data)
    return df
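# `neo4j_connect` is a shared helper; a minimal sketch using the official
# neo4j Python driver. The URI and credentials below are placeholders -- the
# real helper presumably reads them from the workflow config:
from neo4j import GraphDatabase

def neo4j_connect():
    return GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))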
def make_tidy_clinvar_output(df):
    # make tidy dates
    df['LastUpdated'] = pd.to_datetime(df['LastUpdated']).dt.strftime('%Y-%m-%d')
    # subset and rename columns
    df = df[[
        "Gene name", "Gene stable ID", "GeneType", "DiseaseName", "ConceptID",
        "SourceName", "SourceID", "DiseaseMIM", 'LastUpdated',
    ]]
    df.columns = [
        "gene_name", "ensembl_id", "clinvar_gene_type", "disease_name",
        "umls_id", "source_name", "source_id", "disease_MIM", "last_updated",
    ]
    df.to_csv(clinvar_gene_condition_mapping, sep="\t", index=False)
    copy_source_data(data_name=data_name, filename=clinvar_gene_condition_mapping)
def run_vep(variant_dir, variant_file):
    com = """
        docker run -t -i -v {vep_data_dir}:/opt/vep/.vep ensemblorg/ensembl-vep
        ./vep --port 3337 --cache --fork 20 --assembly GRCh37
        -i /opt/vep/.vep/{variant_file}
        -o /opt/vep/.vep/vep-{today}.txt
        --per_gene --no_intergenic
    """.format(
        vep_data_dir=vep_data_dir, variant_file=variant_file, today=today
    )
    com = com.replace("\n", " ")
    logger.info(com)
    subprocess.call(com, shell=True)
    # copy results
    # com = f"cp /data/vep_data/vep-{today}.txt {env_configs['data_dir']}/vep/"
    # subprocess.call(com, shell=True)
    copy_source_data(data_name=data_name, filename=f"{vep_data_dir}/vep-{today}.txt")
def protein_to_pathway():
    # protein to pathway mapping
    url = "https://reactome.org/download/current/UniProt2Reactome_All_Levels.txt"
    logger.info(url)
    # the Reactome dump has no header row
    df = pd.read_csv(url, sep="\t", header=None)
    df.columns = [
        "source_id", "reactome_id", "url", "event", "evidence_code", "species",
    ]
    df = df[df["species"] == "Homo sapiens"]
    logger.info(df.head())
    filename = f"/tmp/UniProt2Reactome_All_Levels_human_{today}.csv"
    df.to_csv(filename, index=False)
    copy_source_data(data_name=data_name, filename=filename)
def get_phewas():
    df = pd.read_csv(variant_data)
    variant_ids = list(df.id)
    split_val = 20
    pval = 1e-5
    all_res = []
    # query the PheWAS endpoint in batches of split_val variants
    for i in range(0, len(variant_ids), split_val):
        print(i)
        variants = variant_ids[i:i + split_val]
        gwas_api_url = "http://gwasapi.mrcieu.ac.uk/phewas"
        payload = {"variant": variants, "pval": pval}
        # logger.info(payload)
        response = requests.post(gwas_api_url, json=payload)
        res = response.json()
        logger.info(len(res))
        if len(res) == 1:
            logger.info('Failed')
            exit()
        all_res.extend(res)
    df = pd.json_normalize(all_res)
    logger.info(df)
    df.to_csv(phewas_data_file, index=False)
    copy_source_data(data_name=data_name, filename=phewas_data_file)
def map_genes_to_diseases():
    df = pd.read_csv(clinvar_gene_condition_mapping, sep='\t')
    # firstly get genes that directly map to mondo_id in clinvar data
    df_mondo = make_gene_to_mondo_map(df)
    # map umls_id in clinvar to mondo_id from the graph
    df_umls = make_umls_to_mondo_map(df)
    # join clinvar table with query output to map umls_id to mondo_id
    df_joined = df.merge(df_umls, on='umls_id', how='inner')
    df_joined = df_joined[[
        'ensembl_id', 'mondo_id', 'clinvar_gene_type', 'last_updated'
    ]]
    # concat direct mondo mappings with mappings via umls_id; drop any dups
    # (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
    df_total = pd.concat([df_joined, df_mondo]).drop_duplicates()
    df_total.to_csv(clinvar_gene_condition_mapping_mondo, sep="\t", index=False)
    copy_source_data(data_name=data_name,
                     filename=clinvar_gene_condition_mapping_mondo)
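# `make_gene_to_mondo_map` and `make_umls_to_mondo_map` are defined elsewhere.
# A hypothetical sketch of the UMLS-to-Mondo lookup, assuming Disease nodes in
# the graph expose both a UMLS and a Mondo identifier (the real node labels
# and property names may differ):
def make_umls_to_mondo_map(df):
    driver = neo4j_connect()
    session = driver.session()
    query = """
        match (d:Disease) where exists(d.umls)
        return d.umls as umls_id, d.id as mondo_id
    """
    return pd.json_normalize(session.run(query).data())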
def create_distances(gwas_df):
    logger.info("Creating distances...")
    # https://stackoverflow.com/questions/48838346/how-to-speed-up-computation-of-cosine-similarity-between-set-of-vectors
    vectors = []
    ids = []
    for i, j in gwas_df.iterrows():
        vectors.append(j["embedding"])
        ids.append(i)
    timestr = time.strftime("%Y%m%d")
    score_cutoff = 0
    filename = f'/tmp/ieu-gwas-cosine-{timestr}-{score_cutoff}.tsv.gz'
    o = gzip.open(filename, "wt")
    logger.info(len(vectors))
    data = np.array(vectors)
    # condensed pairwise cosine distances (one entry per pair i < j)
    pws = distance.pdist(data, metric="cosine")
    logger.info(len(pws))
    logger.info(len(ids))
    logger.info("Writing to file...")
    mCount = 0
    for i in range(0, len(ids)):
        for j in range(i, len(ids)):
            if i != j:
                # convert cosine distance to cosine similarity
                score = 1 - pws[mCount]
                if score > score_cutoff:
                    t = f"{ids[i]}\t{ids[j]}\t{str(score)}\n"
                    o.write(t)
                # increment for every pair, not just those written,
                # to stay aligned with the condensed matrix
                mCount += 1
    o.close()
    logger.info(mCount)
    copy_source_data(data_name, filename)
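# The sequential mCount indexing in create_distances() relies on scipy's
# condensed-form ordering: pdist returns the upper triangle row by row, i.e.
# pairs (0,1), (0,2), ..., (0,n-1), (1,2), ... A quick sanity check against
# the square form:
def check_pdist_ordering():
    from scipy.spatial import distance
    import numpy as np
    m = np.random.rand(4, 3)
    condensed = distance.pdist(m, metric="cosine")
    square = distance.squareform(condensed)
    assert np.isclose(condensed[0], square[0, 1])
    assert np.isclose(condensed[1], square[0, 2])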
def download_data():
    link = 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/gene_condition_source_id'
    wget.download(link, clinvar_data_file)
    copy_source_data(data_name=data_name, filename=clinvar_data_file)
def download_data():
    link = 'https://api.cpicpgx.org/data/cpicPairs.csv'
    wget.download(link, cpic_data_file)
    copy_source_data(data_name=data_name, filename=cpic_data_file)
import os

from workflow.scripts.utils.general import copy_source_data

data_name = "string"

# uniprot mapping
uniprot_file = '/tmp/human.uniprot_2_string.2018.tsv.gz'
url = 'https://string-db.org/mapping_files/uniprot/human.uniprot_2_string.2018.tsv.gz'
os.system(f"wget -O {uniprot_file} {url}")
copy_source_data(data_name=data_name, filename=uniprot_file)

# string data
pp_file = '/tmp/9606.protein.links.v11.0.txt.gz'
url = 'https://stringdb-static.org/download/protein.links.v11.0/9606.protein.links.v11.0.txt.gz'
os.system(f"wget -O {pp_file} {url}")
copy_source_data(data_name=data_name, filename=pp_file)