def gene():
    FILE = get_source(meta_id, 1)
    data = os.path.join(dataDir, FILE)
    df = pd.read_csv(data, sep="\t")
    # add column names
    col_names = [
        "chr",
        "type",
        "name",
        "description",
        "biomart_source",
        "ensembl_id",
        "start",
        "end",
    ]
    df.columns = col_names
    df.drop_duplicates(inplace=True)
    create_import(df=df, meta_id=meta_id)
    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (g:Gene) ASSERT g.ensembl_id IS UNIQUE",
        "CREATE INDEX ON :Gene(name)",
        "CREATE INDEX ON :Gene(chr)",
    ]
    create_constraints(constraintCommands, meta_id)
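# ---------------------------------------------------------------------------
# Hedged aside: every function in this module leans on two shared helpers,
# create_import and create_constraints, whose implementations are not shown
# here. The sketch below is only a guess at their contract, for orientation;
# the real helpers in this repo may differ in names, paths and arguments.
import os

import pandas as pd


def sketch_create_import(df: pd.DataFrame, meta_id: str, import_dir: str = "import"):
    # assumed behaviour: write the frame as a gzipped CSV where a later
    # Neo4j LOAD CSV step (or neo4j-admin import) can pick it up
    out_dir = os.path.join(import_dir, meta_id)
    os.makedirs(out_dir, exist_ok=True)
    out_file = os.path.join(out_dir, f"{meta_id}.csv.gz")
    df.to_csv(out_file, index=False, compression="gzip")
    return out_file


def sketch_create_constraints(commands, meta_id):
    # assumed behaviour: run each Cypher statement in turn; the real code
    # presumably holds a configured driver/session rather than building one
    from neo4j import GraphDatabase  # assumption: official neo4j Python driver

    driver = GraphDatabase.driver("bolt://localhost:7687")  # hypothetical URI
    with driver.session() as session:
        for command in commands:
            session.run(command)
    driver.close()
# ---------------------------------------------------------------------------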
def process():
    # load predication data
    logger.info("loading predication data...")
    pred_df = pd.read_csv(
        os.path.join(dataDir, PREDICATION_FILE), sep=",", compression="gzip"
    )
    pred_df["PMID"] = pred_df["PMID"].astype(str)
    logger.info("loading citation data...")
    df = pd.read_csv(os.path.join(dataDir, PUB_FILE), sep="\t", compression="gzip")
    df.columns = ["id", "issn", "dp", "edat", "year"]
    df["id"] = df["id"].str.replace("'", "")
    logger.info(df.shape)
    # merge with predication data
    df_merge = df.merge(pred_df["PMID"], left_on="id", right_on="PMID")
    logger.info(df_merge.shape)
    # drop PMID column
    df_merge.drop("PMID", inplace=True, axis=1)
    # make unique
    df_merge.drop_duplicates(inplace=True)
    logger.info(df_merge.shape)
    logger.info("\n {}", df_merge.head())
    create_import(df=df_merge, meta_id=args.name)
    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (s:Literature) ASSERT s.id IS UNIQUE;",
    ]
    create_constraints(constraintCommands, meta_id)
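# Hedged aside: the merge above acts as a semi-join, keeping only citations
# whose id appears among the predication PMIDs, with drop_duplicates mopping
# up the row fan-out caused by repeated PMIDs. An isin-based filter gives the
# same result without the temporary PMID column; toy data, not the real files:
import pandas as pd

citations = pd.DataFrame({"id": ["1", "2", "3"], "year": [1999, 2005, 2011]})
predications = pd.DataFrame({"PMID": ["2", "3", "3"]})
semi = citations[citations["id"].isin(predications["PMID"])]
print(semi)  # keeps ids 2 and 3 once each; no duplicate rows to drop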
def process():
    # select the file
    FILE = get_source(meta_id, 1)
    logger.info("Reading {}", FILE)
    df = pd.read_csv(os.path.join(dataDir, FILE))
    # logger.info(df.columns)
    logger.info(df.shape)
    # drop some columns
    df.drop(
        ["access", "priority", "coverage", ""], axis=1, inplace=True, errors="ignore"
    )
    logger.info(df.shape)
    # create the csv and import data
    create_import(df=df, meta_id=meta_id)
    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (g:Gwas) ASSERT g.id IS UNIQUE",
        "CREATE index on :Gwas(trait)",
        "CREATE index on :Gwas(filename)",
    ]
    create_constraints(constraintCommands, meta_id)
def process():
    merge = merge_data(BIO_DATA, BIO_SEM)
    logger.info(merge.shape)
    create_import(df=merge, meta_id=args.name)
    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (s:Literature) ASSERT s.id IS UNIQUE;",
    ]
    create_constraints(constraintCommands, meta_id)
def protein():
    data = os.path.join(dataDir, FILE)
    df = pd.read_csv(data, sep="\t")
    df.columns = ["uniprot_id"]
    df["name"] = df["uniprot_id"]
    create_import(df=df, meta_id=meta_id)
    constraintCommands = [
        "CREATE CONSTRAINT ON (p:Protein) ASSERT p.uniprot_id IS UNIQUE",
        "CREATE index on :Protein(name);",
    ]
    create_constraints(constraintCommands, meta_id)
def process():
    FILE = get_source(meta_id, 1)
    df = pd.read_csv(os.path.join(dataDir, FILE), low_memory=False)
    df = df[["rsid"]].drop_duplicates()
    # change column name to match schema
    df.rename(columns={"rsid": "name"}, inplace=True)
    create_import(df=df, meta_id=meta_id)
    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (v:Variant) ASSERT v.name IS UNIQUE;",
    ]
    create_constraints(constraintCommands, meta_id)
def process():
    df = pd.read_csv(os.path.join(dataDir, FILE))
    logger.info(df.head())
    keep_cols = ["source_id"]
    df = df[keep_cols]
    df.rename(columns={"source_id": "uniprot_id"}, inplace=True)
    df["name"] = df["uniprot_id"]
    df.drop_duplicates(inplace=True)
    logger.info(df.head())
    create_import(df=df, meta_id=meta_id)
    constraintCommands = [
        "CREATE CONSTRAINT ON (p:Protein) ASSERT p.uniprot_id IS UNIQUE",
        "CREATE index on :Protein(name);",
    ]
    create_constraints(constraintCommands, meta_id)
def variant():
    print("Reading...", FILE)
    df = pd.read_csv(os.path.join(dataDir, FILE))
    print("Writing...")
    # create csv
    df.rename(columns={"rsid": "name"}, inplace=True)
    df = df[["name"]].drop_duplicates()
    create_import(df=df, meta_id=meta_id)
    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (v:Variant) ASSERT v.name IS UNIQUE;",
        "CREATE index on :Variant(chr);",
        "CREATE index on :Variant(pos);",
    ]
    create_constraints(constraintCommands, meta_id)
def process():
    # load predication data
    logger.info("loading data...")
    df = pd.read_csv(
        os.path.join(dataDir, PREDICATION_FILE), sep=",", compression="gzip"
    )
    logger.info(df.shape)
    # need to split subject and object ids by |
    df = (
        df.assign(subject_id=df.subject_id.str.split("|"))
        .explode("subject_id")
        .reset_index(drop=True)
    )
    logger.info(df.shape)
    df = (
        df.assign(object_id=df.object_id.str.split("|"))
        .explode("object_id")
        .reset_index(drop=True)
    )
    logger.info(df.shape)
    logger.info("\n {}", df)
    df["id"] = df["subject_id"] + ":" + df["predicate"] + ":" + df["object_id"]
    df["name"] = df["subject_name"] + " " + df["predicate"] + " " + df["object_name"]
    # keep_cols = ["predicate", "subject_name", "object_name", "subject_type",
    #              "object_type", "subject_id", "object_id", "id"]
    keep_cols = ["predicate", "subject_id", "object_id", "id", "name"]
    # df = pd.DataFrame({"count": df.groupby(keep_cols).size()}).reset_index()
    df = df[keep_cols]
    df.drop_duplicates(subset=["id"], inplace=True)
    logger.info(df.shape)
    logger.info("\n {}", df.head())
    # drop nas/rows with empty string
    df.replace("", np.nan, inplace=True)
    df.dropna(inplace=True)
    create_import(df=df, meta_id=meta_id)
    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (s:LiteratureTriple) ASSERT s.id IS UNIQUE",
        "CREATE INDEX ON :LiteratureTriple(name);",
        "CREATE INDEX ON :LiteratureTriple(subject_id);",
        "CREATE INDEX ON :LiteratureTriple(object_id);",
        "CREATE INDEX ON :LiteratureTriple(predicate);",
    ]
    create_constraints(constraintCommands, meta_id)
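# Hedged aside: the assign/explode idiom used above turns a pipe-delimited id
# cell into one row per id, repeating the other columns. Minimal,
# self-contained illustration:
import pandas as pd

toy = pd.DataFrame({"subject_id": ["C1|C2", "C3"], "predicate": ["TREATS", "CAUSES"]})
toy = (
    toy.assign(subject_id=toy.subject_id.str.split("|"))
    .explode("subject_id")
    .reset_index(drop=True)
)
print(toy)
#   subject_id predicate
# 0         C1    TREATS
# 1         C2    TREATS
# 2         C3    CAUSES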
def process():
    data = os.path.join(dataDir, FILE)
    df = pd.read_csv(data, sep=",")
    logger.info("\n {}", df.head())
    keep_cols = ["molecule_name", "molecule_type", "chembl_uri"]
    df = df[keep_cols]
    col_names = ["label", "molecule_type", "id"]
    df.columns = col_names
    df.drop_duplicates(inplace=True)
    # set label to uppercase
    df["label"] = df["label"].str.upper()
    logger.info(df.shape)
    logger.info("\n {}", df.head())
    create_import(df=df, meta_id=meta_id)
    # create constraints
    constraintCommands = ["CREATE index on :Drug(label);"]
    create_constraints(constraintCommands, meta_id)
def process():
    df = pd.read_csv(os.path.join(dataDir, FILE))
    logger.info(df.head())
    keep_cols = ["reactome_id", "name"]
    df = df[keep_cols]
    df["url"] = "https://reactome.org/PathwayBrowser/#/" + df["reactome_id"]
    df.rename(columns={"reactome_id": "id"}, inplace=True)
    df.drop_duplicates(inplace=True)
    logger.info(df["url"].head())
    create_import(df=df, meta_id=meta_id)
    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (p:Pathway) ASSERT p.id IS UNIQUE",
        "CREATE index on :Pathway(name);",
    ]
    create_constraints(constraintCommands, meta_id)
def process():
    data = os.path.join(dataDir, FILE)
    # some bad rows caused by extra commas so need to skip
    # (note: error_bad_lines was deprecated in pandas 1.3;
    # on_bad_lines="skip" is the newer spelling)
    df = pd.read_csv(data, sep=",", skiprows=1, error_bad_lines=False)
    keep_cols = ["Drug"]
    df = df[keep_cols]
    df.drop_duplicates(inplace=True)
    df.columns = ["label"]
    # set label to uppercase
    df["label"] = df["label"].str.upper()
    logger.info(df.shape)
    logger.info("\n {}", df.head())
    create_import(df=df, meta_id=meta_id)
    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (s:Drug) ASSERT s.label IS UNIQUE;"
    ]
    create_constraints(constraintCommands, meta_id)
def process():
    data = os.path.join(dataDir, FILE)
    gtex_df = pd.read_csv(data, sep="\t", skiprows=2)
    # create dataframe from tissue name columns
    tissue_names = list(gtex_df.columns)[2:]
    df = pd.DataFrame(tissue_names)
    df.columns = ["id"]
    df["name"] = df["id"]
    logger.info(df.shape)
    logger.info("\n {}", df.head())
    df.drop_duplicates(inplace=True)
    create_import(df=df, meta_id=meta_id)
    constraintCommands = [
        "CREATE CONSTRAINT ON (t:Tissue) ASSERT t.id IS UNIQUE",
        "CREATE CONSTRAINT ON (t:Tissue) ASSERT t.name IS UNIQUE",
    ]
    create_constraints(constraintCommands, meta_id)
def process():
    data = os.path.join(dataDir, FILE)
    df = pd.read_csv(data, sep="\t")
    keep_cols = [
        "ensembl_gene_id",
        "druggability_tier",
        "chr_b37",
        "start_b37",
        "end_b37",
        "description",
        "small_mol_druggable",
        "adme_gene",
        "bio_druggable",
        "hgnc_names",
    ]
    df = df[keep_cols]
    df.rename(
        columns={
            "ensembl_gene_id": "ensembl_id",
            "chr_b37": "chr",
            "start_b37": "start",
            "end_b37": "end",
            "hgnc_names": "name",
        },
        inplace=True,
    )
    logger.info(df.shape)
    logger.info("\n {}", df.head())
    create_import(df=df, meta_id=meta_id)
    # create constraints
    constraintCommands = [
        "CREATE INDEX ON :Gene(druggability_tier);",
        "CREATE CONSTRAINT ON (g:Gene) ASSERT g.ensembl_id IS UNIQUE",
        "CREATE INDEX ON :Gene(name)",
        "CREATE INDEX ON :Gene(chr)",
    ]
    create_constraints(constraintCommands, meta_id)
def process():
    data = os.path.join(dataDir, FILE)
    # some bad rows caused by extra commas so need to skip
    df = pd.read_csv(data, sep=",", skiprows=1, error_bad_lines=False)
    keep_cols = [
        "Gene",
        "Drug",
        "Guideline",
        "CPIC Level",
        "PharmGKB Level of Evidence",
        "PGx on FDA Label",
    ]
    df = df[keep_cols]
    df.drop_duplicates(inplace=True)
    df.columns = [
        "target",
        "source",
        "guideline",
        "cpic_level",
        "pharmgkb_level_of_evidence",
        "pgx_on_fda_label",
    ]
    # set label to uppercase
    df["source"] = df["source"].str.upper()
    logger.info(df.shape)
    logger.info("\n {}", df.head())
    create_import(df=df, meta_id=meta_id, import_type="load")
    load_text = f"""
        USING periodic commit 10000
        load CSV from "file:///rels/{meta_id}/{meta_id}.csv.gz" as row FIELDTERMINATOR ","
        WITH row,
        coalesce(row[2],"NA") as guideline_data,
        coalesce(row[3],"NA") as level_data,
        coalesce(row[4],"NA") as pharmgkb_data,
        coalesce(row[5],"NA") as pgx_data
        match (g:Gene{{name:row[0]}})
        match (d:Drug{{label:row[1]}})
        merge (g)<-[c:CPIC{{guideline:guideline_data,
            cpic_level:level_data,
            pharmgkb_level_of_evidence:pharmgkb_data,
            pgx_on_fda_label:pgx_data,
            _source:["CPIC"]}}]-(d)
        return count(g);
    """
    load_text = load_text.replace("\n", " ").replace("\t", " ")
    load_text = " ".join(load_text.split())
    create_constraints([load_text], meta_id)
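# Hedged aside: the two clean-up lines at the end of process() exist because
# the Cypher is authored as an indented triple-quoted string; collapsing all
# runs of whitespace leaves a single-line statement that can be shipped to
# the server as one command. Tiny illustration:
text = """
    MATCH (n)
        RETURN count(n);
"""
flat = " ".join(text.replace("\n", " ").replace("\t", " ").split())
print(flat)  # MATCH (n) RETURN count(n);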
def link():
    # map to ontologies
    load_text = []
    # efo
    load_text.append(
        f"""
        USING PERIODIC COMMIT 10000
        LOAD CSV FROM "file:///nodes/{args.name}/efo.csv" AS row FIELDTERMINATOR ","
        WITH row
        MATCH (e:Efo) where e.id = "http://www.ebi.ac.uk/efo/EFO_"+row[0]
        MATCH (d:Disease) where d.id = row[1]
        MERGE (e)<-[:MONDO_MAP_EFO{{_source:"Mondo"}}]-(d)
        return count(e) as efo_count
        """
    )
    # umls
    load_text.append(
        f"""
        USING PERIODIC COMMIT 10000
        LOAD CSV FROM "file:///nodes/{args.name}/umls.csv" AS row FIELDTERMINATOR ","
        WITH row
        MATCH (s:LiteratureTerm) where s.id = row[0]
        MATCH (d:Disease) where d.id = row[1]
        MERGE (s)<-[:MONDO_MAP_UMLS{{_source:"Mondo"}}]-(d)
        return count(s) as umls_count
        """
    )
    # umls using text
    load_text.append(
        f"""
        MATCH (l:LiteratureTerm)
        MATCH (d:Disease)
        WHERE toLower(l.name) = toLower(d.label)
        MERGE (l)<-[:MONDO_MAP_UMLS{{_source:"Mondo"}}]-(d)
        RETURN count(d)
        """
    )
    load_text = [t.replace("\n", " ").replace("\t", " ") for t in load_text]
    # load_text = " ".join(load_text.split())
    create_constraints(load_text, args.name)
def process():
    df1 = pd.read_csv(os.path.join(dataDir, FILE1), sep=" ")
    # filter by score
    df1 = df1[df1["combined_score"] >= 700]
    logger.info(df1.shape)
    logger.info("\n {}", df1.head())
    p1 = list(df1["protein1"])
    p2 = list(df1["protein2"])
    dfp = pd.DataFrame(p1 + p2)
    dfp.drop_duplicates(inplace=True)
    dfp.columns = ["protein"]
    logger.info(dfp.shape)
    logger.info("\n {}", dfp.head())
    df2 = pd.read_csv(os.path.join(dataDir, FILE2), sep="\t")
    df2.columns = ["species", "uniprot", "protein", "x", "y"]
    df2["uniprot"] = df2["uniprot"].str.split("|", expand=True)[0]
    logger.info(df2.shape)
    logger.info("\n {}", df2.head())
    # merge
    df_merge = dfp.merge(df2, left_on="protein", right_on="protein")
    df_merge = pd.DataFrame(df_merge["uniprot"])
    df_merge.columns = ["uniprot_id"]
    df_merge["name"] = df_merge["uniprot_id"]
    df_merge.drop_duplicates(inplace=True)
    logger.info(df_merge.shape)
    logger.info("\n {}", df_merge.head())
    create_import(df=df_merge, meta_id=args.name)
    constraintCommands = [
        "CREATE CONSTRAINT ON (p:Protein) ASSERT p.uniprot_id IS UNIQUE",
        "CREATE index on :Protein(name);",
    ]
    create_constraints(constraintCommands, meta_id)
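# Hedged aside: the uniprot column in the mapping file appears to hold values
# like "P31946|1433B_HUMAN" (accession plus entry name); expand=True splits
# into columns and [0] keeps the accession. Toy illustration with made-up
# rows, not the real file:
import pandas as pd

s = pd.Series(["P31946|1433B_HUMAN", "P62258|1433E_HUMAN"])
print(s.str.split("|", expand=True)[0].tolist())  # ['P31946', 'P62258']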
def process():
    logger.info("loading semrep data...{}", SEM)
    sem_df = pd.read_csv(os.path.join(dataDir, SEM), sep=",", compression="gzip")
    # need to deal with cases where there is no id and only a gene_id
    logger.info("Dealing with IDs")
    sub_id = sem_df.apply(lambda row: make_id(row, "sub"), axis=1)
    obj_id = sem_df.apply(lambda row: make_id(row, "obj"), axis=1)
    sem_df["sub_id_all"] = sub_id
    sem_df["obj_id_all"] = obj_id
    logger.info("\n{}", sem_df)
    logger.info(sem_df.shape)
    # columns of interest (not actually used below)
    keep_cols = [
        "sub_name",
        "sub_type",
        "sub_id_all",
        "obj_id_all",
        "obj_name",
        "obj_type",
    ]
    # need to split subject and object ids by ,
    logger.info(sem_df.shape)
    sem_df = (
        sem_df.assign(sub_id_all=sem_df.sub_id_all.str.split(","))
        .explode("sub_id_all")
        .reset_index(drop=True)
    )
    logger.info(sem_df.shape)
    sem_df = (
        sem_df.assign(obj_id_all=sem_df.obj_id_all.str.split(","))
        .explode("obj_id_all")
        .reset_index(drop=True)
    )
    logger.info(sem_df.shape)
    # create series of subject/object names
    term_names = pd.concat([sem_df["sub_name"], sem_df["obj_name"]])
    # create series of subject/object types
    term_types = pd.concat([sem_df["sub_type"], sem_df["obj_type"]])
    # create series of subject/object ids
    term_ids = pd.concat([sem_df["sub_id_all"], sem_df["obj_id_all"]])
    # create new df
    term_df = pd.concat([term_names, term_types, term_ids], axis=1)
    term_df.columns = ["name", "type", "id"]
    term_df.drop_duplicates(inplace=True)
    logger.info("\n{}", term_df)
    logger.info(term_df.shape)
    # some ids have multiple types - need to create a list of types for each ID
    # make a map of id to types
    id_type_dic = term_df.groupby(["id"])["type"].apply(list).to_dict()
    # make lists unique
    for i in id_type_dic:
        id_type_dic[i] = ";".join(list(set(id_type_dic[i])))
    # add type lists back as ; separated array
    term_df["type"] = term_df["id"].map(id_type_dic)
    term_df.drop_duplicates(inplace=True)
    logger.info("\n {}", term_df)
    create_import(df=term_df, meta_id=args.name)
    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (s:LiteratureTerm) ASSERT s.id IS UNIQUE",
        "CREATE index on :LiteratureTerm(type);",
        "CREATE index on :LiteratureTerm(name);",
        'match (l:LiteratureTerm) match (g:Gene) where toLower(g.name) = toLower(l.name) merge (l)-[:TERM_TO_GENE{_source:"LiteratureTerm"}]->(g) return count(g);',
    ]
    create_constraints(constraintCommands, meta_id)
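# Hedged aside: the groupby/apply(list) step above builds an id -> types map
# and the follow-up loop deduplicates each list into a ";"-separated string.
# The same effect in a single agg pass (sorted here only to make the output
# deterministic; the original set() ordering is arbitrary):
import pandas as pd

toy = pd.DataFrame({"id": ["C1", "C1", "C2"], "type": ["gngm", "aapp", "dsyn"]})
id_type_dic = (
    toy.groupby("id")["type"].agg(lambda t: ";".join(sorted(set(t)))).to_dict()
)
print(id_type_dic)  # {'C1': 'aapp;gngm', 'C2': 'dsyn'}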
def process():
    print("Processing")
    csv_data = []
    print("Reading")
    masterOnt = create_ontoDic()
    with open(os.path.join(dataDir, FILE)) as json_file:
        data = json.load(json_file)
        for d in data["graphs"][0]["nodes"]:
            # only CLASS types
            mondo_type = d["type"]
            if mondo_type != "CLASS":
                continue
            # skip deprecated
            if d.get("meta", {}).get("deprecated") == True:
                continue
            # logger.debug(d)
            if "lbl" not in d:
                continue
            mondo_id = d["id"]
            mondo_label = d["lbl"]
            definition = "NA"
            xrefs = []
            ontoDic = create_ontoDic()
            if "meta" in d:
                if "definition" in d["meta"]:
                    definition = (
                        d["meta"]["definition"]["val"]
                        .replace('"', "")
                        .replace("\n", " ")
                    )
                if "xrefs" in d["meta"]:
                    for x in d["meta"]["xrefs"]:
                        xrefs.append(x["val"])
                    for i in xrefs:
                        ont, val = i.split(":", 1)
                        if ont.lower() in masterOnt:
                            masterOnt[ont.lower()].append(
                                {"val": val, "mondo": mondo_id}
                            )
                    ontoDic = create_ontology_properties(xrefs)
            # print(mondo_id, mondo_label, definition, xrefs)
            # print(list(ontoDic.values()))
            oList = [";".join(o) for o in list(ontoDic.values())]
            # print("\t".join(oList))
            row = [mondo_id, mondo_label, definition]
            row.extend(oList)
            csv_data.append(row)
    # print(masterOnt)
    for m in masterOnt:
        with open(os.path.join(dataDir, m + ".csv"), "w") as o:
            for i in masterOnt[m]:
                o.write(i["val"] + "," + i["mondo"] + "\n")
    # create csv file
    df = pd.DataFrame(csv_data)
    col_names = ["id", "label", "definition"]
    ontoLabels = list(ontoDic.keys())
    col_names.extend(ontoLabels)
    df.columns = col_names
    print(df.head())
    create_import(df=df, meta_id=meta_id)
    # create the constraints and indexes
    constraintCommands = [
        "CREATE CONSTRAINT ON (d:Disease) ASSERT d.id IS UNIQUE",
        "CREATE index on :Disease(label);",
        "CREATE index on :Disease(doid);",
    ]
    create_constraints(constraintCommands, meta_id)
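# Hedged aside: Mondo xrefs arrive as "SOURCE:value" strings, and the
# split(":", 1) above splits on the first colon only, so any further colons
# stay inside the value. Toy values, the last one synthetic:
for xref in ["DOID:14330", "UMLS:C0030567", "EXAMPLE:has:colons"]:
    ont, val = xref.split(":", 1)
    print(ont.lower(), val)
# doid 14330
# umls C0030567
# example has:colons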
def process():
    # load predication data
    logger.info("loading data {}", FILE)
    df = pd.read_csv(os.path.join(dataDir, FILE), sep=",", compression="gzip")
    logger.info(df.shape)
    logger.info(df.head())
    # create series of subject/object names
    term_names = pd.concat([df["subject_name"], df["object_name"]])
    # create series of subject/object types
    term_types = pd.concat([df["subject_type"], df["object_type"]])
    # create series of subject/object ids
    term_ids = pd.concat([df["subject_id"], df["object_id"]])
    # create new df
    term_df = pd.concat([term_names, term_types, term_ids], axis=1)
    term_df.columns = ["name", "type", "id"]
    # split ids and names by |
    subject_ids = term_df.id.str.split("|")
    subject_names = term_df.name.str.split("|")
    # create dictionary of ids to names
    dic_list = list(map(dict, map(zip, subject_ids, subject_names)))
    id_name_dic = {}
    for d in dic_list:
        id_name_dic.update(d)
    # split the IDs by | onto new rows
    term_df = (
        term_df.assign(id=term_df.id.str.split("|"))
        .explode("id")
        .reset_index(drop=True)
    )
    logger.info(term_df.shape)
    logger.info("\n {}", term_df)
    # some ids have multiple types - need to create a list of types for each ID
    # make a map of id to types
    id_type_dic = term_df.groupby(["id"])["type"].apply(list).to_dict()
    # make lists unique
    for i in id_type_dic:
        id_type_dic[i] = ";".join(list(set(id_type_dic[i])))
    # annoyingly, ids can have multiple names too - so do the same
    ### note - this has been avoided by the id-name dictionary above
    # make a map of id to names
    # id_name_dic = term_df.groupby(["id"])["name"].apply(list).to_dict()
    # make lists unique
    # for i in id_name_dic:
    #     l = ";".join(list(set(id_name_dic[i])))
    #     id_name_dic[i] = l
    # create counts by id
    # term_df = pd.DataFrame({"count": term_df.groupby(["id"]).size()}).reset_index()
    # add type lists back as ; separated array
    term_df["type"] = term_df["id"].map(id_type_dic)
    logger.info("\n {}", term_df)
    # add names back
    term_df["name"] = term_df["id"].map(id_name_dic)
    logger.info("\n {}", term_df)
    term_df.drop_duplicates(inplace=True)
    create_import(df=term_df, meta_id=args.name)
    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (s:LiteratureTerm) ASSERT s.id IS UNIQUE",
        "CREATE index on :LiteratureTerm(type);",
        "CREATE index on :LiteratureTerm(name);",
        'match (l:LiteratureTerm) match (g:Gene) where toLower(g.name) = toLower(l.name) merge (l)-[:TERM_TO_GENE{_source:"LiteratureTerm"}]->(g) return count(g);',
    ]
    create_constraints(constraintCommands, meta_id)
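# Hedged aside: the map/zip/dict chain above pairs the i-th pipe-split id
# with the i-th pipe-split name row by row, then folds every row's pairs
# into one lookup (later rows win on conflicts). Toy illustration:
ids = [["C1", "C2"], ["C3"]]
names = [["aspirin", "acetylsalicylic acid"], ["pain"]]
dic_list = list(map(dict, map(zip, ids, names)))
id_name_dic = {}
for d in dic_list:
    id_name_dic.update(d)
print(id_name_dic)
# {'C1': 'aspirin', 'C2': 'acetylsalicylic acid', 'C3': 'pain'}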
def process():
    logger.info("loading semrep data... {}", SEM)
    sem_df = pd.read_csv(os.path.join(dataDir, SEM), sep=",", compression="gzip")
    # create new ids, if a gene standard id is empty
    logger.info("Dealing with IDs")
    sub_id = sem_df.apply(lambda row: make_id(row, "sub"), axis=1)
    obj_id = sem_df.apply(lambda row: make_id(row, "obj"), axis=1)
    sem_df["sub_id_all"] = sub_id
    sem_df["obj_id_all"] = obj_id
    # create new names, if a gene standard name is empty
    sub_name = sem_df.apply(lambda row: make_name(row, "sub"), axis=1)
    obj_name = sem_df.apply(lambda row: make_name(row, "obj"), axis=1)
    sem_df["sub_name_all"] = sub_name
    sem_df["obj_name_all"] = obj_name
    # need to split subject and object ids by ,
    logger.info(sem_df.shape)
    sem_df = (
        sem_df.assign(sub_id_all=sem_df.sub_id_all.str.split(","))
        .explode("sub_id_all")
        .reset_index(drop=True)
    )
    logger.info(sem_df.shape)
    sem_df = (
        sem_df.assign(obj_id_all=sem_df.obj_id_all.str.split(","))
        .explode("obj_id_all")
        .reset_index(drop=True)
    )
    logger.info(sem_df.shape)
    sem_id = sem_df["sub_id_all"] + ":" + sem_df["pred"] + ":" + sem_df["obj_id_all"]
    sem_name = (
        sem_df["sub_name_all"] + " " + sem_df["pred"] + " " + sem_df["obj_name_all"]
    )
    logger.debug(sem_id)
    sem_df["id"] = sem_id
    sem_df["name"] = sem_name
    logger.info("\n{}", sem_df)
    logger.info(sem_df.shape)
    # keep and rename the columns needed for the import
    keep_cols = ["sub_id_all", "pred", "obj_id_all", "id", "name"]
    sem_df = sem_df[keep_cols]
    sem_df.rename(
        columns={
            "pred": "predicate",
            "sub_id_all": "subject_id",
            "obj_id_all": "object_id",
        },
        inplace=True,
    )
    sem_df.drop_duplicates(subset=["id"], inplace=True)
    logger.info("\n{}", sem_df)
    logger.info(sem_df.shape)
    # drop nas/rows with empty string
    sem_df.replace("", np.nan, inplace=True)
    sem_df.dropna(inplace=True)
    logger.info(sem_df.shape)
    create_import(df=sem_df, meta_id=args.name)
    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (s:LiteratureTriple) ASSERT s.id IS UNIQUE",
        "CREATE INDEX ON :LiteratureTriple(name);",
        "CREATE INDEX ON :LiteratureTriple(subject_id);",
        "CREATE INDEX ON :LiteratureTriple(object_id);",
        "CREATE INDEX ON :LiteratureTriple(predicate);",
    ]
    create_constraints(constraintCommands, meta_id)
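# Hedged aside: make_id and make_name are not defined in this module. Based
# only on how they are used above (and the comments: fall back when the
# standard SemRep id/name is empty), a hypothetical reading might look like
# the sketch below; the sub_/obj_-prefixed column names are assumptions and
# the real helpers may well differ:
def sketch_make_id(row, prefix):
    # prefer the standard concept id; otherwise fall back to the gene id
    standard = row.get(f"{prefix}_id", "")  # hypothetical column name
    gene = row.get(f"{prefix}_gene_id", "")  # hypothetical column name
    return standard if standard else gene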