Example #1
def gene():
    FILE = get_source(meta_id, 1)
    data = os.path.join(dataDir, FILE)
    df = pd.read_csv(data, sep="\t")
    # add column names
    col_names = [
        "chr",
        "type",
        "name",
        "description",
        "biomart_source",
        "ensembl_id",
        "start",
        "end",
    ]
    df.columns = col_names
    df.drop_duplicates(inplace=True)
    create_import(df=df, meta_id=meta_id)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (g:Gene) ASSERT g.ensembl_id IS UNIQUE",
        "CREATE INDEX ON :Gene(name)",
        "CREATE INDEX ON :Gene(chr)",
    ]
    create_constraints(constraintCommands, meta_id)
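Every example calls shared helpers that are not shown here (create_import, create_constraints, get_source) and assumes module-level imports (os, pandas as pd, numpy as np, a loguru-style logger) plus constants such as dataDir, FILE and meta_id. A minimal sketch of the two core helpers, assuming a gzipped-CSV import area and the official neo4j Python driver; the paths, credentials and signatures below are illustrative only:

import os
import pandas as pd
from neo4j import GraphDatabase  # assumed client; the project may use a different one

def create_import(df: pd.DataFrame, meta_id: str, import_type: str = "nodes"):
    # write the processed dataframe as a gzipped CSV into the Neo4j import area (assumed layout)
    out_dir = os.path.join("import", import_type, meta_id)
    os.makedirs(out_dir, exist_ok=True)
    df.to_csv(os.path.join(out_dir, f"{meta_id}.csv.gz"), index=False, compression="gzip")

def create_constraints(commands, meta_id):
    # meta_id kept only to mirror the call signature used in the examples
    # run each Cypher statement (constraint, index or load query) against the graph
    driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))
    with driver.session() as session:
        for cmd in commands:
            session.run(cmd)
    driver.close()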
Example #2
def process():
    # load predicate data
    logger.info("loading predication data...")
    pred_df = pd.read_csv(os.path.join(dataDir, PREDICATION_FILE),
                          sep=",",
                          compression="gzip")
    pred_df["PMID"] = pred_df["PMID"].astype(str)

    logger.info("loading citation data...")
    df = pd.read_csv(os.path.join(dataDir, PUB_FILE),
                     sep="\t",
                     compression="gzip")
    df.columns = ["id", "issn", "dp", "edat", "year"]
    df["id"] = df["id"].str.replace("'", "")
    logger.info(df.shape)

    # merge with predication data
    df_merge = df.merge(pred_df["PMID"], left_on="id", right_on="PMID")
    logger.info(df_merge.shape)
    # drop PMID column
    df_merge.drop("PMID", inplace=True, axis=1)
    # make unique
    df_merge.drop_duplicates(inplace=True)
    logger.info(df_merge.shape)

    logger.info("\n {}", df_merge.head())

    create_import(df=df_merge, meta_id=args.name)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (s:Literature) ASSERT s.id IS UNIQUE;",
    ]
    create_constraints(constraintCommands, meta_id)
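Example #2 filters the citation table to PMIDs present in the predication data by merging against a named Series; a small self-contained illustration of that pattern with toy data (inner join, then the fan-out from duplicate PMIDs is collapsed by drop_duplicates):

import pandas as pd

citations = pd.DataFrame({"id": ["1", "2", "3"], "year": [2001, 2005, 2010]})
pmids = pd.Series(["2", "3", "3"], name="PMID")  # duplicates fan the merge out

merged = citations.merge(pmids, left_on="id", right_on="PMID")  # keeps ids 2 and 3
merged = merged.drop(columns="PMID").drop_duplicates()
print(merged)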
Example #3
def process():
    # select the file
    FILE = get_source(meta_id, 1)
    logger.info("Reading {}", FILE)
    df = pd.read_csv(os.path.join(dataDir, FILE))
    # logger.info(df.columns)
    logger.info(df.shape)

    # drop some columns
    df.drop(["access", "priority", "coverage", ""],
            axis=1,
            inplace=True,
            errors="ignore")
    logger.info(df.shape)

    # create the csv and import data
    create_import(df=df, meta_id=meta_id)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (g:Gwas) ASSERT g.id IS UNIQUE",
        "CREATE index on :Gwas(trait)",
        "CREATE index on :Gwas(filename)",
    ]
    create_constraints(constraintCommands, meta_id)
Example #4
def process():
    merge = merge_data(BIO_DATA, BIO_SEM)
    logger.info(merge.shape)
    create_import(df=merge, meta_id=args.name)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (s:Literature) ASSERT s.id IS UNIQUE;",
    ]
    create_constraints(constraintCommands, meta_id)
Example #5
def protein():
    data = os.path.join(dataDir, FILE)
    df = pd.read_csv(data, sep="\t")
    df.columns = ["uniprot_id"]
    df["name"] = df["uniprot_id"]
    create_import(df=df, meta_id=meta_id)

    constraintCommands = [
        "CREATE CONSTRAINT ON (p:Protein) ASSERT p.uniprot_id IS UNIQUE",
        "CREATE index on :Protein(name);",
    ]
    create_constraints(constraintCommands, meta_id)
Example #6
def process():
    FILE = get_source(meta_id, 1)
    df = pd.read_csv(os.path.join(dataDir, FILE), low_memory=False)
    df = df[["rsid"]].drop_duplicates()
    # change column name to match schema
    df.rename(columns={"rsid": "name"}, inplace=True)

    create_import(df=df, meta_id=meta_id)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (v:Variant) ASSERT v.name IS UNIQUE;",
    ]
    create_constraints(constraintCommands, meta_id)
Example #7
def process():
    df = pd.read_csv(os.path.join(dataDir, FILE))
    logger.info(df.head())
    keep_cols = ["source_id"]
    df = df[keep_cols]
    df.rename(columns={"source_id": "uniprot_id"}, inplace=True)
    df["name"] = df["uniprot_id"]
    df.drop_duplicates(inplace=True)
    logger.info(df.head())

    create_import(df=df, meta_id=meta_id)
    constraintCommands = [
        "CREATE CONSTRAINT ON (p:Protein) ASSERT p.uniprot_id IS UNIQUE",
        "CREATE index on :Protein(name);",
    ]
    create_constraints(constraintCommands, meta_id)
Example #8
def variant():
    print("Reading...", FILE)
    df = pd.read_csv(os.path.join(dataDir, FILE))

    print("Writing...")
    # create csv
    df.rename(columns={"rsid": "name"}, inplace=True)
    df = df[["name"]].drop_duplicates()
    create_import(df=df, meta_id=meta_id)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (v:Variant) ASSERT v.name IS UNIQUE;",
        "CREATE index on :Variant(chr);",
        "CREATE index on :Variant(pos);",
    ]
    create_constraints(constraintCommands, meta_id)
Example #9
def process():
    # load predicate data
    logger.info("loading data...")
    df = pd.read_csv(os.path.join(dataDir, PREDICATION_FILE),
                     sep=",",
                     compression="gzip")
    logger.info(df.shape)

    # need to split subject and object ids by |
    df = (df.assign(subject_id=df.subject_id.str.split("|")).explode(
        "subject_id").reset_index(drop=True))
    logger.info(df.shape)
    df = (df.assign(object_id=df.object_id.str.split("|")).explode(
        "object_id").reset_index(drop=True))
    logger.info(df.shape)
    logger.info("\n {}", df)

    df["id"] = df["subject_id"] + ":" + df["predicate"] + ":" + df["object_id"]
    df["name"] = df["subject_name"] + " " + df["predicate"] + " " + df[
        "object_name"]

    # keep_cols = ["predicate","subject_name","object_name","subject_type","object_type","subject_id","object_id","id"]
    keep_cols = ["predicate", "subject_id", "object_id", "id", "name"]

    #df = pd.DataFrame({"count": df.groupby(keep_cols).size()}).reset_index()
    df = df[keep_cols]
    df.drop_duplicates(subset=['id'], inplace=True)
    logger.info(df.shape)
    logger.info("\n {}", df.head())

    #drop nas/rows with empty string
    df.replace('', np.nan, inplace=True)
    df.dropna(inplace=True)

    create_import(df=df, meta_id=meta_id)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (s:LiteratureTriple) ASSERT s.id IS UNIQUE",
        "CREATE INDEX ON :LiteratureTriple(name);",
        "CREATE INDEX ON :LiteratureTriple(subject_id);",
        "CREATE INDEX ON :LiteratureTriple(object_id);",
        "CREATE INDEX ON :LiteratureTriple(predicate);",
    ]
    create_constraints(constraintCommands, meta_id)
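The assign/explode idiom above (also used in Examples #18, #20 and #21) splits a delimiter-separated ID column onto separate rows; a minimal illustration with toy data:

import pandas as pd

df = pd.DataFrame({"subject_id": ["C001|C002", "C003"], "predicate": ["TREATS", "CAUSES"]})
df = (df.assign(subject_id=df.subject_id.str.split("|"))
        .explode("subject_id")
        .reset_index(drop=True))
print(df)  # three rows: C001, C002, C003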
Example #10
def process():
    data = os.path.join(dataDir, FILE)
    df = pd.read_csv(data, sep=",")
    logger.info("\n {}", df.head())
    keep_cols = ["molecule_name", "molecule_type", "chembl_uri"]
    df = df[keep_cols]
    col_names = ["label", "molecule_type", "id"]
    df.columns = col_names
    df.drop_duplicates(inplace=True)
    # set label to uppercase
    df["label"] = df["label"].str.upper()
    logger.info(df.shape)
    logger.info("\n {}", df.head())
    create_import(df=df, meta_id=meta_id)

    # create constraints
    constraintCommands = ["CREATE index on :Drug(label);"]
    create_constraints(constraintCommands, meta_id)
Example #11
def process():
    df = pd.read_csv(os.path.join(dataDir, FILE))
    logger.info(df.head())
    keep_cols = ["reactome_id", "name"]
    df = df[keep_cols]
    df["url"] = "https://reactome.org/PathwayBrowser/#/" + df["reactome_id"]
    df.rename(columns={"reactome_id": "id"}, inplace=True)
    df.drop_duplicates(inplace=True)
    logger.info(df["url"].head())

    create_import(df=df, meta_id=meta_id)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (p:Pathway) ASSERT p.id IS UNIQUE",
        "CREATE index on :Pathway(name);",
    ]
    create_constraints(constraintCommands, meta_id)
Example #12
def process():
    data = os.path.join(dataDir, FILE)
    # some bad rows caused by extra commas so need to skip
    df = pd.read_csv(data, sep=",", skiprows=1, error_bad_lines=False)
    keep_cols = ["Drug"]
    df = df[keep_cols]
    df.drop_duplicates(inplace=True)
    df.columns = ["label"]
    # set label to uppercase
    df["label"] = df["label"].str.upper()
    logger.info(df.shape)
    logger.info("\n {}", df.head())
    create_import(df=df, meta_id=meta_id)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (s:Drug) ASSERT s.label IS UNIQUE;"
    ]
    create_constraints(constraintCommands, meta_id)
Example #13
def process():
    data = os.path.join(dataDir, FILE)
    gtex_df = pd.read_csv(data, sep="\t", skiprows=2)

    # create dataframe from tissue name columns
    tissue_names = list(gtex_df.columns)[2:]
    df = pd.DataFrame(tissue_names)
    df.columns = ["id"]
    df['name'] = df['id']

    logger.info(df.shape)
    logger.info("\n {}", df.head())
    df.drop_duplicates(inplace=True)
    create_import(df=df, meta_id=meta_id)

    constraintCommands = [
        "CREATE CONSTRAINT ON (t:Tissue) ASSERT t.id IS UNIQUE",
        "CREATE CONSTRAINT ON (t:Tissue) ASSERT t.name IS UNIQUE",
    ]
    create_constraints(constraintCommands, meta_id)
Example #14
def process():
    data = os.path.join(dataDir, FILE)
    df = pd.read_csv(data, sep="\t")

    keep_cols = [
        "ensembl_gene_id",
        "druggability_tier",
        "chr_b37",
        "start_b37",
        "end_b37",
        "description",
        "small_mol_druggable",
        "adme_gene",
        "bio_druggable",
        "hgnc_names",
    ]

    df = df[keep_cols]
    df.rename(
        columns={
            "ensembl_gene_id": "ensembl_id",
            "chr_b37": "chr",
            "start_b37": "start",
            "end_b37": "end",
            "hgnc_names": "name",
        },
        inplace=True,
    )
    logger.info(df.shape)
    logger.info("\n {}", df.head())

    create_import(df=df, meta_id=meta_id)

    # create constraints
    constraintCommands = [
        "CREATE INDEX ON :Gene(druggability_tier);",
        "CREATE CONSTRAINT ON (g:Gene) ASSERT g.ensembl_id IS UNIQUE",
        "CREATE INDEX ON :Gene(name)",
        "CREATE INDEX ON :Gene(chr)",
    ]
    create_constraints(constraintCommands, meta_id)
Example #15
def process():
    data = os.path.join(dataDir, FILE)
    # some bad rows caused by extra commas so need to skip
    df = pd.read_csv(data, sep=",", skiprows=1, error_bad_lines=False)
    keep_cols = [
        "Gene",
        "Drug",
        "Guideline",
        "CPIC Level",
        "PharmGKB Level of Evidence",
        "PGx on FDA Label",
    ]
    df = df[keep_cols]
    df.drop_duplicates(inplace=True)
    df.columns = [
        "target",
        "source",
        "guideline",
        "cpic_level",
        "pharmgkb_level_of_evidence",
        "pgx_on_fda_label",
    ]
    # set label to uppercase
    df["source"] = df["source"].str.upper()
    logger.info(df.shape)
    logger.info("\n {}", df.head())
    create_import(df=df, meta_id=meta_id, import_type="load")

    load_text = f"""
		USING periodic commit 10000 
		load CSV from "file:///rels/{meta_id}/{meta_id}.csv.gz" as row FIELDTERMINATOR "," 
		WITH row, coalesce(row[2],"NA") as guideline_data, coalesce(row[3],"NA") as level_data, coalesce(row[4],"NA") as pharmgkb_data, coalesce(row[5],"NA") as pgx_data 
		match  (g:Gene{{name:row[0]}}) match (d:Drug{{label:row[1]}})
		merge (g)<-[c:CPIC{{guideline:guideline_data, cpic_level:level_data, pharmgkb_level_of_evidence:pharmgkb_data, pgx_on_fda_label:pgx_data,_source:["CPIC"]}}]-(d)
        return count(g);
		"""
    load_text = load_text.replace("\n", " ").replace("\t", " ")
    load_text = " ".join(load_text.split())

    create_constraints([load_text], meta_id)
Example #16
def link():
    # map to ontologies
    load_text = []
    # efo
    load_text.append(f"""
        USING PERIODIC COMMIT 10000 
		LOAD CSV FROM "file:///nodes/{args.name}/efo.csv" AS row FIELDTERMINATOR "," 
		WITH row 
		MATCH (e:Efo) where e.id = "http://www.ebi.ac.uk/efo/EFO_"+row[0] MATCH (d:Disease) where d.id = row[1] 
		MERGE (e)<-[:MONDO_MAP_EFO{{_source:"Mondo"}}]-(d) return count(e) as efo_count
        """)
    # umls
    load_text.append(f"""
        USING PERIODIC COMMIT 10000 
		LOAD CSV FROM "file:///nodes/{args.name}/umls.csv" AS row FIELDTERMINATOR "," 
		WITH row 
		MATCH (s:LiteratureTerm) where s.id = row[0] MATCH (d:Disease) where d.id = row[1] 
		MERGE (s)<-[:MONDO_MAP_UMLS{{_source:"Mondo"}}]-(d) return count(s) as umls_count
        """)

    #umls using text
    load_text.append(f"""
        MATCH 
            (l:LiteratureTerm) 
        MATCH
            (d:Disease) 
        WHERE 
            toLower(l.name) = toLower(d.label) 
        MERGE 
            (l)<-[:MONDO_MAP_UMLS{{_source:"Mondo"}}]-(d)
        RETURN
            count(d)
        """)

    load_text = [t.replace("\n", " ").replace("\t", " ") for t in load_text]
    # load_text = " ".join(load_text.split())

    create_constraints(load_text, args.name)
Example #17
def process():
    df1 = pd.read_csv(os.path.join(dataDir, FILE1), sep=" ")
    # filter by score
    df1 = df1[df1["combined_score"] >= 700]
    logger.info(df1.shape)
    logger.info("\n {}", df1.head())

    p1 = list(df1["protein1"])
    p2 = list(df1["protein2"])
    dfp = pd.DataFrame(p1 + p2)
    dfp.drop_duplicates(inplace=True)
    dfp.columns = ["protein"]
    logger.info(dfp.shape)
    logger.info("\n {}", dfp.head())

    df2 = pd.read_csv(os.path.join(dataDir, FILE2), sep="\t")
    df2.columns = ["species", "uniprot", "protein", "x", "y"]
    df2["uniprot"] = df2["uniprot"].str.split("|", expand=True)[0]
    logger.info(df2.shape)
    logger.info("\n {}", df2.head())

    # merge
    df_merge = dfp.merge(df2, left_on="protein", right_on="protein")
    df_merge = pd.DataFrame(df_merge["uniprot"])
    df_merge.columns = ["uniprot_id"]
    df_merge['name'] = df_merge["uniprot_id"]
    df_merge.drop_duplicates(inplace=True)
    logger.info(df_merge.shape)
    logger.info("\n {}", df_merge.head())
    create_import(df=df_merge, meta_id=args.name)

    constraintCommands = [
        "CREATE CONSTRAINT ON (p:Protein) ASSERT p.uniprot_id IS UNIQUE",
        "CREATE index on :Protein(name);",
    ]
    create_constraints(constraintCommands, meta_id)
Example #18
def process():
    logger.info("loading semrep data...{}", SEM)
    sem_df = pd.read_csv(os.path.join(dataDir, SEM),
                         sep=",",
                         compression="gzip")

    #need to deal with cases where there is no id and only a gene_id
    logger.info('Dealing with IDs')
    sub_id = sem_df.apply(lambda row: make_id(row, 'sub'), axis=1)
    obj_id = sem_df.apply(lambda row: make_id(row, 'obj'), axis=1)
    sem_df['sub_id_all'] = sub_id
    sem_df['obj_id_all'] = obj_id

    logger.info("\n{}", sem_df)
    logger.info(sem_df.shape)

    # columns of interest
    keep_cols = [
        'sub_name', 'sub_type', 'sub_id_all',
        'obj_id_all', 'obj_name', 'obj_type',
    ]

    # need to split subject and object ids by ,
    logger.info(sem_df.shape)
    sem_df = (sem_df.assign(sub_id_all=sem_df.sub_id_all.str.split(
        ",")).explode("sub_id_all").reset_index(drop=True))
    logger.info(sem_df.shape)
    sem_df = (sem_df.assign(obj_id_all=sem_df.obj_id_all.str.split(
        ",")).explode("obj_id_all").reset_index(drop=True))
    logger.info(sem_df.shape)

    # create series of subject/object names
    term_names = pd.concat([sem_df["sub_name"], sem_df["obj_name"]])
    # create series of subject/object types
    term_types = pd.concat([sem_df["sub_type"], sem_df["obj_type"]])
    # create series of subject/object ids
    term_ids = pd.concat([sem_df["sub_id_all"], sem_df["obj_id_all"]])
    # create new df
    term_df = pd.concat([term_names, term_types, term_ids], axis=1)
    term_df.columns = ["name", "type", "id"]

    term_df.drop_duplicates(inplace=True)
    logger.info("\n{}", term_df)
    logger.info(term_df.shape)

    # some ids have multiple types - need to create a list of types for each ID
    # make a map of id to types
    id_type_dic = term_df.groupby(["id"])["type"].apply(list).to_dict()
    # make lists unique
    for i in id_type_dic:
        l = ";".join(list(set(id_type_dic[i])))
        id_type_dic[i] = l
    # add type lists back as ; separated array
    term_df["type"] = term_df["id"].map(id_type_dic)
    term_df.drop_duplicates(inplace=True)
    logger.info("\n {}", term_df)

    create_import(df=term_df, meta_id=args.name)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (s:LiteratureTerm) ASSERT s.id IS UNIQUE",
        "CREATE index on :LiteratureTerm(type);",
        "CREATE index on :LiteratureTerm(name);",
        "match (l:LiteratureTerm ) match (g:Gene) where toLower(g.name) = toLower(l.name) merge (l)-[:TERM_TO_GENE{_source:\"LiteratureTerm\"}]->(g) return count(g);",
    ]
    create_constraints(constraintCommands, meta_id)
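Examples #18 and #21 rely on a make_id helper (and #21 on a matching make_name) that is not shown. A hypothetical sketch of the fallback behaviour the comments describe, preferring the standard concept ID and otherwise building one from the gene ID; the column names here are assumptions:

import pandas as pd

def make_id(row: pd.Series, prefix: str) -> str:
    # hypothetical: use the standard SemRep concept ID when present, else fall back to the gene ID column
    std_id = row.get(f"{prefix}_id")
    if pd.notna(std_id) and std_id != "":
        return str(std_id)
    return str(row.get(f"{prefix}_gene_id", ""))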
Example #19
def process():
    print("Processing")
    csv_data = []
    print("Reading")
    masterOnt = create_ontoDic()
    with open(os.path.join(dataDir, FILE)) as json_file:
        data = json.load(json_file)
        for d in data["graphs"][0]["nodes"]:
            # only CLASS types
            mondo_type = d["type"]
            if mondo_type != "CLASS":
                continue
            # skip deprecated
            if "meta" in d:
                if "deprecated" in d["meta"]:
                    if d["meta"]["deprecated"] == True:
                        continue
            #logger.debug(d)
            if "lbl" not in d:
                continue
            mondo_id = d["id"]
            mondo_label = d["lbl"]
            definition = "NA"
            xrefs = []
            ontoDic = create_ontoDic()
            if "meta" in d:
                if "definition" in d["meta"]:
                    definition = d["meta"]["definition"]["val"].replace(
                        '"', "").replace('\n', ' ')
                if "xrefs" in d["meta"]:
                    for x in d["meta"]["xrefs"]:
                        xrefs.append(x["val"])
                    for i in xrefs:
                        ont, val = i.split(":", 1)
                        if ont.lower() in masterOnt:
                            masterOnt[ont.lower()].append({
                                "val": val,
                                "mondo": mondo_id
                            })
                    ontoDic = create_ontology_properties(xrefs)
            # print(mondo_id,mondo_label,definition,xrefs)
            # print(list(ontoDic.values()))
            oList = [";".join(o) for o in list(ontoDic.values())]
            # print("\t".join(oList))
            row = [mondo_id, mondo_label, definition]
            row.extend(oList)
            csv_data.append(row)

    # print(masterOnt)
    for m in masterOnt:
        with open(os.path.join(dataDir, m + ".csv"), "w") as o:
            for i in masterOnt[m]:
                o.write(i["val"] + "," + i["mondo"] + "\n")

    # create csv file
    df = pd.DataFrame(csv_data)

    col_names = ["id", "label", "definition"]
    ontoLabels = list(ontoDic.keys())
    col_names.extend(ontoLabels)
    df.columns = col_names
    print(df.head())
    create_import(df=df, meta_id=meta_id)

    # create the constraints and indexes
    constraintCommands = [
        "CREATE CONSTRAINT ON (d:Disease) ASSERT d.id IS UNIQUE",
        "CREATE index on :Disease(label);",
        "CREATE index on :Disease(doid);",
    ]
    create_constraints(constraintCommands, meta_id)
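Example #19 assumes two helpers, create_ontoDic and create_ontology_properties. A hypothetical sketch of what they appear to do: a fresh dictionary keyed by ontology prefix, and a grouping of a node's xrefs by that prefix. The exact prefix list is an assumption (the doid, efo and umls keys are suggested by the Disease index and by Example #16):

def create_ontoDic() -> dict:
    # fresh dict with one empty list per ontology prefix of interest (assumed set)
    return {"doid": [], "efo": [], "umls": [], "mesh": [], "icd10": []}

def create_ontology_properties(xrefs):
    # group xref values such as "DOID:1234" by lower-cased prefix
    onto = create_ontoDic()
    for xref in xrefs:
        prefix, value = xref.split(":", 1)
        if prefix.lower() in onto:
            onto[prefix.lower()].append(value)
    return onto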
Example #20
def process():
    # load predicate data
    logger.info("loading data {}", FILE)
    df = pd.read_csv(os.path.join(dataDir, FILE), sep=",", compression="gzip")
    logger.info(df.shape)
    logger.info(df.head())

    # create series of subject/object names
    term_names = pd.concat([df["subject_name"], df["object_name"]])
    # create series of subject/object types
    term_types = pd.concat([df["subject_type"], df["object_type"]])
    # create series of subject/object ids
    term_ids = pd.concat([df["subject_id"], df["object_id"]])
    # create new df
    term_df = pd.concat([term_names, term_types, term_ids], axis=1)
    term_df.columns = ["name", "type", "id"]

    # split ids and names by |
    subject_ids = term_df.id.str.split("|")
    subject_names = term_df.name.str.split("|")
    # create dictionary of ids to names
    dic_list = list(map(dict, map(zip, subject_ids, subject_names)))
    id_name_dic = {}
    for d in dic_list:
        id_name_dic.update(d)

    # split the IDs by | onto new rows
    term_df = (term_df.assign(
        id=term_df.id.str.split("|")).explode("id").reset_index(drop=True))
    logger.info(term_df.shape)
    logger.info("\n {}", term_df)

    # some ids have multiple types - need to create a list of types for each ID
    # make a map of id to types
    id_type_dic = term_df.groupby(["id"])["type"].apply(list).to_dict()
    # make lists unique
    for i in id_type_dic:
        l = ";".join(list(set(id_type_dic[i])))
        id_type_dic[i] = l

    # annoyingly, ids can have multiple names too - so do the same
    ###note - this has been avoided by id-name dictionary above
    # make a map of id to names
    # id_name_dic=term_df.groupby( ['id'] )['name'].apply(list).to_dict()
    # make lists unique
    # for i in id_name_dic:
    #    l = ";".join(list(set(id_name_dic[i])))
    #    id_name_dic[i]=l

    # create counts by id
    #term_df = pd.DataFrame({"count": term_df.groupby(["id"]).size()}).reset_index()

    # add type lists back as ; separated array
    term_df["type"] = term_df["id"].map(id_type_dic)
    logger.info("\n {}", term_df)
    # add names back
    term_df["name"] = term_df["id"].map(id_name_dic)
    logger.info("\n {}", term_df)
    term_df.drop_duplicates(inplace=True)
    create_import(df=term_df, meta_id=args.name)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (s:LiteratureTerm) ASSERT s.id IS UNIQUE",
        "CREATE index on :LiteratureTerm(type);",
        "CREATE index on :LiteratureTerm(name);",
        "match (l:LiteratureTerm ) match (g:Gene) where toLower(g.name) = toLower(l.name) merge (l)-[:TERM_TO_GENE{_source:\"LiteratureTerm\"}]->(g) return count(g);",
    ]
    create_constraints(constraintCommands, meta_id)
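The id-to-types step in Examples #18 and #20 builds an explicit dictionary and maps it back onto the frame; an equivalent, more compact pandas form is sketched below with toy data (illustration only, the behaviour should match apart from deterministic ordering of the joined types):

import pandas as pd

term_df = pd.DataFrame({"id": ["C1", "C1", "C2"], "type": ["gngm", "aapp", "dsyn"]})
type_map = term_df.groupby("id")["type"].agg(lambda s: ";".join(sorted(set(s))))
term_df["type"] = term_df["id"].map(type_map)
print(term_df.drop_duplicates())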
Example #21
def process():
    logger.info("loading semrep data... {}", SEM)
    sem_df = pd.read_csv(os.path.join(dataDir, SEM),
                         sep=",",
                         compression="gzip")

    #create new ids, if a gene standard id is empty
    logger.info('Dealing with IDs')
    sub_id = sem_df.apply(lambda row: make_id(row, 'sub'), axis=1)
    obj_id = sem_df.apply(lambda row: make_id(row, 'obj'), axis=1)
    sem_df['sub_id_all'] = sub_id
    sem_df['obj_id_all'] = obj_id

    #create new names, if a gene standard name is empty
    sub_name = sem_df.apply(lambda row: make_name(row, 'sub'), axis=1)
    obj_name = sem_df.apply(lambda row: make_name(row, 'obj'), axis=1)
    sem_df['sub_name_all'] = sub_name
    sem_df['obj_name_all'] = obj_name

    # need to split subject and object ids by ,
    logger.info(sem_df.shape)
    sem_df = (sem_df.assign(sub_id_all=sem_df.sub_id_all.str.split(
        ",")).explode("sub_id_all").reset_index(drop=True))
    logger.info(sem_df.shape)
    sem_df = (sem_df.assign(obj_id_all=sem_df.obj_id_all.str.split(
        ",")).explode("obj_id_all").reset_index(drop=True))
    logger.info(sem_df.shape)

    sem_id = sem_df['sub_id_all'] + ':' + sem_df['pred'] + ':' + sem_df[
        'obj_id_all']
    sem_name = sem_df['sub_name_all'] + ' ' + sem_df['pred'] + ' ' + sem_df[
        'obj_name_all']
    logger.debug(sem_id)
    sem_df['id'] = sem_id
    sem_df['name'] = sem_name
    logger.info("\n{}", sem_df)
    logger.info(sem_df.shape)

    # merge
    keep_cols = ['sub_id_all', 'pred', 'obj_id_all', 'id', 'name']
    sem_df = sem_df[keep_cols]
    sem_df.rename(columns={
        'pred': 'predicate',
        'sub_id_all': 'subject_id',
        'obj_id_all': 'object_id'
    },
                  inplace=True)
    sem_df.drop_duplicates(subset=['id'], inplace=True)
    logger.info("\n{}", sem_df)
    logger.info(sem_df.shape)

    #drop nas/rows with empty string
    sem_df.replace('', np.nan, inplace=True)
    sem_df.dropna(inplace=True)
    logger.info(sem_df.shape)

    create_import(df=sem_df, meta_id=args.name)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (s:LiteratureTriple) ASSERT s.id IS UNIQUE",
        "CREATE INDEX ON :LiteratureTriple(name);",
        "CREATE INDEX ON :LiteratureTriple(subject_id);",
        "CREATE INDEX ON :LiteratureTriple(object_id);",
        "CREATE INDEX ON :LiteratureTriple(predicate);",
    ]
    create_constraints(constraintCommands, meta_id)