Example #1
def gene():
    FILE = get_source(meta_id, 1)
    data = os.path.join(dataDir, FILE)
    df = pd.read_csv(data, sep="\t")
    # add column names
    col_names = [
        "chr",
        "type",
        "name",
        "description",
        "biomart_source",
        "ensembl_id",
        "start",
        "end",
    ]
    df.columns = col_names
    df.drop_duplicates(inplace=True)
    create_import(df=df, meta_id=meta_id)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (g:Gene) ASSERT g.ensembl_id IS UNIQUE",
        "CREATE INDEX ON :Gene(name)",
        "CREATE INDEX ON :Gene(chr)",
    ]
    create_constraints(constraintCommands, meta_id)
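
Note that the constraint commands throughout these examples use the legacy Neo4j syntax (CREATE CONSTRAINT ON ... ASSERT, CREATE INDEX ON :Label(prop)), which was deprecated in Neo4j 4.x and removed in Neo4j 5. A sketch of an equivalent list for this Gene example in the newer FOR/REQUIRE syntax, with illustrative constraint and index names, would be:

constraintCommands = [
    "CREATE CONSTRAINT gene_ensembl_id IF NOT EXISTS FOR (g:Gene) REQUIRE g.ensembl_id IS UNIQUE",
    "CREATE INDEX gene_name IF NOT EXISTS FOR (g:Gene) ON (g.name)",
    "CREATE INDEX gene_chr IF NOT EXISTS FOR (g:Gene) ON (g.chr)",
]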
Example #2
def process():
    # select the file
    FILE = get_source(meta_id, 1)
    logger.info("Reading {}", FILE)
    df = pd.read_csv(os.path.join(dataDir, FILE))
    # logger.info(df.columns)
    logger.info(df.shape)

    # drop some columns
    df.drop(["access", "priority", "coverage", ""],
            axis=1,
            inplace=True,
            errors="ignore")
    logger.info(df.shape)

    # create the csv and import data
    create_import(df=df, meta_id=meta_id)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (g:Gwas) ASSERT g.id IS UNIQUE",
        "CREATE INDEX ON :Gwas(trait)",
        "CREATE INDEX ON :Gwas(filename)",
    ]
    create_constraints(constraintCommands, meta_id)
Example #3
def protein():
    FILE = get_source(meta_id, 1)
    data = os.path.join(dataDir, FILE)
    df = pd.read_csv(data, sep="\t")
    df.columns = ["uniprot_id"]
    # also expose the uniprot id as the name property
    df["name"] = df["uniprot_id"]
    create_import(df=df, meta_id=meta_id)

    constraintCommands = [
        "CREATE CONSTRAINT ON (p:Protein) ASSERT p.uniprot_id IS UNIQUE",
    ]
    create_constraints(constraintCommands, meta_id)
Example #4
def process():
    FILE = get_source(meta_id, 1)
    df = pd.read_csv(os.path.join(dataDir, FILE), low_memory=False)
    df = df[["rsid"]].drop_duplicates()
    # change column name to match schema
    df.rename(columns={"rsid": "name"}, inplace=True)

    create_import(df=df, meta_id=meta_id)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (v:Variant) ASSERT v.name IS UNIQUE;",
    ]
    create_constraints(constraintCommands, meta_id)
Example #5
from workflow.scripts.utils.writers import (
    create_constraints,
    create_import,
)

# setup
args, dataDir = setup()
meta_id = args.name

# args = the argparse arguments (name and data)
# dataDir = the path to the working directory for this node/rel
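
setup() and get_source() are not shown in any of these excerpts. Going only by the two comments above, a minimal sketch of what setup() plausibly does (argument names, layout and behaviour are assumptions, not the project's actual code):

import argparse
import os

def setup_sketch():
    # hypothetical stand-in for the project's setup() helper
    parser = argparse.ArgumentParser()
    parser.add_argument("--name", required=True)  # meta id of this node/rel
    parser.add_argument("--data", required=True)  # root data directory
    args = parser.parse_args()
    # assumed layout: one working directory per node/rel under the data root
    data_dir = os.path.join(args.data, args.name)
    return args, data_dir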

#######################################################################

FILE = get_source(meta_id, 1)

def process():
    # select the file
    logger.info("Reading {}", FILE)
    df = pd.read_csv(os.path.join(dataDir, FILE))
    # logger.info(df.columns)
    logger.info(df.shape)

    # drop some columns
    df.drop(["access", "priority", "coverage", "doi", "group_name",
             "imputation_panel", "ontology", "study_design", "covariates", ""],
            axis=1,
            inplace=True,
            errors="ignore")
    logger.info(df.shape)

    # create the csv and import data
    create_import(df=df, meta_id=meta_id)
Example #6
from workflow.scripts.utils.writers import (
    create_constraints,
    create_import,
)

# setup
args, dataDir = setup()
meta_id = args.name

# args = the argparse arguments (name and data)
# dataDir = the path to the working directory for this node/rel

#######################################################################

BIO_DATA = get_source(meta_id, 1)
BIO_SEM = get_source(meta_id, 2)


def merge_data(lit_data, sem_data):
    # load predicate data
    logger.info("loading data...")
    data_df = pd.read_csv(os.path.join(dataDir, lit_data),
                          sep=",",
                          compression="gzip")
    logger.info("\n{}", data_df)

    logger.info("loading semrep data...")
    sem_df = pd.read_csv(os.path.join(dataDir, sem_data),
                         sep=",",
                         compression="gzip")
Example #7
from workflow.scripts.utils.writers import (
    create_constraints,
    create_import,
)

# setup
args, dataDir = setup()
meta_id = args.name

# args = the argparse arguments (name and data)
# dataDir = the path to the working directory for this node/rel

#######################################################################

FILE1 = get_source(meta_id, 1)
FILE2 = get_source(meta_id, 2)


def process():
    df1 = pd.read_csv(os.path.join(dataDir, FILE1), sep=" ")
    # filter by score: keep only high-confidence links (combined_score >= 700 on the 0-1000 scale)
    df1 = df1[df1["combined_score"] >= 700]
    logger.info(df1.shape)
    logger.info("\n {}", df1.head())

    df2 = pd.read_csv(os.path.join(dataDir, FILE2), sep="\t")
    df2.columns = ["species", "uniprot", "protein", "x", "y"]
    df2["uniprot"] = df2["uniprot"].str.split("|", expand=True)[0]
    logger.info(df2.shape)
    logger.info("\n {}", df2.head())
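
The str.split("|", expand=True)[0] idiom keeps only the text before the first pipe, which reduces a combined accession/entry-name field to the bare UniProt accession. A small self-contained illustration (the values are just examples):

import pandas as pd

demo = pd.DataFrame({"uniprot": ["P04637|P53_HUMAN", "Q9Y6H5|UBQLN2_HUMAN"]})
demo["uniprot"] = demo["uniprot"].str.split("|", expand=True)[0]
print(demo["uniprot"].tolist())  # ['P04637', 'Q9Y6H5']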
Example #8
from workflow.scripts.utils.writers import (
    create_constraints,
    create_import,
)

# setup
args, dataDir = setup()
meta_id = args.name

# args = the argparse arguments (name and data)
# dataDir = the path to the working directory for this node/rel

#######################################################################

PREDICATION_FILE = get_source(meta_id, 1)


def process():
    # load predicate data
    logger.info("loading data...")
    df = pd.read_csv(os.path.join(dataDir, PREDICATION_FILE),
                     sep=",",
                     compression="gzip")
    logger.info(df.shape)

    # need to split subject and object ids by |
    df = (df.assign(subject_id=df.subject_id.str.split("|")).explode(
        "subject_id").reset_index(drop=True))
    logger.info(df.shape)
    df = (df.assign(object_id=df.object_id.str.split("|")).explode(
        "object_id").reset_index(drop=True))
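
The assign/explode pattern above gives each pipe-delimited id its own row while keeping the rest of the record. A standalone illustration with invented ids:

import pandas as pd

demo = pd.DataFrame({
    "subject_id": ["C0001|C0002", "C0003"],
    "predicate": ["TREATS", "CAUSES"],
})
demo = (demo.assign(subject_id=demo.subject_id.str.split("|")).explode(
    "subject_id").reset_index(drop=True))
print(demo)
#   subject_id predicate
# 0      C0001    TREATS
# 1      C0002    TREATS
# 2      C0003    CAUSES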
Example #9
from workflow.scripts.utils.writers import (
    create_constraints,
    create_import,
)

# setup
args, dataDir = setup()
meta_id = args.name

# args = the argparse arguments (name and data)
# dataDir = the path to the working directory for this node/rel

#######################################################################

vep_data = get_source(meta_id, 1)


def process_data():
    logger.info("Processing vep data {}", vep_data)
    col_names = [
        "source",
        "location",
        "allele",
        "target",
        "feature",
        "feature_type",
        "consequence",
        "cdna_position",
        "cds_position",
        "protein_position",
Example #10
from workflow.scripts.utils.writers import (
    create_constraints,
    create_import,
)

# setup
args, dataDir = setup()
meta_id = args.name

# args = the argparse arguments (name and data)
# dataDir = the path to the working directory for this node/rel

#######################################################################

MED_DATA = get_source(meta_id, 1)
MED_SEM = get_source(meta_id, 2)


def merge_data(lit_data, sem_data):
    # load predicate data
    logger.info("loading data...")
    data_df = pd.read_csv(os.path.join(dataDir, lit_data),
                          sep=",",
                          compression="gzip")
    logger.info("\n{}", data_df)

    logger.info("loading semrep data...")
    sem_df = pd.read_csv(os.path.join(dataDir, sem_data),
                         sep=",",
                         compression="gzip")
Example #11
from workflow.scripts.utils.writers import (
    create_constraints,
    create_import,
)

# setup
args, dataDir = setup()
meta_id = args.name

# args = the argparse arguments (name and data)
# dataDir = the path to the working directory for this node/rel

#######################################################################

SEM = get_source(meta_id, 1)


def make_id(row, sub_type):
    # use the <sub_type>_id when present, otherwise fall back to <sub_type>_gene_id
    id_val = row[sub_type + "_id"]
    if pd.isna(id_val):
        id_val = row[sub_type + "_gene_id"]
    return id_val
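
The excerpt does not show how make_id is applied; a typical row-wise use via DataFrame.apply, with assumed subject/object column names, would be:

# hypothetical usage inside process(), after sem_df has been loaded
sem_df["subject"] = sem_df.apply(make_id, axis=1, sub_type="subject")
sem_df["object"] = sem_df.apply(make_id, axis=1, sub_type="object")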


def process():
    logger.info("loading semrep data...{}", SEM)
    sem_df = pd.read_csv(os.path.join(dataDir, SEM),
                         sep=",",
                         compression="gzip")