Example #1
def df_check(df=[], meta_id=""):
    source_data = get_meta_data(meta_id=meta_id)
    meta_name = source_data["name"]
    meta_type = source_data["d_type"]
    schema_data = get_schema_data(meta_name=meta_name)
    # check index column is unique
    if "index" in schema_data:
        index_property = schema_data["index"]
        dup_check(df, index_property)

    outDir = make_outDir(meta_id)
    # map each column to its pandas dtype name and compare the result to the schema
    df_types = df.dtypes.apply(lambda x: x.name).to_dict()
    header = compare_df_to_schema(df_types, schema_data, meta_type)
    return header
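compare_df_to_schema is not shown in these examples; the core of the check, mapping each dataframe column to its pandas dtype name and matching it against the schema properties, can be sketched on its own (the schema layout below is a guess used only for illustration):

import pandas as pd

df = pd.DataFrame({"id": ["1", "2"], "value": [0.5, 1.5]})
schema_data = {"properties": {"id": {"type": "string"}, "value": {"type": "float"}}}

# map each dataframe column to its pandas dtype name, as df_check does
df_types = df.dtypes.apply(lambda x: x.name).to_dict()
# keep only the columns the schema knows about
matched = [col for col in df_types if col in schema_data["properties"]]
print(df_types)  # {'id': 'object', 'value': 'float64'}
print(matched)   # ['id', 'value']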
Example #2
def single_source(meta_id=""):
    meta_data = get_meta_data(meta_id=meta_id)
    out_dir = make_outDir(meta_id=meta_id)
    csv_file = os.path.join(out_dir, meta_id + ".csv.gz")
    csv_header = os.path.join(out_dir, meta_id + ".header")

    # create symlinks for import statements
    source_file = os.path.join(out_dir, meta_id + "-import-nodes.txt")
    target_file = os.path.join(merge_dir,
                               meta_data["name"] + "-import-nodes.txt")
    create_sym_link(source=source_file, target=target_file)

    # create symlinks for constraint statements
    source_file = os.path.join(out_dir, meta_id + "-constraint.txt")
    target_file = os.path.join(merge_dir,
                               meta_data["name"] + "-constraint.txt")
    if os.path.exists(source_file):
        create_sym_link(source=source_file, target=target_file)
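The create_sym_link helper itself is not shown in these examples; one possible shape for it, assuming "target" is the link that gets created in the merge directory and "source" is the file it points at, is:

import os

def create_sym_link(source="", target=""):
    # drop a stale link so the call is idempotent
    if os.path.islink(target):
        os.remove(target)
    # os.symlink(src, dst): dst becomes a link pointing at src
    os.symlink(source, target)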
Example #3
def get_source_data(dname="all"):
    logger.info('Running merge with "{}" data types', dname)
    source_data = get_meta_data(meta_id="all")
    node_d = find_multiple(source_data, "nodes")

    if dname != "all":
        node_d = {dname: node_d[dname]}
    logger.debug(node_d)
    for i in node_d:
        # check if already done
        f = os.path.join(merge_dir, i + ".csv.gz")
        logger.debug("Checking if already done {}", f)
        if os.path.exists(f):
            logger.info("Already processed {}", i)
        else:
            logger.info("Processing node: {} ...", i)
            if len(node_d[i]) > 1:
                df_merged = merge_source(node_d[i])
                write_new_merged_files(df_merged, i)
            else:
                single_source(node_d[i][0])
            logger.info("Processed node: {}", i)
Example #4
import os

from workflow.scripts.utils import settings
from workflow.scripts.utils.general import get_meta_data

env_configs = settings.env_configs

graph_bolt_port = env_configs["graph_bolt"]
graph_user = env_configs["graph_user"]
graph_password = env_configs["graph_pass"]
neo4j_import_dir = env_configs["neo4j_import_dir"]

constraints = []
import_nodes = []
import_rels = []

source_data = get_meta_data(meta_id="all")

# loop through nodes merged directory
d = os.path.join(neo4j_import_dir, "nodes", "merged")
for filename in os.listdir(d):
    if filename.endswith("constraint.txt"):
        with open(os.path.join(d, filename)) as f:
            for line in f:
                if not line.startswith("#"):
                    constraints.append("echo '" + line.rstrip() + "'")
                    constraints.append(
                        "cypher-shell -a bolt://localhost:"
                        + graph_bolt_port
                        + " -u "
                        + graph_user
                        + " -p "
Example #5
def merge_source(meta_ids=[]):
    logger.debug("multi source {}", meta_ids)
    data_frames = []
    index_col = ""
    for i in meta_ids:
        logger.info("Processing meta_id: {}", i)
        meta_data = get_meta_data(i)
        schema_data = get_schema_data(meta_data["name"])
        logger.debug(schema_data)
        out_dir = make_outDir(meta_id=i)
        if args.nrows is not None:
            args.nrows = int(args.nrows)
        df = create_df(data_dir=out_dir, name=i, nrows=args.nrows)
        # make index column a string to avoid merge issues, e.g. float and object
        index_col = f"{schema_data['index']}:ID({meta_data['name']}-ID)"
        # logger.debug('index_col {}',index_col)
        # don't need to fix int/float issues anymore as reading everything in as strings
        # df = column_zero_fix(df)
        logger.debug("\n{}", df.head())
        logger.debug("\n{}", df.dtypes)
        data_frames.append(df)

        # get the constraints (not sure how to deal with multiple constraint files, assume they are the same...?)
        source_file = os.path.join(out_dir, i + "-constraint.txt")
        target_file = os.path.join(merge_dir,
                                   meta_data["name"] + "-constraint.txt")
        if os.path.exists(source_file):
            create_sym_link(source=source_file, target=target_file)

    logger.debug("index column: {}", index_col)
    # merge the dataframes on index
    logger.info("Merging {}", meta_ids)
    df_merged = reduce(
        lambda left, right: pd.merge(left, right, on=[index_col], how="outer"),
        data_frames,
    ).fillna("")
    logger.debug("\n{}", df_merged.head())

    # find duplicate source columns and aggregate
    source_cols = df_merged.filter(regex="^_source.*", axis=1)
    logger.info("Aggregating source columns {}", source_cols.columns)
    # aggregate into neo4j array style (separated by ;)
    source_agg = source_cols.agg(lambda x: ";".join(y for y in x if y != ""),
                                 axis=1)
    logger.debug("\n{}", source_agg.value_counts())
    # drop the merge source columns
    drop_cols = list(df_merged.filter(regex="^_source.*"))
    logger.debug("dropping cols {}", drop_cols)
    df_merged.drop(drop_cols, inplace=True, axis=1)
    # df_merged = df_merged[df_merged.columns.drop(drop_cols)]
    df_merged["_source:string[]"] = source_agg

    # check for column conflicts, e.g. b_x and b_y
    logger.info("Running conflict check with {} threads", THREADS)
    df_merged = column_conflict_check(df_merged)

    logger.debug("\n{}", df_merged.head())

    # issue with merging adding .0 to integers
    df_merged = column_zero_fix(df_merged)

    # convert entire df to strings as don't need integers for neo4j import
    df_merged = df_merged.applymap(str)

    # need to convert nan to empty string
    df_merged = df_merged.replace("nan", "")
    df_merged = df_merged.replace("None", "")
    # logger.debug("\n{}",df_merged)

    return df_merged
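The reduce / outer-merge / source-aggregation steps above can be seen in isolation with two toy frames (column names here are illustrative, not taken from any real schema):

import pandas as pd
from functools import reduce

index_col = "id:ID(example-ID)"
df_a = pd.DataFrame({index_col: ["a", "b"], "_source_x": ["s1", "s1"]})
df_b = pd.DataFrame({index_col: ["b", "c"], "_source_y": ["s2", "s2"]})

# outer-merge all frames on the shared index column, blank out missing values
df_merged = reduce(
    lambda left, right: pd.merge(left, right, on=[index_col], how="outer"),
    [df_a, df_b],
).fillna("")

# collapse the per-source columns into one ';'-separated neo4j array column
source_cols = df_merged.filter(regex="^_source.*", axis=1)
source_agg = source_cols.agg(lambda x: ";".join(y for y in x if y != ""), axis=1)
df_merged = df_merged.drop(columns=list(source_cols))
df_merged["_source:string[]"] = source_agg
print(df_merged)  # rows: a -> "s1", b -> "s1;s2", c -> "s2"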
Example #6
def create_import(df=[], meta_id="", import_type="import"):
    # qc the df
    schema_cols = df_check(df, meta_id)
    logger.info("Matched these columns {}", schema_cols)

    # add source column to node headers and df if node
    # meta_data = get_meta_data(meta_id)
    # if meta_data["d_type"] == "nodes":
    #    schema_cols.append("source:string[]")
    #    df["source:string[]"] = meta_data["source"]

    # add source info to nodes and rels
    meta_data = get_meta_data(meta_id)
    schema_cols.append("_source:string[]")
    df["_source:string[]"] = meta_data["source"]

    # add meta cols _name and _id to nodes
    if meta_data["d_type"] == "nodes":
        source_data = get_meta_data(meta_id=meta_id)
        meta_name = source_data["name"]
        schema_data = get_schema_data(meta_name=meta_name)
        logger.debug(schema_data)

        node_meta = node_meta_check(schema_data)
        # get type for _name and _id col
        name_col_type = schema_data["properties"][node_meta["_name"]]["type"]
        name_col_text = f"_name:{name_col_type}"
        id_col_type = schema_data["properties"][node_meta["_id"]]["type"]
        id_col_text = f"_id:{id_col_type}"

        # add to schema cols
        schema_cols.extend([name_col_text, id_col_text])

        # add to dataframe
        df[name_col_text] = df[node_meta["_name"]]
        df[id_col_text] = df[node_meta["_id"]]
        logger.debug("\n{}", df.head())

        # add indexes for meta properties
        constraintCommands = [
            f"CREATE index on :{meta_name}(_name);",
            f"CREATE index on :{meta_name}(_id);",
        ]
        create_constraints(constraintCommands, meta_id)

    # create copy of header for import creation
    logger.info("Creating import statement")
    import_header = schema_cols.copy()
    create_import_commands(header=import_header,
                           meta_id=meta_id,
                           import_type=import_type)

    outDir = make_outDir(meta_id)
    # logger.debug(outDir)
    file_name = os.path.join(outDir, meta_id + ".csv.gz")
    df.to_csv(file_name,
              index=False,
              header=False,
              compression="gzip",
              columns=schema_cols)

    # run pandas profiling
    com = f"sh workflow/scripts/utils/pandas-profiling.sh {outDir} {meta_id} {THREADS}"
    logger.debug(com)
    try:
        out = subprocess.check_output(com, shell=True)
        logger.info(out)
    except subprocess.CalledProcessError:
        logger.error(
            "Pandas profiling didn't work, perhaps you haven't installed shuf, see README.md?"
        )
        exit()

    # backup
    backup_processed_data(outDir, meta_id, meta_data["d_type"])
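The to_csv call above writes data rows only (no header line), gzip-compressed, with the column order pinned to the matched schema columns so it lines up with the separate .header file. A tiny illustration with made-up columns:

import pandas as pd

df = pd.DataFrame({"id": ["1", "2"], "name": ["a", "b"], "extra": ["x", "y"]})
schema_cols = ["id", "name"]  # columns matched against the schema (illustrative)
# data rows only, gzip-compressed, column order fixed by schema_cols
df.to_csv("example.csv.gz", index=False, header=False,
          compression="gzip", columns=schema_cols)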
Example #7
def create_import_commands(header, meta_id, import_type):
    outDir = make_outDir(meta_id)
    meta_data = get_meta_data(meta_id=meta_id)
    meta_name = meta_data["name"]
    meta_type = meta_data["d_type"]
    schema_data = get_schema_data(meta_name=meta_name)
    # logger.debug(schema_data)

    if meta_type == "nodes":
        # convert node ID property to neo4j style
        if "index" in schema_data:
            index_property = schema_data["index"]
            li = header.index(index_property)
            logger.info("Index = {} {}", index_property, li)
            header[li] = index_property + ":ID(" + meta_name + "-ID)"
            logger.info(header)
        else:
            logger.error("Schema has no index, exiting")
            exit()
        # add meta _name and _id
        node_meta = node_meta_check(schema_data)
        # header.extend(['_name','_id'])

    elif meta_type == "rels":
        # convert relationships source/target properties to neo4j START END style
        source_index = header.index("source")
        source_id = schema_data["properties"]["source"]["type"]
        target_index = header.index("target")
        target_id = schema_data["properties"]["target"]["type"]
        header[source_index] = ":START_ID(" + source_id + "-ID)"
        header[target_index] = ":END_ID(" + target_id + "-ID)"

    # add property types
    for i, item in enumerate(header):
        if item in schema_data["properties"]:
            property_type = schema_data["properties"][item]["type"]
            # deal with arrays
            if property_type == "array":
                items_type = schema_data["properties"][item]["items"]["type"]
                property_type = f"{items_type}[]"
            elif property_type == "integer":
                property_type = "int"
            header[i] = item + ":" + property_type

    write_header(
        dir=outDir,
        headerData={
            "fileName": meta_id + ".header",
            "data": ",".join(header),
        },
    )
    # don't create import statements for load csv data
    if import_type != "load":
        write_import(
            id=meta_id,
            dir=outDir,
            importCommands=[{
                "type": meta_type,
                "name": meta_name,
                "file": os.path.join("import", meta_type, meta_id,
                                     meta_id + ".csv.gz"),
                "header": os.path.join("import", meta_type, meta_id,
                                       meta_id + ".header"),
            }],
        )