Example 1
def resolve_node_entities(graph_specification,
                          spark_config,
                          entity_maps,
                          input_node_path,
                          output_node_path,
                          output_node_id=None,
                          data_format='parquet'):
    """Runner for resolve node entities task that sets up the SparkContext.

    :param graph_specification: Graph specification.
    :type graph_specification: fncore.utils.graph_specification.GraphSpec
    :param spark_config: Spark config.
    :type spark_config: fncore.utils.spark_tools.SparkConfFactory
    :param entity_maps: Canonical id mapping.
    :type entity_maps: dict
    :param input_node_path: Path to input node files for this graph.
    :type input_node_path: str
    :param output_node_path: Path to output node files for this graph.
    :type output_node_path: str
    :param output_node_id: Output column name. If it is set to `None`, the
        output will replace the mapped column.
    :type output_node_id: str
    :param data_format: Format to read and write files for this graph.
    :type data_format: str

    """
    with get_spark_context(spark_config.create()) as spark:
        _resolve_node_entities(graph_specification=graph_specification,
                               spark_context=spark,
                               entity_maps=entity_maps,
                               input_node_path=input_node_path,
                               output_node_path=output_node_path,
                               output_node_id=output_node_id,
                               data_format=data_format)
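A minimal usage sketch, mirroring the call made in the pipeline test further below; the paths are placeholders and the local-mode Spark settings are illustrative assumptions:

# Hypothetical invocation of resolve_node_entities; paths are placeholders.
from fncore.utils.spark_tools import SparkConfFactory

resolve_node_entities(
    graph_specification=graph_spec,                # a GraphSpec instance
    spark_config=(SparkConfFactory()
                  .set_master('local[*]')
                  .set_app_name('resolve node entities')
                  .set('spark.executor.memory', '1g')),
    entity_maps=dict(),                            # empty map: identity mapping
    input_node_path='/graph/node_list',            # placeholder path
    output_node_path='/graph/node_list_resolved',  # placeholder path
    output_node_id='_canonical_id',
    data_format='parquet',
)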
Example 2
def build_edge_lists(graph_specification,
                     spark_config,
                     tables_path,
                     edge_path,
                     data_format='parquet'):
    """Runner for build edge lists task that sets up the SparkContext.

    :param graph_specification: Graph specification.
    :type graph_specification: fncore.utils.graph_specification.GraphSpec
    :param spark_config: Spark config.
    :type spark_config: fncore.utils.spark_tools.SparkConfFactory
    :param tables_path: Path to get table files for this graph.
    :type tables_path: str
    :param edge_path: Path to output edge files for this graph.
    :type edge_path: str
    :param data_format: Format to read and write files for this graph.
    :type data_format: str

    """
    with get_spark_context(spark_config.create()) as spark:
        _build_edge_lists(graph_specification=graph_specification,
                          spark_context=spark,
                          tables_path=tables_path,
                          edge_path=edge_path,
                          data_format=data_format)
Example 3
def import_jdbc_table(spark_config,
                      uri,
                      input_table,
                      input_cols,
                      output_table,
                      output_cols,
                      driver,
                      data_format,
                      debug=False):
    """
    Reads a table from Microsoft SQL Server over JDBC and writes the
    selected columns out in the given data format.

    :param spark_config: Spark config.
    :type spark_config: fncore.utils.spark_tools.SparkConfFactory
    :param uri: database connection string
    :type uri: str
    :param input_table: table to import
    :type input_table: str
    :param input_cols: list of columns to import
    :type input_cols: List[str]
    :param output_table: path to save the imported table
    :type output_table: str
    :param output_cols: column names for saving the imported table
    :type output_cols: List[str]
    :param driver: jdbc driver to use
    :type driver: str
    :param data_format: Format to read and write files for this graph.
    :type data_format: str
    :param debug: flag to sample a small fraction of the data
    :type debug: bool
    """
    with get_spark_context(spark_config.create()) as spark_context:
        sql_context = SQLContext(spark_context)

        # Make use of pushdown optimization to read only columns needed
        # https://docs.databricks.com/spark/latest/data-sources/sql-databases.html
        df_table = (sql_context.read.jdbc(
            url=uri, table=input_table, properties={
                'driver': driver
            }).select(
                [col(c).alias(sc) for c, sc in zip(input_cols, output_cols)]))

        if debug:
            df_table = df_table.sample(False, 0.025)

        (df_table.write.format(data_format).mode(
            saveMode='overwrite').save(output_table))
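A hypothetical invocation, assuming the standard Microsoft JDBC driver class; the host, table, column and path names are placeholders:

# Hypothetical call to import_jdbc_table; connection details and names
# are placeholders, the driver class is the standard Microsoft one.
from fncore.utils.spark_tools import SparkConfFactory

import_jdbc_table(
    spark_config=(SparkConfFactory()
                  .set_master('local[*]')
                  .set_app_name('import jdbc table')),
    uri='jdbc:sqlserver://dbhost:1433;databaseName=finance',
    input_table='dbo.vouchers',
    input_cols=['VoucherID', 'VendorID', 'Amount'],
    output_table='/graph/tables/vouchers',
    output_cols=['voucher_id', 'vendor_id', 'amount'],
    driver='com.microsoft.sqlserver.jdbc.SQLServerDriver',
    data_format='parquet',
    debug=True,  # sample roughly 2.5% of the rows, per the function body
)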
Example 4
def grey_list_results_writer(offset_day, grey_list_results_dir, jdbc_url,
                             driver, grey_list_table_name, spark_config,
                             execution_date, **kwargs):
    """
    Entry point of the airflow task: grey_list_results_writer. Reads the grey
    list results for the (offset) execution date from HDFS and appends them to
    the grey list table in the SQL database via JDBC. kwargs contains
    additional information that airflow passes to the function.

    :param offset_day: number of days offset from the execution date
    :type offset_day: int
    :param grey_list_results_dir: path to the grey list results for the
        execution date
    :type grey_list_results_dir: str
    :param jdbc_url: url to the microsoft sql server
    :type jdbc_url: str
    :param driver: jdbc driver to use
    :type driver: str
    :param grey_list_table_name: name of table to save the results to
    :type grey_list_table_name: str
    :param spark_config: configurations for spark context
    :type spark_config: fncore.utils.spark_tools.SparkConfFactory
    :param execution_date: date of the vouchers for checking
    :type execution_date: datetime.datetime
    :return: None
    :rtype: None
    """
    execution_date = execution_date - datetime.timedelta(days=offset_day)
    with get_spark_context(spark_config.create()) as spark_context:
        sql_context = SQLContext(spark_context)

        grey_list_results_filename = ("grey_list_results_{}.parquet".format(
            execution_date.strftime('%Y_%m_%d')))
        grey_list_results_path = os.path.join(grey_list_results_dir,
                                              grey_list_results_filename)

        if check_file_exists_hdfs(grey_list_results_path):
            grey_list_results = sql_context.read.parquet(
                grey_list_results_path).cache()
            # Force materialization of the cached results before the JDBC write
            grey_list_results.count()
            grey_list_results.write.mode(saveMode='append').jdbc(
                url=jdbc_url,
                table=grey_list_table_name,
                properties={'driver': driver})
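A sketch of how this entry point could be wired into a DAG, assuming Airflow 1.x (where provide_context=True supplies execution_date and the extra kwargs); the dag object, paths and connection values are placeholders:

# Hypothetical Airflow 1.x task definition; dag, paths and connection
# values are placeholders.
from airflow.operators.python_operator import PythonOperator
from fncore.utils.spark_tools import SparkConfFactory

grey_list_writer_task = PythonOperator(
    task_id='grey_list_results_writer',
    python_callable=grey_list_results_writer,
    op_kwargs={
        'offset_day': 1,
        'grey_list_results_dir': '/results/grey_list',
        'jdbc_url': 'jdbc:sqlserver://dbhost:1433;databaseName=finance',
        'driver': 'com.microsoft.sqlserver.jdbc.SQLServerDriver',
        'grey_list_table_name': 'grey_list_results',
        'spark_config': SparkConfFactory().set_master('yarn'),
    },
    provide_context=True,  # passes execution_date and other context as kwargs
    dag=dag,
)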
Example 5
def write_neo4j_edges(graph_specification, spark_config):
    """
    Given the graph specification and the spark configuration, insert the
    edges in the data (specified in the graph specification) into the Neo4j
    database, whose connection details are taken from the graph
    specification. This is used as an airflow task.

    :param graph_specification: graph specification in dictionary format
    :type graph_specification: dict
    :param spark_config: Spark config.
    :type spark_config: fncore.utils.spark_tools.SparkConfFactory
    :return: Does not return anything
    :rtype: None
    """
    # pylint: disable=too-many-locals
    data_format, graph_data_path = get_hdfs_info(graph_specification)

    # Use graph specification's neo4j connection
    neo_config = {
        'uri': graph_specification['graph_uri'],
        'max_retries': 5,
        'max_batchsize': 20000
    }

    with get_spark_context(spark_config.create()) as spark_ctx:
        sql_context = SQLContext(spark_ctx)

        # create and save edge lists
        edge_list = graph_specification.get('edge_lists')
        count = 0
        if edge_list:
            for edge_kind in edge_list:
                count += 1
                logging.info("# " +
                             str(count) +
                             "/" +
                             str(len(edge_list)) +
                             ": " +
                             edge_kind['safe_name'])

                # Load in the edge list with duplicates dropped
                data = sql_context\
                    .read.format(data_format)\
                    .load(os.path.join(graph_data_path['edge_list_resolved'],
                                       edge_kind['safe_name']))

                # Get the friendly name mapping
                mapping = get_friendly_name_mapping(edge_kind)

                # Get the hidden fields
                hiddenfields = get_fields_with_property(
                    edge_kind, prop='hidden')

                # Drops duplicates
                keyname = None
                keylist = ['_canonical_id_source', '_canonical_id_target']
                if 'index_column' in edge_kind:
                    keyname = edge_kind['index_column']\
                        .get('safe_name', None)
                    if keyname:
                        keylist.append(keyname)
                data = data.dropDuplicates(keylist)\
                           .dropna(how='any', subset=keylist)

                data = data.repartition(1000)
                logging.info("Count: " + str(data.count()))

                # Insert the edges into the Neo4j database
                tags = edge_kind['tags'] if 'tags' in edge_kind else 'related'
                data.foreachPartition(
                    lambda x, t=tags, key=keyname, m=mapping, h=hiddenfields, n=keylist:
                    push_edges(neo_config, t, key, x, m, h, n)
                )
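The foreachPartition lambda above binds tags, keyname, mapping, hiddenfields and keylist through default arguments, the usual way to freeze the current loop values in a closure rather than rely on Python's late binding. A small, self-contained illustration of the difference:

# Late-binding closures vs. default-argument binding in plain Python.
late_bound = [lambda: tag for tag in ['A', 'B', 'C']]
frozen = [lambda t=tag: t for tag in ['A', 'B', 'C']]

print([f() for f in late_bound])  # ['C', 'C', 'C'] -- every lambda sees the last value
print([f() for f in frozen])      # ['A', 'B', 'C'] -- each value frozen at definition time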
Example 6
def write_neo4j_nodes(graph_specification, spark_config):
    """
    Given the graph specification and the spark configuration, insert the
    nodes in the data (specified in the graph specification) into the Neo4j
    database, whose connection details are taken from the graph
    specification. This is used as an airflow task.

    :param graph_specification: graph specification in dictionary format
    :type graph_specification: dict
    :param spark_config: Spark config.
    :type spark_config: fncore.utils.spark_tools.SparkConfFactory
    :return: Does not return anything
    :rtype: None
    """
    # pylint: disable=too-many-locals
    data_format, graph_data_path = get_hdfs_info(graph_specification)

    # Use graph specification's neo4j connection
    neo_config = {
        'uri': graph_specification['graph_uri'],
        'max_retries': 5,
        'max_batchsize': 20000
    }

    with get_spark_context(spark_config.create()) as spark_ctx:
        sql_context = SQLContext(spark_ctx)

        # create and save node lists
        node_list = graph_specification.get('node_lists')
        count = 0
        if node_list:
            for node_kind in node_list:
                count += 1
                logging.info(
                    "%d/%d: %s",
                    count,
                    len(node_list),
                    node_kind['safe_name']
                )

                # Load in the node list with duplicates dropped
                # and invalid entries
                data = sql_context\
                    .read.format(data_format)\
                    .option('header', 'true')\
                    .option('inferschema', 'true')\
                    .load(os.path.join(graph_data_path['node_list_resolved'],
                                       node_kind['safe_name']))\
                    .dropna(how='any', subset=['_canonical_id'])

                # Get the friendly name mapping
                mapping = get_friendly_name_mapping(node_kind)

                # Get the hidden fields
                hiddenfields = get_fields_with_property(
                    node_kind, prop='hidden')

                # Get the labelled fields
                labelledfields = get_fields_with_property(
                    node_kind, prop='use_as_label')
                indexfields = ['_label' if k == 0 else '_label_' + str(k)
                               for k in range(len(labelledfields))]

                labelledfields.append(
                    node_kind['index_column'].get('safe_name'))
                indexfields.append('_node_id')

                # Drop invalid data in the fields that need to be indexed
                data = data.dropna(how='any', subset=labelledfields)

                # Ignore node id and label fields
                noappendfields = indexfields + [labelledfields[-1]]

                # Update the data frame to have the labels
                for oldfield, newfield in zip(labelledfields, indexfields):
                    data = data.withColumn(newfield, upper(data[oldfield]))

                # Setup the node constraints and indices on the labels
                tags = node_kind['tags'] + ['_searchable']
                with get_neo4j_context(neo_config['uri']) as neo_ctx:
                    for tag in tags:
                        create_uniqueness_constraint(neo_ctx, tag, '_canonical_id')
                    already_indexed = get_indexes(neo_ctx, '_searchable')
                    for curindex in indexfields:
                        if curindex not in already_indexed:
                            create_index(neo_ctx, '_searchable', curindex)

                data.foreachPartition(
                    lambda x, t=tags, m=mapping, h=hiddenfields, n=noappendfields:
                    push_nodes(neo_config, t, x, m, h, n)
                )
Example 7
def get_transformed_edges(graph_specification,
                          spark_config,
                          input_edge_path,
                          input_source_col,
                          input_target_col,
                          output_source_col,
                          output_target_col,
                          output_tag_col,
                          data_format='parquet',
                          array_delimiter=';',
                          max_result_size=1e9):
    """
    A generator that yields pandas data frames of each processed edge list
    in the graph specification.

    :param graph_specification: Graph specification.
    :type graph_specification: fncore.utils.graph_specification.GraphSpec
    :param spark_config: Spark config.
    :type spark_config: fncore.utils.spark_tools.SparkConfFactory
    :param input_edge_path: Path to input edge files for this graph.
    :type input_edge_path: str
    :param input_source_col: Name of the source id column in the input edges.
    :type input_source_col: str
    :param input_target_col: Name of the target id column in the input edges.
    :type input_target_col: str
    :param output_source_col: Column name to use for source id.
    :type output_source_col: str
    :param output_target_col: Column name to use for target id.
    :type output_target_col: str
    :param output_tag_col: Column name to use for node tag.
    :type output_tag_col: str
    :param data_format: Format to read and write files for this graph.
    :type data_format: str
    :param array_delimiter: Delimiter used to separate items in array
    :type array_delimiter: str
    :param max_result_size: Maximum result size that the spark driver accepts
    :type max_result_size: int
    """

    for edge_kind in graph_specification.edge_lists:
        with get_spark_context(spark_config.create()) as spark_context:
            sql_context = SQLContext(spark_context)

            data = (sql_context.read.format(data_format).option(
                'header', 'true').option('inferschema', 'true').load(
                    os.path.join(input_edge_path, edge_kind.safe_name)))

            edge_kind_columns = (
                edge_kind.metadata_columns + [edge_kind.source_column] +
                [edge_kind.target_column] +
                ([edge_kind.index_column] if edge_kind.index_column else []) +
                ([edge_kind.weight_column] if edge_kind.weight_column else []))

            transformed = data

            # Drops duplicates (if index column does not exist)
            # TODO: Support multi field index in the future
            if not edge_kind.index_column:
                dedup_columns = ([edge_kind.source_column.safe_name] +
                                 [edge_kind.target_column.safe_name])
                transformed = transformed.dropDuplicates(subset=dedup_columns)

            for column in edge_kind_columns:
                transformed = transformed.withColumnRenamed(
                    column.safe_name, column.friendly_name or column.name)

            edge_tags = array_delimiter.join(edge_kind.tags)

            transformed = (transformed.withColumn(
                output_source_col,
                trim(transformed[input_source_col])).withColumn(
                    output_target_col,
                    trim(transformed[input_target_col])).withColumn(
                        output_tag_col, lit(edge_tags)))

            transformed = (transformed.dropna(
                how='any',
                subset=[output_source_col, output_target_col
                        ]).filter(transformed[output_source_col] != '').filter(
                            transformed[output_target_col] != ''))

            for dataframe in to_pandas_iterator(
                    transformed, max_result_size=max_result_size):
                yield dataframe
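A hypothetical way to consume the generator, streaming each pandas batch into a single CSV file; the paths and output column names are placeholders, while the input column names follow the resolved edge lists used elsewhere in the pipeline:

# Hypothetical consumer of get_transformed_edges; paths and output column
# names are placeholders.
from fncore.utils.spark_tools import SparkConfFactory

with open('/tmp/edges_export.csv', 'w') as sink:
    for i, batch in enumerate(get_transformed_edges(
            graph_specification=graph_spec,
            spark_config=(SparkConfFactory()
                          .set_master('local[*]')
                          .set_app_name('export edges')),
            input_edge_path='/graph/edge_list_resolved',
            input_source_col='_canonical_id_source',
            input_target_col='_canonical_id_target',
            output_source_col='source',
            output_target_col='target',
            output_tag_col='tags')):
        # Write the header only for the first batch.
        batch.to_csv(sink, index=False, header=(i == 0))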
Example 8
def get_combined_nodes(graph_specification,
                       spark_config,
                       input_node_path,
                       output_node_id_col,
                       output_label_col,
                       output_tag_col,
                       common_tags,
                       data_format='parquet',
                       array_delimiter=';',
                       max_result_size=1e9):
    """
    A generator that yields pandas data frames of the combined nodes in batches

    :param graph_specification: Graph specification.
    :type graph_specification: fncore.utils.graph_specification.GraphSpec
    :param spark_config: Spark config.
    :type spark_config: fncore.utils.spark_tools.SparkConfFactory
    :param input_node_path: Path to input node files for this graph.
    :type input_node_path: str
    :param output_node_id_col: Column name to use for node id.
    :type output_node_id_col: str
    :param output_label_col: Column name to use for node label.
    :type output_label_col: str
    :param output_tag_col: Column name to use for node tag.
    :type output_tag_col: str
    :param common_tags: Common tags to append to all the nodes
    :type common_tags: array of str
    :param data_format: Format to read and write files for this graph.
    :type data_format: str
    :param array_delimiter: Delimiter used to separate items in array
    :type array_delimiter: str
    :param max_result_size: Maximum result size that the spark driver accepts
    :type max_result_size: int
    """

    with get_spark_context(spark_config.create()) as spark_context:
        transformed_node_lists = load_transform_nodes(
            graph_specification=graph_specification,
            spark_context=spark_context,
            input_node_path=input_node_path,
            output_node_id_col=output_node_id_col,
            output_label_col=output_label_col,
            output_tag_col=output_tag_col,
            data_format=data_format,
            array_delimiter=array_delimiter)

        nodes_formatted = (
            union_all(transformed_node_lists).groupby(output_node_id_col).agg(
                collect_set(output_label_col).alias(output_label_col),
                collect_set(output_tag_col).alias(output_tag_col)).withColumn(
                    output_tag_col,
                    prepend(common_tags)(output_tag_col)).withColumn(
                        output_label_col,
                        array_to_str(array_delimiter)(
                            output_label_col)).withColumn(
                                output_tag_col,
                                array_to_str(array_delimiter)(
                                    output_tag_col)).repartition(1000).cache())

        # Drop nodes with empty id (required field)
        nodes_dropped = (nodes_formatted.filter(
            nodes_formatted[output_node_id_col] != '').dropna(
                how='any', subset=[output_node_id_col]))

        # Return the dataframe in batches
        for dataframe in to_pandas_iterator(nodes_dropped,
                                            max_result_size=max_result_size):
            yield dataframe
Example 9
def coi_detection(offset_day, graph_spec, node_resolved_dir, edge_resolved_dir,
                  results_dir, data_format, spark_config, max_path_len,
                  execution_date, **kwargs):
    """
    Entry point of the airflow task: coi_detection.
    kwargs contains additional information that airflow passes to the function.

    :param offset_day: number of days offset from the execution date
    :type offset_day: int
    :param graph_spec: the graph specification
    :type graph_spec: fncore.utils.graph_specification.GraphSpec
    :param node_resolved_dir: path to the resolved node lists
    :type node_resolved_dir: str
    :param edge_resolved_dir: path to the resolved edge lists
    :type edge_resolved_dir: str
    :param results_dir: path to write the final results
    :type results_dir: str
    :param data_format: data format used in the pipeline (set to `parquet`)
    :type data_format: str
    :param spark_config: configurations for spark context
    :type spark_config: fncore.utils.spark_tools.SparkConfFactory
    :param max_path_len: maximum length of path for bfs
    :type max_path_len: int
    :param execution_date: date of the vouchers for checking
    :type execution_date: datetime.datetime
    :return: None
    :rtype: None
    """
    execution_date = execution_date - datetime.timedelta(days=offset_day)
    transactions_date = str(execution_date.date())

    with get_spark_context(spark_config.create()) as spark_context:
        sql_context = SQLContext(spark_context)

        tables = load_node_edge_lists(sql_context, graph_spec,
                                      node_resolved_dir, edge_resolved_dir,
                                      data_format)

        # 1. Get relevant edges
        edges = load_agd_coi_edges(tables, graph_spec, transactions_date)
        edges = edges.cache()

        # 2. Get division vendor transactions for a given `transactions_date`
        division_vendor_transactions = division_vendor_transactions_table(
            tables, transactions_date, graph_spec)

        src_dst_df = (division_vendor_transactions.select(
            col('division').alias('start'),
            col('vendor').alias('end')))
        src_dst_df = src_dst_df.dropna().distinct().cache()

        # 3. Properly format the results from breadth first search
        # Only performs breadth first search if src_dst_df is non-empty
        if src_dst_df.count() > 0:
            list_coi_results = list()
            for coi_result in bi_direction_bfs_fixed(
                    edges, src_dst_df, max_path_len=max_path_len):
                list_coi_results.append(
                    _process_coi_viz(coi_result, transactions_date,
                                     division_vendor_transactions))

            if list_coi_results:
                coi_results = functools.reduce(DataFrame.unionAll,
                                               list_coi_results)

                # 4. Write the results to parquet
                results_filename = ("coi_results_{}.parquet".format(
                    execution_date.strftime('%Y_%m_%d')))
                results_path = os.path.join(results_dir, results_filename)
                coi_results.write.parquet(results_path, mode='overwrite')
Example 10
def grey_list_detection(offset_day, graph_spec, node_resolved_dir,
                        edge_resolved_dir, results_dir, data_format,
                        spark_config, max_path_len, execution_date, **kwargs):
    """
    Entry point of the airflow task: grey_list_detection.
    kwargs contains additional information that airflow passes in.

    :param offset_day: number of days offset from the execution date
    :type offset_day: int
    :param graph_spec: the graph specification
    :type graph_spec: fncore.utils.graph_specification.GraphSpec
    :param node_resolved_dir: path to the resolved node lists
    :type node_resolved_dir: str
    :param edge_resolved_dir: path to the resolved edge lists
    :type edge_resolved_dir: str
    :param results_dir: path to write the final results
    :type results_dir: str
    :param data_format: data format used in the pipeline (set to `parquet`)
    :type data_format: str
    :param spark_config: configurations for spark context
    :type spark_config: fncore.utils.spark_tools.SparkConfFactory
    :param max_path_len: maximum length of path for bfs
    :type max_path_len: int
    :param execution_date: date of the vouchers for checking
    :type execution_date: datetime.datetime
    :return: None
    :rtype: None
    """
    execution_date = execution_date - datetime.timedelta(days=offset_day)
    transactions_date = str(execution_date.date())

    with get_spark_context(spark_config.create()) as spark_context:
        sql_context = SQLContext(spark_context)

        tables = load_node_edge_lists(sql_context, graph_spec,
                                      node_resolved_dir, edge_resolved_dir,
                                      data_format)

        # 1. Get relevant edges
        edges = load_agd_greylist_edges(tables, graph_spec, transactions_date)
        edges = edges.cache()

        # 2. Get vendor subbusiness transactions for a `transactions_date`
        # as well as the debarred vendors for the `transactions_date`
        debarred_vendor = debarred_vendor_table(tables, transactions_date,
                                                graph_spec)
        transacted_vendor = division_vendor_transactions_table(
            tables, transactions_date, graph_spec)

        src_df = (transacted_vendor.select(
            col('vendor').alias('start')).distinct().cache())
        dst_df = (debarred_vendor.select(
            col('debarred_vendor').alias('end')).distinct().cache())

        # 3. Properly format the results from breadth first search
        if src_df.count() > 0 and dst_df.count() > 0:
            list_grey_list_results = []

            one_hop = _get_direct_transaction(transactions_date,
                                              transacted_vendor,
                                              debarred_vendor)

            if one_hop.take(1):
                list_grey_list_results.append(one_hop)

            for grey_list_result in bi_direction_bfs_any(
                    edges, src_df, dst_df, max_path_len=max_path_len):

                list_grey_list_results.append(
                    _process_grey_list_viz(grey_list_result, transactions_date,
                                           transacted_vendor, debarred_vendor))

            if list_grey_list_results:
                grey_list_results = (functools.reduce(DataFrame.unionAll,
                                                      list_grey_list_results))

                # 4. Write the results to parquet
                results_filename = ("grey_list_results_{}.parquet".format(
                    execution_date.strftime('%Y_%m_%d')))
                results_path = os.path.join(results_dir, results_filename)
                grey_list_results.write.parquet(results_path, mode='overwrite')
Example 11
def test_pipeline_tasks():
    """
    Test the pipeline
    """

    # Set the list of tasks to test
    dolist = [
        'build_lists', 'resolve_entities',
        'neo4j_purger', 'neo4j_writer',
        'graph_tools'
    ]

    # Get neo4j ssh username and port
    neo4j_ssh_username = os.environ.get('NEO4J_SSH_USERNAME', 'neo4j')
    neo4j_ssh_port = int(os.environ.get('NEO4J_SSH_PORT', 9000))

    # Setup the spark configuration
    config = dict()
    config['SparkConfiguration'] = (SparkConf()
                                    .setMaster('local[*]')
                                    .setAppName("test create data")
                                    .set("spark.executor.memory", "1024m"))

    # Get the graph specs
    datalist = os.listdir(LOCAL_DATA_PATH)
    jsonlist = [k for k in datalist if re.match(r'.*\.json$', k)]

    # Read in the graph spec
    for gspec in jsonlist:
        # Load the graph spec
        with open(os.path.join(LOCAL_DATA_PATH, gspec), 'r') as f:
            graph_spec = GraphSpec.from_dict(json.load(f))
            spec = graph_spec.to_dict()

        tables_path = os.path.join(DATA_PATH, graph_spec.name, 'tables')
        n_path = os.path.join(DATA_PATH, graph_spec.name, 'node_list')
        e_path = os.path.join(DATA_PATH, graph_spec.name, 'edge_list')
        n_path_res = os.path.join(DATA_PATH, graph_spec.name, 'node_list_resolved')
        e_path_res = os.path.join(DATA_PATH, graph_spec.name, 'edge_list_resolved')

        logging.info("Processing " + gspec)

        # Use graph specification's neo4j connection
        neo_config = {
            'uri': spec['graph_uri'],
            'max_retries': config.get('neo4j.max_retries', 5),
            'max_batchsize': config.get('neo4j.max_batchsize', 10000)
        }

        # Build list
        if 'build_lists' in dolist:
            logging.info("Building lists...")
            build_node_lists(
                graph_specification=graph_spec,
                spark_config=(SparkConfFactory()
                              .set_master('local[*]')
                              .set_app_name('test create data')
                              .set('spark.executor.memory', '1g')),
                tables_path=tables_path,
                node_path=n_path,
                data_format=DATA_FORMAT,
            )
            build_edge_lists(
                graph_specification=graph_spec,
                spark_config=(SparkConfFactory()
                              .set_master('local[*]')
                              .set_app_name('test create data')
                              .set('spark.executor.memory', '1g')),
                tables_path=tables_path,
                edge_path=e_path,
                data_format=DATA_FORMAT,
            )
            logging.info("Checking build_lists...")
            with get_spark_context(config['SparkConfiguration']) as spark_ctx:
                sql_context = SQLContext(spark_ctx, sparkSession=SparkSession(spark_ctx))
                assert test_build_lists(spark_ctx, sql_context, spec)

        # Resolve entities
        if 'resolve_entities' in dolist:
            logging.info("Resolving entities...")
            resolve_node_entities(
                graph_specification=graph_spec,
                spark_config=(SparkConfFactory()
                              .set_master('local[*]')
                              .set_app_name('test create data')
                              .set('spark.executor.memory', '1g')),
                entity_maps=dict(),
                input_node_path=n_path,
                output_node_path=n_path_res,
                output_node_id='_canonical_id',
                data_format=DATA_FORMAT
            )
            resolve_edge_entities(
                graph_specification=graph_spec,
                spark_config=(SparkConfFactory()
                              .set_master('local[*]')
                              .set_app_name('test create data')
                              .set('spark.executor.memory', '1g')),
                entity_maps=dict(),
                input_edge_path=e_path,
                output_edge_path=e_path_res,
                output_edge_source_id='_canonical_id_source',
                output_edge_target_id='_canonical_id_target',
                data_format=DATA_FORMAT
            )

        # Purging the graph
        if 'neo4j_purger' in dolist:
            logging.info("Purging Neo4j...")
            neo4j_manager.purge(graph_spec,
                                username=neo4j_ssh_username,
                                port=neo4j_ssh_port)
            logging.info("Checking purging neo4j...")
            with get_neo4j_context(neo_config['uri']) as neo_context:
                assert test_neo4j_purger(neo_context)

        # Graph writer
        if 'neo4j_writer' in dolist:
            logging.info("Writing to Neo4j...")
            graph_to_neo4j.graph_to_neo4j(graph_specification=graph_spec,
                                          spark_config=SparkConfFactory()
                                          .set_master('local[*]')
                                          .set_app_name('write neo4j nodes')
                                          .set("spark.driver.maxResultSize",
                                               "1g")
                                          .set('spark.executor.memory',
                                               '1g'),
                                          input_node_path=n_path_res,
                                          input_edge_path=e_path_res,
                                          username=neo4j_ssh_username,
                                          port=neo4j_ssh_port
                                          )

            # Insert node properties that were not captured by graph_to_neo4j above
            neo4j_writer.write_neo4j_nodes(graph_specification=spec,
                                           spark_config=SparkConfFactory()
                                           .set_master('local[*]')
                                           .set_app_name('write neo4j nodes')
                                           .set('spark.executor.memory',
                                                '1g')
                                           )

            datetime_now = datetime.now()
            logging.info("Backing up db, then purge it...")
            neo4j_manager.backup(graph_spec, datetime_now,
                                 username=neo4j_ssh_username,
                                 port=neo4j_ssh_port)
            neo4j_manager.purge(graph_spec,
                                username=neo4j_ssh_username,
                                port=neo4j_ssh_port)
            logging.info("Restoring the backup to db...")
            neo4j_manager.restore(graph_spec,
                                  datetime_now,
                                  username=neo4j_ssh_username,
                                  port=neo4j_ssh_port)

            logging.info("Checking write neo4j...")
            with get_spark_context(config['SparkConfiguration']) as spark_ctx:
                sql_context = SQLContext(spark_ctx, sparkSession=SparkSession(spark_ctx))
                with get_neo4j_context(neo_config['uri']) as neo_context:
                    assert test_neo4j_writer(
                        spark_ctx, sql_context, neo_context, spec
                    )

        if 'graph_tools' in dolist:
            # Test graph_construction_coi.get_graph_dataframes
            data_path = os.environ['PIPELINE_DATA_PATH']
            graph_name = graph_spec.name
            node_path_resolved = os.path.join(data_path, graph_name, 'node_list_resolved')
            edge_path_resolved = os.path.join(data_path, graph_name, 'edge_list_resolved')
            with get_spark_context(config['SparkConfiguration']) as spark_ctx:
                sql_context = SQLContext(spark_ctx, sparkSession=SparkSession(spark_ctx))
                graph = get_graph_dataframes(graph_spec, sql_context,
                                             node_path_resolved, edge_path_resolved,
                                             DATA_FORMAT)

                assert 'node_list' in graph
                assert 'edge_list' in graph
                assert len(graph['node_list']) == len(graph_spec.node_lists)
                for cur_node_list in graph_spec.node_lists:
                    assert cur_node_list.safe_name in graph['node_list']
                assert len(graph['edge_list']) == len(graph_spec.edge_lists)
                for cur_edge_list in graph_spec.edge_lists:
                    assert cur_edge_list.safe_name in graph['edge_list']

            # Test graph_construction_coi.data_loading
            with get_spark_context(config['SparkConfiguration']) as spark_ctx:
                sql_context = SQLContext(spark_ctx, sparkSession=SparkSession(spark_ctx))
                tables = load_node_edge_lists(sql_context, graph_spec,
                                              node_path_resolved, edge_path_resolved,
                                              DATA_FORMAT)
                for cur_edge_list in graph_spec.edge_lists:
                    assert (cur_edge_list.safe_table_name,
                            cur_edge_list.source_column.safe_name,
                            cur_edge_list.target_column.safe_name) in tables
                assert len(tables) == len(graph_spec.node_lists) + len(graph_spec.edge_lists)
    logging.info("Completed run_tests()")