def resolve_node_entities(graph_specification,
                          spark_config,
                          entity_maps,
                          input_node_path,
                          output_node_path,
                          output_node_id=None,
                          data_format='parquet'):
    """
    Runner for the resolve node entities task that sets up the SparkContext.

    :param graph_specification: Graph specification.
    :type graph_specification: fncore.utils.graph_specification.GraphSpec
    :param spark_config: Spark config.
    :type spark_config: fncore.utils.spark_tools.SparkConfFactory
    :param entity_maps: Canonical id mapping.
    :type entity_maps: dict
    :param input_node_path: Path to input node files for this graph.
    :type input_node_path: str
    :param output_node_path: Path to output node files for this graph.
    :type output_node_path: str
    :param output_node_id: Output column name. If it is set to `None`,
        the output will replace the mapped column.
    :type output_node_id: str
    :param data_format: Format to read and write files for this graph.
    :type data_format: str
    """
    with get_spark_context(spark_config.create()) as spark:
        _resolve_node_entities(graph_specification=graph_specification,
                               spark_context=spark,
                               entity_maps=entity_maps,
                               input_node_path=input_node_path,
                               output_node_path=output_node_path,
                               output_node_id=output_node_id,
                               data_format=data_format)


def build_edge_lists(graph_specification,
                     spark_config,
                     tables_path,
                     edge_path,
                     data_format='parquet'):
    """
    Runner for the build edge lists task that sets up the SparkContext.

    :param graph_specification: Graph specification.
    :type graph_specification: fncore.utils.graph_specification.GraphSpec
    :param spark_config: Spark config.
    :type spark_config: fncore.utils.spark_tools.SparkConfFactory
    :param tables_path: Path to get table files for this graph.
    :type tables_path: str
    :param edge_path: Path to output edge files for this graph.
    :type edge_path: str
    :param data_format: Format to read and write files for this graph.
    :type data_format: str
    """
    with get_spark_context(spark_config.create()) as spark:
        _build_edge_lists(graph_specification=graph_specification,
                          spark_context=spark,
                          tables_path=tables_path,
                          edge_path=edge_path,
                          data_format=data_format)


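# Usage sketch (illustrative only): the runners above are meant to be invoked
# from an airflow task with a fncore SparkConfFactory, in the same way the
# pipeline test at the end of this section calls them. The paths below are
# hypothetical placeholders, not paths the pipeline defines.
def _example_build_edge_lists(graph_spec):
    build_edge_lists(
        graph_specification=graph_spec,
        spark_config=(SparkConfFactory()
                      .set_master('local[*]')
                      .set_app_name('example build edge lists')
                      .set('spark.executor.memory', '1g')),
        tables_path='/tmp/example_graph/tables',   # hypothetical path
        edge_path='/tmp/example_graph/edge_list',  # hypothetical path
        data_format='parquet',
    )

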
def import_jdbc_table(spark_config,
                      uri,
                      input_table,
                      input_cols,
                      output_table,
                      output_cols,
                      driver,
                      data_format,
                      debug=False):
    """
    Reads a table from Microsoft SQL Server and writes the relevant
    columns into a parquet file.

    :param spark_config: Spark config.
    :type spark_config: fncore.utils.spark_tools.SparkConfFactory
    :param uri: Database connection string.
    :type uri: str
    :param input_table: Table to import.
    :type input_table: str
    :param input_cols: List of columns to import.
    :type input_cols: List[str]
    :param output_table: Path to save the imported table.
    :type output_table: str
    :param output_cols: Column names for saving the imported table.
    :type output_cols: List[str]
    :param driver: JDBC driver to use.
    :type driver: str
    :param data_format: Format to read and write files for this graph.
    :type data_format: str
    :param debug: Flag to sample a small fraction of the data.
    :type debug: bool
    """
    with get_spark_context(spark_config.create()) as spark_context:
        sql_context = SQLContext(spark_context)

        # Make use of pushdown optimization to read only the columns needed
        # https://docs.databricks.com/spark/latest/data-sources/sql-databases.html
        df_table = (
            sql_context.read
            .jdbc(url=uri, table=input_table, properties={'driver': driver})
            .select([col(c).alias(sc)
                     for c, sc in zip(input_cols, output_cols)])
        )

        if debug:
            df_table = df_table.sample(False, 0.025)

        (df_table.write
         .format(data_format)
         .mode(saveMode='overwrite')
         .save(output_table))


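# Usage sketch (illustrative only): importing two columns of a SQL Server
# table into parquet. The connection string, table name, columns, and output
# path are hypothetical; the driver class is the standard Microsoft JDBC
# driver name.
def _example_import_jdbc_table(spark_config):
    import_jdbc_table(
        spark_config=spark_config,
        uri='jdbc:sqlserver://example-host:1433;databaseName=exampledb;'
            'user=reader;password=secret',          # hypothetical
        input_table='dbo.vendors',                  # hypothetical
        input_cols=['VendorId', 'VendorName'],
        output_table='/tmp/example_graph/tables/vendors',
        output_cols=['vendor_id', 'vendor_name'],
        driver='com.microsoft.sqlserver.jdbc.SQLServerDriver',
        data_format='parquet',
        debug=False,
    )

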
def grey_list_results_writer(offset_day,
                             grey_list_results_dir,
                             jdbc_url,
                             driver,
                             grey_list_table_name,
                             spark_config,
                             execution_date,
                             **kwargs):
    """
    Entry point of the airflow task: grey_list_results_writer. Reads the
    grey list results for the execution date from HDFS and appends them to
    a table in the SQL database via JDBC. kwargs contains additional
    information that airflow passes to the function.

    :param offset_day: number of days offset from the execution date
    :type offset_day: int
    :param grey_list_results_dir: path to the grey list results for the
        execution date
    :type grey_list_results_dir: str
    :param jdbc_url: url to the microsoft sql server
    :type jdbc_url: str
    :param driver: jdbc driver to use
    :type driver: str
    :param grey_list_table_name: name of table to save the results to
    :type grey_list_table_name: str
    :param spark_config: configurations for spark context
    :type spark_config: fncore.utils.spark_tools.SparkConfFactory
    :param execution_date: date of the vouchers for checking
    :type execution_date: datetime.datetime
    :return: None
    :rtype: None
    """
    execution_date = execution_date - datetime.timedelta(days=offset_day)

    with get_spark_context(spark_config.create()) as spark_context:
        sql_context = SQLContext(spark_context)

        grey_list_results_filename = (
            "grey_list_results_{}.parquet"
            .format(execution_date.strftime('%Y_%m_%d')))
        grey_list_results_path = os.path.join(grey_list_results_dir,
                                              grey_list_results_filename)

        if check_file_exists_hdfs(grey_list_results_path):
            grey_list_results = (sql_context.read
                                 .parquet(grey_list_results_path)
                                 .cache())
            # Materialize the cached results before writing them out
            grey_list_results.count()
            grey_list_results.write.mode(saveMode='append').jdbc(
                url=jdbc_url,
                table=grey_list_table_name,
                properties={'driver': driver})


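# For reference, a minimal sketch of what the check_file_exists_hdfs helper
# used above might look like, assuming the `hdfs` CLI is available on the
# PATH (`hdfs dfs -test -e PATH` exits with status 0 when the path exists).
# This is an assumption about the helper, not its actual implementation.
def _example_check_file_exists_hdfs(path):
    import subprocess
    return subprocess.call(['hdfs', 'dfs', '-test', '-e', path]) == 0

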
def write_neo4j_edges(graph_specification, spark_config):
    """
    Given the graph specification and spark configuration, insert the edges
    in the data (specified in the graph specification) into the neo4j
    database, whose connection details are read from the graph
    specification. This is used as an airflow task.

    :param graph_specification: graph specification in dictionary format
    :type graph_specification: dict
    :param spark_config: Spark config.
    :type spark_config: fncore.utils.spark_tools.SparkConfFactory
    :return: Does not return anything
    :rtype: None
    """
    # pylint: disable=too-many-locals
    data_format, graph_data_path = get_hdfs_info(graph_specification)

    # Use graph specification's neo4j connection
    neo_config = {
        'uri': graph_specification['graph_uri'],
        'max_retries': 5,
        'max_batchsize': 20000
    }

    with get_spark_context(spark_config.create()) as spark_ctx:
        sql_context = SQLContext(spark_ctx)

        edge_list = graph_specification.get('edge_lists')
        count = 0
        if edge_list:
            for edge_kind in edge_list:
                count += 1
                logging.info("# %d/%d: %s",
                             count, len(edge_list), edge_kind['safe_name'])

                # Load in the edge list
                data = sql_context\
                    .read.format(data_format)\
                    .load(os.path.join(graph_data_path['edge_list_resolved'],
                                       edge_kind['safe_name']))

                # Get the friendly name mapping
                mapping = get_friendly_name_mapping(edge_kind)

                # Get the hidden fields
                hiddenfields = get_fields_with_property(edge_kind,
                                                        prop='hidden')

                # Drop duplicates and invalid rows on the key columns
                keyname = None
                keylist = ['_canonical_id_source', '_canonical_id_target']
                if 'index_column' in edge_kind:
                    keyname = edge_kind['index_column']\
                        .get('safe_name', None)
                if keyname:
                    keylist.append(keyname)
                data = data.dropDuplicates(keylist)\
                           .dropna(how='any', subset=keylist)

                data = data.repartition(1000)
                logging.info("Count: %d", data.count())

                # Insert the edges into the Neo4j database. The loop
                # variables are bound as lambda default arguments so that
                # each partition sees this edge kind's values rather than
                # those of the last loop iteration.
                tags = edge_kind['tags'] if 'tags' in edge_kind else 'related'
                data.foreachPartition(
                    lambda x, t=tags, key=keyname, m=mapping,
                    h=hiddenfields, n=keylist:
                    push_edges(neo_config, t, key, x, m, h, n)
                )


def write_neo4j_nodes(graph_specification, spark_config):
    """
    Given the graph specification and spark configuration, insert the nodes
    in the data (specified in the graph specification) into the neo4j
    database, whose connection details are read from the graph
    specification. This is used as an airflow task.

    :param graph_specification: graph specification in dictionary format
    :type graph_specification: dict
    :param spark_config: Spark config.
    :type spark_config: fncore.utils.spark_tools.SparkConfFactory
    :return: Does not return anything
    :rtype: None
    """
    # pylint: disable=too-many-locals
    data_format, graph_data_path = get_hdfs_info(graph_specification)

    # Use graph specification's neo4j connection
    neo_config = {
        'uri': graph_specification['graph_uri'],
        'max_retries': 5,
        'max_batchsize': 20000
    }

    with get_spark_context(spark_config.create()) as spark_ctx:
        sql_context = SQLContext(spark_ctx)

        node_list = graph_specification.get('node_lists')
        count = 0
        if node_list:
            for node_kind in node_list:
                count += 1
                logging.info(
                    "%d/%d: %s",
                    count, len(node_list), node_kind['safe_name']
                )

                # Load in the node list, dropping entries without a
                # canonical id
                data = sql_context\
                    .read.format(data_format)\
                    .option('header', 'true')\
                    .option('inferschema', 'true')\
                    .load(os.path.join(graph_data_path['node_list_resolved'],
                                       node_kind['safe_name']))\
                    .dropna(how='any', subset=['_canonical_id'])

                # Get the friendly name mapping
                mapping = get_friendly_name_mapping(node_kind)

                # Get the hidden fields
                hiddenfields = get_fields_with_property(node_kind,
                                                        prop='hidden')

                # Get the labelled fields
                labelledfields = get_fields_with_property(
                    node_kind, prop='use_as_label')
                indexfields = ['_label' if k == 0 else '_label_' + str(k)
                               for k in range(len(labelledfields))]
                labelledfields.append(
                    node_kind['index_column'].get('safe_name'))
                indexfields.append('_node_id')

                # Drop invalid data in the fields that need to be indexed
                data = data.dropna(how='any', subset=labelledfields)

                # Ignore node id and label fields
                noappendfields = indexfields + [labelledfields[-1]]

                # Update the data frame to have the labels
                for oldfield, newfield in zip(labelledfields, indexfields):
                    data = data.withColumn(newfield, upper(data[oldfield]))

                # Set up the node constraints and indices on the labels
                tags = node_kind['tags'] + ['_searchable']
                with get_neo4j_context(neo_config['uri']) as neo_ctx:
                    for tag in tags:
                        create_uniqueness_constraint(
                            neo_ctx, tag, '_canonical_id')
                    already_indexed = get_indexes(neo_ctx, '_searchable')
                    for curindex in indexfields:
                        if curindex not in already_indexed:
                            create_index(neo_ctx, '_searchable', curindex)

                # Insert the nodes, binding the loop variables as lambda
                # default arguments so each partition uses this node
                # kind's values
                data.foreachPartition(
                    lambda x, t=tags, m=mapping, h=hiddenfields,
                    n=noappendfields:
                    push_nodes(neo_config, t, x, m, h, n)
                )


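# For reference, hedged sketches of the Cypher statements the constraint and
# index helpers used above might issue, assuming the neo4j context object
# exposes a `run` method for Cypher (as the official driver's sessions do).
# The syntax shown is the Neo4j 3.x form; the real helpers may differ.
def _example_create_uniqueness_constraint(neo_ctx, label, prop):
    neo_ctx.run('CREATE CONSTRAINT ON (n:`{}`) '
                'ASSERT n.`{}` IS UNIQUE'.format(label, prop))


def _example_create_index(neo_ctx, label, prop):
    neo_ctx.run('CREATE INDEX ON :`{}`(`{}`)'.format(label, prop))

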
def get_transformed_edges(graph_specification,
                          spark_config,
                          input_edge_path,
                          input_source_col,
                          input_target_col,
                          output_source_col,
                          output_target_col,
                          output_tag_col,
                          data_format='parquet',
                          array_delimiter=';',
                          max_result_size=1e9):
    """
    A generator that yields a Pandas data frame for each processed edge
    list in the graph specification.

    :param graph_specification: Graph specification.
    :type graph_specification: fncore.utils.graph_specification.GraphSpec
    :param spark_config: Spark config.
    :type spark_config: fncore.utils.spark_tools.SparkConfFactory
    :param input_edge_path: Path to input edge files for this graph.
    :type input_edge_path: str
    :param input_source_col: Column name of the source id in the input.
    :type input_source_col: str
    :param input_target_col: Column name of the target id in the input.
    :type input_target_col: str
    :param output_source_col: Column name to use for source id.
    :type output_source_col: str
    :param output_target_col: Column name to use for target id.
    :type output_target_col: str
    :param output_tag_col: Column name to use for edge tag.
    :type output_tag_col: str
    :param data_format: Format to read and write files for this graph.
    :type data_format: str
    :param array_delimiter: Delimiter used to separate items in an array.
    :type array_delimiter: str
    :param max_result_size: Maximum result size that the spark driver
        accepts.
    :type max_result_size: int
    """
    for edge_kind in graph_specification.edge_lists:
        with get_spark_context(spark_config.create()) as spark_context:
            sql_context = SQLContext(spark_context)

            data = (sql_context.read
                    .format(data_format)
                    .option('header', 'true')
                    .option('inferschema', 'true')
                    .load(os.path.join(input_edge_path,
                                       edge_kind.safe_name)))

            edge_kind_columns = (
                edge_kind.metadata_columns
                + [edge_kind.source_column]
                + [edge_kind.target_column]
                + ([edge_kind.index_column]
                   if edge_kind.index_column else [])
                + ([edge_kind.weight_column]
                   if edge_kind.weight_column else []))

            transformed = data

            # Drop duplicates (if an index column does not exist)
            # TODO: Support multi-field index in the future
            if not edge_kind.index_column:
                dedup_columns = [edge_kind.source_column.safe_name,
                                 edge_kind.target_column.safe_name]
                transformed = transformed.dropDuplicates(
                    subset=dedup_columns)

            for column in edge_kind_columns:
                transformed = transformed.withColumnRenamed(
                    column.safe_name,
                    column.friendly_name or column.name)

            edge_tags = array_delimiter.join(edge_kind.tags)
            transformed = (transformed
                           .withColumn(output_source_col,
                                       trim(transformed[input_source_col]))
                           .withColumn(output_target_col,
                                       trim(transformed[input_target_col]))
                           .withColumn(output_tag_col, lit(edge_tags)))

            transformed = (transformed
                           .dropna(how='any',
                                   subset=[output_source_col,
                                           output_target_col])
                           .filter(transformed[output_source_col] != '')
                           .filter(transformed[output_target_col] != ''))

            for dataframe in to_pandas_iterator(
                    transformed, max_result_size=max_result_size):
                yield dataframe


def get_combined_nodes(graph_specification,
                       spark_config,
                       input_node_path,
                       output_node_id_col,
                       output_label_col,
                       output_tag_col,
                       common_tags,
                       data_format='parquet',
                       array_delimiter=';',
                       max_result_size=1e9):
    """
    A generator that yields Pandas data frames of the combined nodes.

    :param graph_specification: Graph specification.
    :type graph_specification: fncore.utils.graph_specification.GraphSpec
    :param spark_config: Spark config.
    :type spark_config: fncore.utils.spark_tools.SparkConfFactory
    :param input_node_path: Path to input node files for this graph.
    :type input_node_path: str
    :param output_node_id_col: Column name to use for node id.
    :type output_node_id_col: str
    :param output_label_col: Column name to use for node label.
    :type output_label_col: str
    :param output_tag_col: Column name to use for node tag.
    :type output_tag_col: str
    :param common_tags: Common tags to append to all the nodes.
    :type common_tags: list of str
    :param data_format: Format to read and write files for this graph.
    :type data_format: str
    :param array_delimiter: Delimiter used to separate items in an array.
    :type array_delimiter: str
    :param max_result_size: Maximum result size that the spark driver
        accepts.
    :type max_result_size: int
    """
    with get_spark_context(spark_config.create()) as spark_context:
        transformed_node_lists = load_transform_nodes(
            graph_specification=graph_specification,
            spark_context=spark_context,
            input_node_path=input_node_path,
            output_node_id_col=output_node_id_col,
            output_label_col=output_label_col,
            output_tag_col=output_tag_col,
            data_format=data_format,
            array_delimiter=array_delimiter)

        nodes_formatted = (
            union_all(transformed_node_lists)
            .groupby(output_node_id_col)
            .agg(collect_set(output_label_col).alias(output_label_col),
                 collect_set(output_tag_col).alias(output_tag_col))
            .withColumn(output_tag_col,
                        prepend(common_tags)(output_tag_col))
            .withColumn(output_label_col,
                        array_to_str(array_delimiter)(output_label_col))
            .withColumn(output_tag_col,
                        array_to_str(array_delimiter)(output_tag_col))
            .repartition(1000)
            .cache())

        # Drop nodes with an empty id (required field)
        nodes_dropped = (nodes_formatted
                         .filter(nodes_formatted[output_node_id_col] != '')
                         .dropna(how='any', subset=[output_node_id_col]))

        # Yield the dataframe in batches
        for dataframe in to_pandas_iterator(
                nodes_dropped, max_result_size=max_result_size):
            yield dataframe


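# Usage sketch (illustrative only): both generators above yield Pandas data
# frames in driver-sized batches, so a caller can stream a large graph to
# disk without collecting it all at once. The input path, column names, and
# output file pattern here are hypothetical.
def _example_export_combined_nodes(graph_spec, spark_config):
    for i, frame in enumerate(get_combined_nodes(
            graph_specification=graph_spec,
            spark_config=spark_config,
            input_node_path='/tmp/example_graph/node_list_resolved',
            output_node_id_col='id',
            output_label_col='label',
            output_tag_col='tags',
            common_tags=['_searchable'])):
        frame.to_csv('/tmp/example_nodes_{}.csv'.format(i), index=False)

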
def coi_detection(offset_day,
                  graph_spec,
                  node_resolved_dir,
                  edge_resolved_dir,
                  results_dir,
                  data_format,
                  spark_config,
                  max_path_len,
                  execution_date,
                  **kwargs):
    """
    Entry point of the airflow task: coi_detection. kwargs contains
    additional information that airflow passes to the function.

    :param offset_day: number of days offset from the execution date
    :type offset_day: int
    :param graph_spec: the graph specification
    :type graph_spec: fncore.utils.graph_specification.GraphSpec
    :param node_resolved_dir: path to the resolved node lists
    :type node_resolved_dir: str
    :param edge_resolved_dir: path to the resolved edge lists
    :type edge_resolved_dir: str
    :param results_dir: path to write the final results
    :type results_dir: str
    :param data_format: data format used in the pipeline (set to `parquet`)
    :type data_format: str
    :param spark_config: configurations for spark context
    :type spark_config: fncore.utils.spark_tools.SparkConfFactory
    :param max_path_len: maximum length of path for bfs
    :type max_path_len: int
    :param execution_date: date of the vouchers for checking
    :type execution_date: datetime.datetime
    :return: None
    :rtype: None
    """
    execution_date = execution_date - datetime.timedelta(days=offset_day)
    transactions_date = str(execution_date.date())

    with get_spark_context(spark_config.create()) as spark_context:
        sql_context = SQLContext(spark_context)
        tables = load_node_edge_lists(sql_context, graph_spec,
                                      node_resolved_dir, edge_resolved_dir,
                                      data_format)

        # 1. Get relevant edges
        edges = load_agd_coi_edges(tables, graph_spec, transactions_date)
        edges = edges.cache()

        # 2. Get division vendor transactions for a given `transactions_date`
        division_vendor_transactions = division_vendor_transactions_table(
            tables, transactions_date, graph_spec)
        src_dst_df = (division_vendor_transactions
                      .select(col('division').alias('start'),
                              col('vendor').alias('end')))
        src_dst_df = src_dst_df.dropna().distinct().cache()

        # 3. Properly format the results from breadth first search.
        # Only performs breadth first search if src_dst_df is non-empty
        if src_dst_df.count() > 0:
            list_coi_results = list()
            for coi_result in bi_direction_bfs_fixed(
                    edges, src_dst_df, max_path_len=max_path_len):
                list_coi_results.append(
                    _process_coi_viz(coi_result, transactions_date,
                                     division_vendor_transactions))

            if list_coi_results:
                coi_results = functools.reduce(DataFrame.unionAll,
                                               list_coi_results)

                # 4. Write the results to parquet
                results_filename = ("coi_results_{}.parquet".format(
                    execution_date.strftime('%Y_%m_%d')))
                results_path = os.path.join(results_dir, results_filename)
                coi_results.write.parquet(results_path, mode='overwrite')


def grey_list_detection(offset_day,
                        graph_spec,
                        node_resolved_dir,
                        edge_resolved_dir,
                        results_dir,
                        data_format,
                        spark_config,
                        max_path_len,
                        execution_date,
                        **kwargs):
    """
    Entry point of the airflow task: grey_list_detection. kwargs contains
    additional information that airflow passes in.

    :param offset_day: number of days offset from the execution date
    :type offset_day: int
    :param graph_spec: the graph specification
    :type graph_spec: fncore.utils.graph_specification.GraphSpec
    :param node_resolved_dir: path to the resolved node lists
    :type node_resolved_dir: str
    :param edge_resolved_dir: path to the resolved edge lists
    :type edge_resolved_dir: str
    :param results_dir: path to write the final results
    :type results_dir: str
    :param data_format: data format used in the pipeline (set to `parquet`)
    :type data_format: str
    :param spark_config: configurations for spark context
    :type spark_config: fncore.utils.spark_tools.SparkConfFactory
    :param max_path_len: maximum length of path for bfs
    :type max_path_len: int
    :param execution_date: date of the vouchers for checking
    :type execution_date: datetime.datetime
    :return: None
    :rtype: None
    """
    execution_date = execution_date - datetime.timedelta(days=offset_day)
    transactions_date = str(execution_date.date())

    with get_spark_context(spark_config.create()) as spark_context:
        sql_context = SQLContext(spark_context)
        tables = load_node_edge_lists(sql_context, graph_spec,
                                      node_resolved_dir, edge_resolved_dir,
                                      data_format)

        # 1. Get relevant edges
        edges = load_agd_greylist_edges(tables, graph_spec,
                                        transactions_date)
        edges = edges.cache()

        # 2. Get vendor subbusiness transactions for a `transactions_date`
        #    as well as the debarred vendors for the `transactions_date`
        debarred_vendor = debarred_vendor_table(tables, transactions_date,
                                                graph_spec)
        transacted_vendor = division_vendor_transactions_table(
            tables, transactions_date, graph_spec)
        src_df = (transacted_vendor
                  .select(col('vendor').alias('start'))
                  .distinct().cache())
        dst_df = (debarred_vendor
                  .select(col('debarred_vendor').alias('end'))
                  .distinct().cache())

        # 3. Properly format the results from breadth first search
        if src_df.count() > 0 and dst_df.count() > 0:
            list_grey_list_results = []

            one_hop = _get_direct_transaction(transactions_date,
                                              transacted_vendor,
                                              debarred_vendor)
            if one_hop.take(1):
                list_grey_list_results.append(one_hop)

            for grey_list_result in bi_direction_bfs_any(
                    edges, src_df, dst_df, max_path_len=max_path_len):
                list_grey_list_results.append(
                    _process_grey_list_viz(grey_list_result,
                                           transactions_date,
                                           transacted_vendor,
                                           debarred_vendor))

            if list_grey_list_results:
                grey_list_results = functools.reduce(DataFrame.unionAll,
                                                     list_grey_list_results)

                # 4. Write the results to parquet
                results_filename = ("grey_list_results_{}.parquet".format(
                    execution_date.strftime('%Y_%m_%d')))
                results_path = os.path.join(results_dir, results_filename)
                grey_list_results.write.parquet(results_path,
                                                mode='overwrite')


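# Usage sketch (illustrative only): the docstrings above describe these
# functions as airflow entry points. Assuming airflow 1.x, they could be
# wired into a DAG with PythonOperator along these lines; the DAG id,
# schedule, start date, and paths are hypothetical. With
# provide_context=True, airflow supplies execution_date (and other context)
# through kwargs.
def _example_build_grey_list_dag(graph_spec, spark_config):
    from airflow import DAG
    from airflow.operators.python_operator import PythonOperator

    dag = DAG('example_grey_list',                  # hypothetical DAG id
              schedule_interval='@daily',
              start_date=datetime.datetime(2017, 1, 1))
    PythonOperator(
        task_id='grey_list_detection',
        python_callable=grey_list_detection,
        provide_context=True,
        op_kwargs={
            'offset_day': 1,
            'graph_spec': graph_spec,
            'node_resolved_dir': '/tmp/example_graph/node_list_resolved',
            'edge_resolved_dir': '/tmp/example_graph/edge_list_resolved',
            'results_dir': '/tmp/example_graph/grey_list_results',
            'data_format': 'parquet',
            'spark_config': spark_config,
            'max_path_len': 5,
        },
        dag=dag)
    return dag

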
def test_pipeline_tasks():
    """ Test the pipeline """
    # Set the list of tasks to test
    dolist = ['build_lists', 'resolve_entities', 'neo4j_purger',
              'neo4j_writer', 'graph_tools']

    # Get neo4j ssh username and port
    neo4j_ssh_username = os.environ.get('NEO4J_SSH_USERNAME', 'neo4j')
    neo4j_ssh_port = int(os.environ.get('NEO4J_SSH_PORT', 9000))

    # Set up the spark configuration
    config = dict()
    config['SparkConfiguration'] = (SparkConf()
                                    .setMaster('local[*]')
                                    .setAppName("test create data")
                                    .set("spark.executor.memory", "1024m"))

    # Get the graph specs
    datalist = os.listdir(LOCAL_DATA_PATH)
    jsonlist = [k for k in datalist if re.match(r'.*\.json$', k)]

    # Read in the graph spec
    for gspec in jsonlist:
        # Load the graph spec
        with open(os.path.join(LOCAL_DATA_PATH, gspec), 'r') as f:
            graph_spec = GraphSpec.from_dict(json.load(f))
            spec = graph_spec.to_dict()

        tables_path = os.path.join(DATA_PATH, graph_spec.name, 'tables')
        n_path = os.path.join(DATA_PATH, graph_spec.name, 'node_list')
        e_path = os.path.join(DATA_PATH, graph_spec.name, 'edge_list')
        n_path_res = os.path.join(DATA_PATH, graph_spec.name,
                                  'node_list_resolved')
        e_path_res = os.path.join(DATA_PATH, graph_spec.name,
                                  'edge_list_resolved')

        logging.info("Processing %s", gspec)

        # Use graph specification's neo4j connection
        neo_config = {
            'uri': spec['graph_uri'],
            'max_retries': config.get('neo4j.max_retries', 5),
            'max_batchsize': config.get('neo4j.max_batchsize', 10000)
        }

        # Build lists
        if 'build_lists' in dolist:
            logging.info("Building lists...")
            build_node_lists(
                graph_specification=graph_spec,
                spark_config=(SparkConfFactory()
                              .set_master('local[*]')
                              .set_app_name('test create data')
                              .set('spark.executor.memory', '1g')),
                tables_path=tables_path,
                node_path=n_path,
                data_format=DATA_FORMAT,
            )
            build_edge_lists(
                graph_specification=graph_spec,
                spark_config=(SparkConfFactory()
                              .set_master('local[*]')
                              .set_app_name('test create data')
                              .set('spark.executor.memory', '1g')),
                tables_path=tables_path,
                edge_path=e_path,
                data_format=DATA_FORMAT,
            )

            logging.info("Checking build_lists...")
            with get_spark_context(config['SparkConfiguration']) as spark_ctx:
                sql_context = SQLContext(
                    spark_ctx, sparkSession=SparkSession(spark_ctx))
                assert test_build_lists(spark_ctx, sql_context, spec)

        # Resolve entities
        if 'resolve_entities' in dolist:
            logging.info("Resolving entities...")
            resolve_node_entities(
                graph_specification=graph_spec,
                spark_config=(SparkConfFactory()
                              .set_master('local[*]')
                              .set_app_name('test create data')
                              .set('spark.executor.memory', '1g')),
                entity_maps=dict(),
                input_node_path=n_path,
                output_node_path=n_path_res,
                output_node_id='_canonical_id',
                data_format=DATA_FORMAT
            )
            resolve_edge_entities(
                graph_specification=graph_spec,
                spark_config=(SparkConfFactory()
                              .set_master('local[*]')
                              .set_app_name('test create data')
                              .set('spark.executor.memory', '1g')),
                entity_maps=dict(),
                input_edge_path=e_path,
                output_edge_path=e_path_res,
                output_edge_source_id='_canonical_id_source',
                output_edge_target_id='_canonical_id_target',
                data_format=DATA_FORMAT
            )

        # Purge the graph
        if 'neo4j_purger' in dolist:
            logging.info("Purging Neo4j...")
            neo4j_manager.purge(graph_spec,
                                username=neo4j_ssh_username,
                                port=neo4j_ssh_port)

            logging.info("Checking purging neo4j...")
            with get_neo4j_context(neo_config['uri']) as neo_context:
                assert test_neo4j_purger(neo_context)

        # Graph writer
        if 'neo4j_writer' in dolist:
            logging.info("Writing to Neo4j...")
            graph_to_neo4j.graph_to_neo4j(
                graph_specification=graph_spec,
                spark_config=(SparkConfFactory()
                              .set_master('local[*]')
                              .set_app_name('write neo4j nodes')
                              .set("spark.driver.maxResultSize", "1g")
                              .set('spark.executor.memory', '1g')),
                input_node_path=n_path_res,
                input_edge_path=e_path_res,
                username=neo4j_ssh_username,
                port=neo4j_ssh_port
            )

            # Insert node properties that were not captured above
            neo4j_writer.write_neo4j_nodes(
                graph_specification=spec,
                spark_config=(SparkConfFactory()
                              .set_master('local[*]')
                              .set_app_name('write neo4j nodes')
                              .set('spark.executor.memory', '1g'))
            )

            datetime_now = datetime.now()
            logging.info("Backing up db, then purge it...")
            neo4j_manager.backup(graph_spec, datetime_now,
                                 username=neo4j_ssh_username,
                                 port=neo4j_ssh_port)
            neo4j_manager.purge(graph_spec,
                                username=neo4j_ssh_username,
                                port=neo4j_ssh_port)

            logging.info("Restoring the backup to db...")
            neo4j_manager.restore(graph_spec, datetime_now,
                                  username=neo4j_ssh_username,
                                  port=neo4j_ssh_port)

            logging.info("Checking write neo4j...")
            with get_spark_context(config['SparkConfiguration']) as spark_ctx:
                sql_context = SQLContext(
                    spark_ctx, sparkSession=SparkSession(spark_ctx))
                with get_neo4j_context(neo_config['uri']) as neo_context:
                    assert test_neo4j_writer(spark_ctx, sql_context,
                                             neo_context, spec)

        if 'graph_tools' in dolist:
            # Test graph_construction_coi.get_graph_dataframes
            data_path = os.environ['PIPELINE_DATA_PATH']
            graph_name = graph_spec.name
            node_path_resolved = os.path.join(data_path, graph_name,
                                              'node_list_resolved')
            edge_path_resolved = os.path.join(data_path, graph_name,
                                              'edge_list_resolved')

            with get_spark_context(config['SparkConfiguration']) as spark_ctx:
                sql_context = SQLContext(
                    spark_ctx, sparkSession=SparkSession(spark_ctx))
                graph = get_graph_dataframes(graph_spec, sql_context,
                                             node_path_resolved,
                                             edge_path_resolved,
                                             DATA_FORMAT)

                assert 'node_list' in graph
                assert 'edge_list' in graph
                assert len(graph['node_list']) == len(graph_spec.node_lists)
                for cur_node_list in graph_spec.node_lists:
                    assert cur_node_list.safe_name in graph['node_list']
                assert len(graph['edge_list']) == len(graph_spec.edge_lists)
                for cur_edge_list in graph_spec.edge_lists:
                    assert cur_edge_list.safe_name in graph['edge_list']

            # Test graph_construction_coi.data_loading
            with get_spark_context(config['SparkConfiguration']) as spark_ctx:
                sql_context = SQLContext(
                    spark_ctx, sparkSession=SparkSession(spark_ctx))
                tables = load_node_edge_lists(sql_context, graph_spec,
                                              node_path_resolved,
                                              edge_path_resolved,
                                              DATA_FORMAT)
                for cur_edge_list in graph_spec.edge_lists:
                    assert (cur_edge_list.safe_table_name,
                            cur_edge_list.source_column.safe_name,
                            cur_edge_list.target_column.safe_name) in tables
                assert len(tables) == (len(graph_spec.node_lists)
                                       + len(graph_spec.edge_lists))

    logging.info("Completed run_tests()")