def import_bulk(data_source, book_keeper):
    """
    Import bulk data from the given data source.

    It can perform both a 'full import' and an 'incremental update'.

    :param data_source: data source to read input from
    :param book_keeper: book keeper to get info about recently ingested data
    :return: report dict describing the outcome of the import
    """
    try:
        # Get the last incremental update timestamp from the graph.
        graph_meta = GraphPopulator.get_metadata()

        list_keys = []
        if graph_meta is None:
            # If the timestamp is unknown, the graph is not populated yet and we
            # need to do a full import: collect all the files from the data
            # source and group them by package-version.
            logger.debug("Performing full import. Fetching all objects from: " +
                         data_source.get_source_name())
            list_keys = data_source.list_files()
        else:
            # The timestamp is available, so perform an incremental update.
            if book_keeper is None:
                raise RuntimeError("Cannot perform incremental update without book keeper!")

            # Collect all the package-versions from the RDS table that were updated recently.
            # Note: if the RDS table is unreachable, we should still live with S3 data.
            min_finished_at = graph_meta.last_incr_update_ts
            list_epv = book_keeper.get_recent_epv(min_finished_at)

            # Collect relevant files from the data source and group them by package-version.
            logger.debug("Performing incremental update. Fetching some objects from: " +
                         data_source.get_source_name())
            for epv in list_epv:
                key_prefix = epv.get('ecosystem') + "/" + epv.get('name') + "/" + epv.get('version')
                list_keys.extend(data_source.list_files(prefix=key_prefix))

        # Import the S3 data.
        dict_grouped_keys = _group_keys_by_epv(list_keys, data_source)
        report = _import_grouped_keys(data_source, dict_grouped_keys)

        # Finally, update the metadata in the graph.
        if report.get('max_finished_at') is not None:
            dict_graph_meta = {
                'last_incremental_update_timestamp': report.get('max_finished_at'),
                'last_imported_epv': report.get('last_imported_EPV')
            }
            GraphPopulator.update_metadata(dict_graph_meta)
        _log_report_msg("import_bulk()", report)
    except Exception as e:
        msg = _get_exception_msg("import_bulk() failed with error", e)
        raise RuntimeError(msg)

    return report
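# Usage sketch (not part of the original module): one way import_bulk() might be
# driven. LocalFileSystem and the 'test/data/full_import' path are taken from the
# test below; passing book_keeper=None forces a full import when the graph holds
# no metadata yet, while a JsonBookKeeper enables the incremental path.
def example_import_bulk():
    source = LocalFileSystem(src_dir='test/data/full_import')
    report = import_bulk(data_source=source, book_keeper=None)
    # The returned report carries status, counters, and the last imported EPV.
    return report.get('status'), report.get('count_imported_EPVs')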
def test_construct_version_query_4():
    """Test the GraphPopulator.construct_version_query() class method."""
    input_json = {
        "version": "0.4.59",
        "package": "access_points",
        "ecosystem": "pypi",
        "analyses": {
            "metadata": {"details": [
                {"description": "Some description here",
                 "declared_license": "GPL and\nv2.0"}
            ]}
        }
    }
    q = GraphPopulator.construct_version_query(input_json)
    logger.info(q)
    assert "access_points" in q
    assert "0.4.59" in q
    assert "pypi" in q

    input_json = {
        "version": "deb579d6e030503f430978ee229008b9bc912d40",
        "package": "github.com/gorilla/mux",
        "ecosystem": "go",
        "analyses": {
            "source_licenses": {
                "status": "success",
                "summary": {
                    "sure_licenses": [
                        "BSD-Modified"
                    ]
                }
            },
            "metadata": {
                "details": [
                    {
                        "code_repository": {
                            "type": "git",
                            "url": "https://github.com/gorilla/mux"
                        },
                        "dependencies": [],
                        "ecosystem": "gofedlib",
                        "name": "github.com/gorilla/mux",
                        "version": "deb579d6e030503f430978ee229008b9bc912d40"
                    }
                ]
            }
        }
    }
    q = GraphPopulator.construct_version_query(input_json)
    assert "'declared_licenses'" in q
    assert "'licenses'" in q
    assert "BSD-Modified" in q
def _import_grouped_keys(data_source, dict_grouped_keys):
    """Import the grouped keys and return a report about the import."""
    logger.debug("Begin import...")
    date_time_format = "%Y-%m-%dT%H:%M:%S.%f"
    report = {'status': 'Success', 'message': 'The import finished successfully!'}
    count_imported_EPVs = 0
    max_finished_at = None
    max_datetime = None
    last_imported_EPV = None

    if not dict_grouped_keys:
        report['message'] = 'Nothing to be imported! No data found on S3 to be imported!'

    try:
        for counter, v in dict_grouped_keys.items():
            first_key = v[0]
            logger.debug("Importing " + first_key)
            logger.debug("File---- %s numbered---- %d added:" % (first_key, counter))

            obj, cur_finished_at = _first_key_info(data_source, first_key)
            if obj is None:
                continue
            obj_returned = _other_key_info(data_source, other_keys=v[1:])
            obj.update(obj_returned)
            GraphPopulator.populate_from_json(obj)

            count_imported_EPVs += 1
            last_imported_EPV = first_key

            max_finished_at = _set_max_finished_at(max_finished_at, cur_finished_at,
                                                   max_datetime, date_time_format)
            max_datetime = datetime.strptime(max_finished_at, date_time_format)
    except Exception as e:
        msg = _get_exception_msg("The import failed", e)
        report['status'] = 'Failure'
        report['message'] = msg

    report['count_imported_EPVs'] = count_imported_EPVs
    report['last_imported_EPV'] = last_imported_EPV
    report['max_finished_at'] = max_finished_at
    return report
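# A minimal sketch of the _set_max_finished_at() helper called above; its real
# implementation is not shown in this section, but the call sites imply that it
# returns the later of the two timestamps as a string in date_time_format.
def _set_max_finished_at_sketch(max_finished_at, cur_finished_at,
                                max_datetime, date_time_format):
    from datetime import datetime
    if max_finished_at is None:
        # No timestamp seen yet, so the current one wins by default.
        return cur_finished_at
    cur_datetime = datetime.strptime(cur_finished_at, date_time_format)
    return cur_finished_at if cur_datetime > max_datetime else max_finished_at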
def create_pv_nodes(self):
    """Create Package and Version nodes, if needed."""
    for pv_dict in self._cve_dict.get('affected'):
        epv_dict = pv_dict.copy()
        epv_dict['ecosystem'] = self._cve_dict.get('ecosystem')

        query = GraphPopulator.construct_graph_nodes(epv_dict)
        success, json_response = BayesianGraph.execute(query)
        e = epv_dict.get('ecosystem')
        p = epv_dict.get('name')
        v = epv_dict.get('version')
        if not success:
            logger.error('Error creating nodes for {e}/{p}/{v}: {r}'.format(
                e=e, p=p, v=v, r=str(json_response)))
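# For reference, create_pv_nodes() above expects self._cve_dict to carry a
# top-level 'ecosystem' and an 'affected' list of package/version dicts; the
# field values in this illustrative shape are made up.
EXAMPLE_CVE_DICT = {
    'ecosystem': 'pypi',
    'affected': [
        {'name': 'access_points', 'version': '0.4.59'}
    ]
}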
def _import_grouped_keys_http(data_source, dict_grouped_keys):
    """Import the grouped keys via the Gremlin HTTP endpoint and report the result."""
    logger.debug("Begin import...")
    date_time_format = "%Y-%m-%dT%H:%M:%S.%f"
    report = {'status': 'Success', 'message': 'The import finished successfully!'}
    count_imported_EPVs = 0
    max_finished_at = None
    max_datetime = None
    last_imported_EPV = None
    epv = []

    if not dict_grouped_keys:
        report['message'] = 'Nothing to be imported! No data found on S3 to be imported!'

    try:
        for counter, v in dict_grouped_keys.items():
            first_key = v[0]
            obj, cur_finished_at = _first_key_info(data_source, first_key)
            if obj is None:
                continue
            obj_returned = _other_key_info(data_source, other_keys=v[1:])
            obj.update(obj_returned)
            str_gremlin = GraphPopulator.create_query_string(obj)
            logger.debug("Importing " + first_key)
            logger.debug("File---- %s numbered---- %d added:" % (first_key, counter))

            # Fire the Gremlin HTTP query now.
            logger.info("Ingestion initialized for EPV - " +
                        obj.get('ecosystem') + ":" + obj.get('package') + ":" +
                        obj.get('version'))
            epv.append(obj.get('ecosystem') + ":" + obj.get('package') + ":" +
                       obj.get('version'))
            payload = {'gremlin': str_gremlin}
            response = requests.post(config.GREMLIN_SERVER_URL_REST,
                                     data=json.dumps(payload))
            resp = response.json()

            if resp['status']['code'] == 200:
                count_imported_EPVs += 1
                last_imported_EPV = first_key
                max_finished_at = _set_max_finished_at(max_finished_at, cur_finished_at,
                                                       max_datetime, date_time_format)
                max_datetime = datetime.strptime(max_finished_at, date_time_format)
        report['epv'] = epv
    except Exception as e:
        msg = _get_exception_msg("The import failed", e)
        report['status'] = 'Failure'
        report['message'] = msg

    report['count_imported_EPVs'] = count_imported_EPVs
    report['last_imported_EPV'] = last_imported_EPV
    report['max_finished_at'] = max_finished_at
    return report
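# For reference, both importers above iterate dict_grouped_keys as a mapping
# from a numeric counter to a list of S3 keys, where the first key in each
# group carries the core EPV data and the remaining keys are merged into it;
# the S3 key values in this illustrative shape are made up.
EXAMPLE_GROUPED_KEYS = {
    1: ['npm/serve-static/1.7.1.json',
        'npm/serve-static/1.7.1/metadata.json']
}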
def test_construct_version_query_1():
    """Test the GraphPopulator.construct_version_query() class method."""
    input_json = {
        "version": "0.4.59",
        "package": "access_points",
        "ecosystem": "pypi",
        "analyses": {
            "metadata": {}
        }
    }
    q = GraphPopulator.construct_version_query(input_json)
    logger.info(q)
    assert "access_points" in q
    assert "0.4.59" in q
    assert "pypi" in q
    assert "addVertex" in q
    assert "drop()" not in q
def test_construct_version_query_3():
    """Test the GraphPopulator.construct_version_query() class method."""
    input_json = {
        "version": "0.4.59",
        "package": "access_points",
        "ecosystem": "pypi",
        "analyses": {
            "metadata": {"details": [
                {"description": "Some description here",
                 "declared_license": "GPL \nv2.0"}
            ]}
        }
    }
    q = GraphPopulator.construct_version_query(input_json)
    logger.info(q)
    assert "access_points" in q
    assert "0.4.59" in q
    assert "pypi" in q
def test_construct_version_query_2():
    """Test the GraphPopulator.construct_version_query() class method."""
    input_json = {
        "version": "0.4.59",
        "package": "access_points",
        "ecosystem": "pypi",
        "analyses": {
            "metadata": {"details": [
                {"description": "Some description here",
                 "declared_licenses": ["GPL v3", "APL v2.0"]}
            ]},
            "github_details": {},
            "libraries_io": {},
            "source_licenses": {},
            "security_issues": {
                "details": [
                    {"id": "CEV-007", "cvss": {"score": 9.7}}
                ]
            },
            "code_metrics": {"details": {"languages": [{
                "metrics": {
                    "functions": {
                        "average_cyclomatic_complexity": 3
                    }
                }
            }]}},
            "redhat_downstream": {
                "summary": {
                    "all_rhsm_product_names": ["access_points_rh"]
                }
            }
        }
    }
    q = GraphPopulator.construct_version_query(input_json)
    logger.info(q)
    assert "access_points" in q
    assert "0.4.59" in q
    assert "pypi" in q
def create_graph_nodes(list_epv):
    """Create blank graph nodes given an EPV."""
    count_blank_epvs_created = 0
    success_epvs = []
    failure_epvs = []
    for item in list_epv:
        str_gremlin = GraphPopulator.construct_graph_nodes(item)
        epv = item.get('ecosystem') + ":" + item.get('name') + ":" + item.get('version')
        if str_gremlin:
            payload = {'gremlin': str_gremlin}
            print(json.dumps(payload))
            try:
                result = requests.post(config.GREMLIN_SERVER_URL_REST,
                                       data=json.dumps(payload),
                                       timeout=30)
                resp = result.json()
                print(json.dumps(resp))
                if resp['status']['code'] == 200:
                    count_blank_epvs_created += 1
                    success_epvs.append(epv)
            except Exception as e:  # pragma: no cover
                logger.error(e)
                failure_json = {epv: e}
                failure_epvs.append(failure_json)
    status = "Success"
    if count_blank_epvs_created == 0:
        status = "Failure"
    response = {
        "epv_nodes_created": count_blank_epvs_created,
        "success_list": success_epvs,
        "failure_list": failure_epvs,
        "status": status
    }
    return response
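# Usage sketch (hypothetical) for create_graph_nodes(); the input dicts mirror
# the keys the function reads ('ecosystem', 'name', 'version'), with values
# borrowed from the test data elsewhere in this section.
def example_create_graph_nodes():
    list_epv = [{'ecosystem': 'npm', 'name': 'serve-static', 'version': '1.7.1'}]
    response = create_graph_nodes(list_epv)
    return response['status'], response['epv_nodes_created']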
def handle_properties(ecosystem, package, version):
    """
    Handle (update/delete) properties associated with the given EPV.

    An update replaces properties with the same name.
    Expects a JSON payload in the following format:

    {
        "properties": [
            {
                "name": "cve_ids",
                "value": "CVE-3005-0001:10"
            }
        ]
    }

    "value" can be omitted in DELETE requests.

    :param ecosystem: str, ecosystem
    :param package: str, package name
    :param version: str, package version
    :return: 200 on success, 400 on failure
    """
    # TODO: reduce cyclomatic complexity
    input_json = request.get_json()
    properties = input_json.get('properties')
    error = flask.jsonify({'error': 'invalid input'})
    if not properties:
        return error, 400
    input_json = {
        k: GraphPopulator.sanitize_text_for_query(str(v))
        for k, v in input_json.items()
    }
    if request.method == 'PUT':
        if [x for x in properties if not x.get('name') or x.get('value') is None]:
            return error, 400

    log_msg = '[{m}] Updating properties for {e}/{p}/{v} with payload {b}'
    current_app.logger.info(log_msg.format(m=request.method, e=ecosystem,
                                           p=package, v=version, b=input_json))

    query_statement = "g.V()" \
                      ".has('pecosystem','{ecosystem}')" \
                      ".has('pname','{pkg_name}')" \
                      ".has('version','{version}')".format(ecosystem=ecosystem,
                                                           pkg_name=package,
                                                           version=version)

    statement = ''
    if request.method in ('DELETE', 'PUT'):
        # Build the "delete" part of the statement.
        drop_str = ""
        for prop in properties:
            drop_str += query_statement
            drop_str += ".properties('{property}').drop().iterate();".format(
                property=prop['name'])
        statement += drop_str
    if request.method == 'PUT':
        # Build the "add" part of the statement.
        add_str = ""
        for prop in properties:
            add_str += ".property('{property}','{value}')".format(
                property=prop['name'], value=prop['value'])
        statement += query_statement + add_str + ';'

    current_app.logger.info('Gremlin statement: {s}'.format(s=statement))
    success, response_json = BayesianGraph.execute(statement)
    if not success:
        current_app.logger.error("Failed to update properties for {e}/{p}/{v}".format(
            e=ecosystem, p=package, v=version))
        return flask.jsonify(response_json), 400
    return flask.jsonify(response_json), 200
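# Client-side sketch for the handle_properties endpoint. The URL path below is
# an assumption (the route decorator is not shown in this section); the JSON
# payload matches the format documented in the docstring above.
def example_update_properties(base_url):
    url = base_url + '/api/v1/pypi/access_points/0.4.59/properties'  # assumed route
    payload = {'properties': [{'name': 'cve_ids', 'value': 'CVE-3005-0001:10'}]}
    # PUT replaces properties with the same name; DELETE may omit "value".
    return requests.put(url, json=payload)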
def test_full_import_and_incr_update():
    """Test a full import followed by two incremental updates."""
    data_dir = 'test/data'

    # Make sure that the target graph has no metadata yet.
    graph_meta = GraphPopulator.get_metadata()
    assert graph_meta is None

    # Full import: insert all the EPVs from the given data source.
    src_dir = os.path.join(data_dir, 'full_import')
    report = import_bulk(data_source=LocalFileSystem(src_dir=src_dir),
                         book_keeper=None)
    assert report.get('status') == 'Success'
    assert report.get('count_imported_EPVs') == 1
    assert report.get('last_imported_EPV') == 'npm/serve-static/1.7.1.json'
    assert report.get('max_finished_at') == '2017-02-08T12:26:51.962609'

    graph_meta = GraphPopulator.get_metadata()
    assert graph_meta is not None
    assert graph_meta.last_incr_update_ts == '2017-02-08T12:26:51.962609'

    # Incremental update 1:
    # mimic a scenario where a new EPV was inserted recently: npm/send/0.10.1
    src_dir = os.path.join(data_dir, 'incr_update1')
    book_keeping_json = os.path.join(data_dir, 'book_keeping1.json')
    report = import_bulk(
        data_source=LocalFileSystem(src_dir=src_dir),
        book_keeper=JsonBookKeeper(json_file_name=book_keeping_json))
    assert report.get('status') == 'Success'
    assert report.get('count_imported_EPVs') == 1
    assert report.get('last_imported_EPV') == 'npm/send/0.10.1.json'
    assert report.get('max_finished_at') == '2017-02-22T15:34:59.469864'

    graph_meta = GraphPopulator.get_metadata()
    assert graph_meta is not None
    assert graph_meta.last_incr_update_ts == '2017-02-22T15:34:59.469864'

    # Incremental update 2:
    # mimic a scenario where a new EPV was inserted recently: npm/parseurl/1.3.1
    # and an already existing EPV was updated recently: npm/serve-static/1.7.1
    src_dir = os.path.join(data_dir, 'incr_update2')
    book_keeping_json = os.path.join(data_dir, 'book_keeping2.json')
    report = import_bulk(
        data_source=LocalFileSystem(src_dir=src_dir),
        book_keeper=JsonBookKeeper(json_file_name=book_keeping_json))
    assert report.get('status') == 'Success'
    assert report.get('count_imported_EPVs') == 2
    assert report.get('last_imported_EPV') == 'npm/serve-static/1.7.1.json'
    assert report.get('max_finished_at') == '2017-02-22T15:35:51.962609'

    graph_meta = GraphPopulator.get_metadata()
    assert graph_meta is not None
    assert graph_meta.last_incr_update_ts == '2017-02-22T15:35:51.962609'

    # Cleanup
    GraphMetaData.delete_all()
    assert GraphMetaData.count() == 0

    LicenseDetails.delete_all()
    assert LicenseDetails.count() == 0

    Author.delete_all()
    assert Author.count() == 0

    CodeMetricsResult.delete_all()
    assert CodeMetricsResult.count() == 0

    CodeMetricsLanguage.delete_all()
    assert CodeMetricsLanguage.count() == 0

    GithubResult.delete_all()
    assert GithubResult.count() == 0

    Contributor.delete_all()
    assert Contributor.count() == 0

    Package.delete_all()
    assert Package.count() == 0

    Version.delete_all()
    assert Version.count() == 0
def update_graph_metadata(input_json):
    """Update the metadata stored in the graph."""
    GraphPopulator.update_metadata(input_json)
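# Usage sketch: the metadata dict mirrors the one import_bulk() builds above,
# with timestamp and EPV values borrowed from the full-import test in this
# section.
def example_update_graph_metadata():
    update_graph_metadata({
        'last_incremental_update_timestamp': '2017-02-08T12:26:51.962609',
        'last_imported_epv': 'npm/serve-static/1.7.1.json'
    })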
def test_construct_package_query():
    """Test the GraphPopulator.construct_package_query() class method."""
    input_json = {
        "version": "0.4.59",
        "package": "access_points",
        "ecosystem": "pypi",
        "analyses": {
            "metadata": {"details": [
                {"description": "Some description here"}
            ]},
            "github_details": {},
            "libraries_io": {"schema": {"version": "2-0-0"},
                             "details": {"releases": {
                                 "count": 2,
                                 "recent": [{"published_at": "2016-09-09"}],
                                 "published_at": "2016-09-09"
                             }}}
        }
    }
    str_package, prp_package = GraphPopulator.construct_package_query(input_json)
    logger.info("%s %s", str_package, prp_package)
    assert "access_points" in str_package
    assert "pypi" in str_package

    input_json = {
        "version": "0.4.59",
        "package": "access_points",
        "ecosystem": "pypi",
        "analyses": {
            "metadata": {"details": [
                {"description": "Some description here"}
            ]},
            "github_details": {},
            "libraries_io": {}
        }
    }
    str_package, prp_package = GraphPopulator.construct_package_query(input_json)
    logger.info("%s %s", str_package, prp_package)
    assert "access_points" in str_package
    assert "pypi" in str_package

    input_json = {
        "version": "0.4.59",
        "package": "access_points",
        "ecosystem": "pypi",
        "analyses": {
            "metadata": {"details": [
                {"description": "Some description here"}
            ]},
            "github_details": {},
            "libraries_io": {"schema": {"version": "1-0-0"},
                             "details": {"releases": {
                                 "count": 2,
                                 "recent": [{"published_at": "2016-09-09"}],
                                 "latest": {
                                     "recent": {
                                         "0.4.59": "2016-09-09"
                                     }
                                 },
                                 "published_at": "2016-09-09"
                             }}}
        }
    }
    str_package, prp_package = GraphPopulator.construct_package_query(input_json)
    logger.info("%s %s", str_package, prp_package)
    assert "access_points" in str_package
    assert "pypi" in str_package
def _import_keys_from_s3_http(data_source, epv_list):
    """Import the given list of EPVs from S3 and sync them to the graph."""
    # TODO: reduce cyclomatic complexity
    logger.debug("Begin import...")
    report = {
        'status': 'Success',
        'message': 'The import finished successfully!'
    }
    count_imported_EPVs = 0
    last_imported_EPV = None
    epv = []
    for epv_key in epv_list:
        for key, contents in epv_key.items():
            if len(contents.get('pkg_list_keys')) == 0 and len(
                    contents.get('ver_list_keys')) == 0:
                report['message'] = 'Nothing to be imported! No data found on S3 to be imported!'
                continue
            pkg_ecosystem = contents.get('ecosystem')
            pkg_name = contents.get('package')
            pkg_version = contents.get('version') or ''
            pkg_source = contents.get('source_repo', pkg_ecosystem)
            obj = {
                'ecosystem': pkg_ecosystem,
                'package': pkg_name,
                'version': pkg_version,
                'source_repo': pkg_source
            }
            try:
                # Check other version-level information and add it to the common object.
                if len(contents.get('ver_list_keys')) > 0:
                    first_key = contents['ver_key_prefix'] + '.json'
                    first_obj = _first_key_info(data_source, first_key,
                                                config.AWS_EPV_BUCKET)
                    obj.update(first_obj)
                    ver_obj = _other_key_info(data_source,
                                              contents.get('ver_list_keys'),
                                              config.AWS_EPV_BUCKET)
                    if 'analyses' in obj:
                        obj.get('analyses', {}).update(ver_obj['analyses'])
                    else:
                        obj.update(ver_obj)

                # Check package-related information and add it to the package object.
                if len(contents.get('pkg_list_keys')) > 0:
                    pkg_obj = _other_key_info(data_source,
                                              contents.get('pkg_list_keys'),
                                              config.AWS_PKG_BUCKET)
                    if 'analyses' in obj:
                        obj.get('analyses', {}).update(pkg_obj['analyses'])
                    else:
                        obj.update(pkg_obj)

                # Create the Gremlin query.
                str_gremlin = GraphPopulator.create_query_string(obj)

                if str_gremlin:
                    # Fire the Gremlin HTTP query now.
                    epv_full = pkg_ecosystem + ":" + pkg_name + ":" + pkg_version
                    logger.info("Ingestion initialized for EPV - %s" % epv_full)
                    epv.append(epv_full)
                    payload = {'gremlin': str_gremlin}
                    response = requests.post(config.GREMLIN_SERVER_URL_REST,
                                             data=json.dumps(payload),
                                             timeout=30)
                    resp = response.json()

                    if resp['status']['code'] == 200:
                        count_imported_EPVs += 1
                        last_imported_EPV = (obj.get('ecosystem') + ":" +
                                             obj.get('package') + ":" +
                                             obj.get('version'))
                        # Update the first key with the graph-synced tag.
                        logger.info("Mark as synced in RDS %s" % last_imported_EPV)
                        if not config.AWS_S3_IS_LOCAL:  # pragma: no cover
                            PostgresHandler().mark_epv_synced(
                                obj.get('ecosystem'),
                                obj.get('package'),
                                obj.get('version'))
            except Exception as e:  # pragma: no cover
                logger.error(e)
                msg = _get_exception_msg("The import failed", e)
                report['status'] = 'Failure'
                report['message'] = msg
                report['epv'] = epv_key

    report['epv'] = epv_list
    report['count_imported_EPVs'] = count_imported_EPVs
    if count_imported_EPVs == 0 and report['status'] == 'Success':
        report['message'] = 'Nothing to be synced to Graph!'
    report['last_imported_EPV'] = last_imported_EPV
    return report
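# For reference, _import_keys_from_s3_http() consumes a list of single-entry
# dicts (the dict key itself is not used by the function) whose values carry
# the fields read above; the S3 key values in this illustrative shape are
# made up.
EXAMPLE_EPV_LIST = [
    {
        'npm:serve-static:1.7.1': {
            'ecosystem': 'npm',
            'package': 'serve-static',
            'version': '1.7.1',
            'source_repo': 'npm',
            'ver_key_prefix': 'npm/serve-static/1.7.1',
            'ver_list_keys': ['npm/serve-static/1.7.1/metadata.json'],
            'pkg_list_keys': ['npm/serve-static/github_details.json']
        }
    }
]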