def import_bulk(data_source, book_keeper):
    """
    Imports bulk data from the given data source.
    It can perform both 'full import' as well as 'incremental update'.

    :param data_source: Data source to read input from
    :param book_keeper: Book keeper to get info about recently ingested data
    :return: None
    """
    try:
        # Get the last incremental update timestamp from the graph.
        graph_meta = GraphPopulator.get_metadata()

        # If the timestamp is unknown, the graph is not populated yet and a full import is needed.
        list_keys = []
        if graph_meta is None:
            # Collect all the files from data-source and group them by package-version.
            logger.debug("Performing full import. Fetching all objects from : " + data_source.get_source_name())
            list_keys = data_source.list_files()

        # Otherwise the timestamp is available, so perform an incremental update.
        else:
            if book_keeper is None:
                raise RuntimeError("Cannot perform incremental update without book keeper!")

            # Collect all the package-versions from the RDS table that were updated recently.
            # Note: if the RDS table is unreachable, we should still live with the S3 data.
            min_finished_at = graph_meta.last_incr_update_ts
            list_epv = book_keeper.get_recent_epv(min_finished_at)

            # Collect relevant files from data-source and group them by package-version.
            logger.debug("Performing incremental update. Fetching some objects from : " + data_source.get_source_name())
            for epv in list_epv:
                key_prefix = epv.get('ecosystem') + "/" + epv.get('name') + "/" + epv.get('version')
                list_keys.extend(data_source.list_files(prefix=key_prefix))
        # end of if graph_meta is None:

        # Import the S3 data
        dict_grouped_keys = _group_keys_by_epv(list_keys, data_source)
        report = _import_grouped_keys(data_source, dict_grouped_keys)

        # Finally, update the metadata in the graph.
        if report.get('max_finished_at') is not None:
            dict_graph_meta = {
                'last_incremental_update_timestamp': report.get('max_finished_at'),
                'last_imported_epv': report.get('last_imported_EPV')
            }
            GraphPopulator.update_metadata(dict_graph_meta)
        _log_report_msg("import_bulk()", report)

    except Exception as e:
        msg = _get_exception_msg("import_bulk() failed with error", e)
        # Chain the original exception so the root cause is preserved.
        raise RuntimeError(msg) from e

    return report
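

# A minimal sketch of the grouping step that import_bulk() relies on, assuming
# S3-style keys of the form "<ecosystem>/<name>/<version>.json" (as seen in the
# tests below), possibly alongside files under "<ecosystem>/<name>/<version>/...".
# The real _group_keys_by_epv() is not shown in this snippet and may differ;
# data_source is accepted only to mirror the real helper's signature.
from collections import defaultdict


def _group_keys_by_epv_sketch(list_keys, data_source):
    grouped = defaultdict(list)
    for key in list_keys:
        # The first three path components identify the EPV; strip a trailing
        # ".json" so "npm/serve-static/1.7.1.json" and any files under
        # "npm/serve-static/1.7.1/" land in the same group.
        parts = key.split('/')[:3]
        if parts[-1].endswith('.json'):
            parts[-1] = parts[-1][:-len('.json')]
        grouped['/'.join(parts)].append(key)
    return grouped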


def test_full_import_and_incr_update():
    data_dir = 'test/data'
    # Make sure that the target graph has no metadata yet
    graph_meta = GraphPopulator.get_metadata()
    assert graph_meta is None

    # Full import: insert all the EPVs from the given data source
    src_dir = os.path.join(data_dir, 'full_import')
    report = import_bulk(data_source=LocalFileSystem(src_dir=src_dir),
                         book_keeper=None)
    assert report.get('status') == 'Success'
    assert report.get('count_imported_EPVs') == 1
    assert report.get('last_imported_EPV') == 'npm/serve-static/1.7.1.json'
    assert report.get('max_finished_at') == '2017-02-08T12:26:51.962609'

    graph_meta = GraphPopulator.get_metadata()
    assert graph_meta is not None
    assert graph_meta.last_incr_update_ts == '2017-02-08T12:26:51.962609'

    # Incremental update 1:
    # Let us mimic a scenario where a new EPV was inserted recently: npm/send/0.10.1
    src_dir = os.path.join(data_dir, 'incr_update1')
    book_keeping_json = os.path.join(data_dir, 'book_keeping1.json')
    report = import_bulk(
        data_source=LocalFileSystem(src_dir=src_dir),
        book_keeper=JsonBookKeeper(json_file_name=book_keeping_json))
    assert report.get('status') == 'Success'
    assert report.get('count_imported_EPVs') == 1
    assert report.get('last_imported_EPV') == 'npm/send/0.10.1.json'
    assert report.get('max_finished_at') == '2017-02-22T15:34:59.469864'

    graph_meta = GraphPopulator.get_metadata()
    assert graph_meta is not None
    assert graph_meta.last_incr_update_ts == '2017-02-22T15:34:59.469864'

    # Incremental update 2:
    # Let us mimic a scenario where a new EPV was inserted recently: npm/parseurl/1.3.1
    # and also an already existing EPV was updated recently: npm/serve-static/1.7.1
    src_dir = os.path.join(data_dir, 'incr_update2')
    book_keeping_json = os.path.join(data_dir, 'book_keeping2.json')
    report = import_bulk(
        data_source=LocalFileSystem(src_dir=src_dir),
        book_keeper=JsonBookKeeper(json_file_name=book_keeping_json))
    assert report.get('status') == 'Success'
    assert report.get('count_imported_EPVs') == 2
    assert report.get('last_imported_EPV') == 'npm/serve-static/1.7.1.json'
    assert report.get('max_finished_at') == '2017-02-22T15:35:51.962609'

    graph_meta = GraphPopulator.get_metadata()
    assert graph_meta is not None
    assert graph_meta.last_incr_update_ts == '2017-02-22T15:35:51.962609'

    # Cleanup
    GraphMetaData.delete_all()
    assert GraphMetaData.count() == 0

    LicenseDetails.delete_all()
    assert LicenseDetails.count() == 0

    Author.delete_all()
    assert Author.count() == 0

    CodeMetricsResult.delete_all()
    assert CodeMetricsResult.count() == 0

    CodeMetricsLanguage.delete_all()
    assert CodeMetricsLanguage.count() == 0

    GithubResult.delete_all()
    assert GithubResult.count() == 0

    Contributor.delete_all()
    assert Contributor.count() == 0

    Package.delete_all()
    assert Package.count() == 0

    Version.delete_all()
    assert Version.count() == 0
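

# For context, a hypothetical sketch of a JSON-backed book keeper like the
# JsonBookKeeper used above. Only the 'ecosystem'/'name'/'version' keys are
# confirmed by import_bulk(); the file layout and the 'finished_at' field are
# assumptions for illustration.
import json


class JsonBookKeeperSketch:
    def __init__(self, json_file_name):
        # Assumed layout: a JSON list of EPV records, e.g.
        # [{"ecosystem": "npm", "name": "send", "version": "0.10.1",
        #   "finished_at": "2017-02-22T15:34:59.469864"}]
        with open(json_file_name) as f:
            self.records = json.load(f)

    def get_recent_epv(self, min_finished_at):
        # Return EPVs whose ingestion finished after the given timestamp;
        # ISO-8601 timestamps compare correctly as plain strings.
        return [r for r in self.records
                if r.get('finished_at', '') > min_finished_at]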