def main():

    update_markers = mongo_get_update_markers()

    # Make sure every update marker includes all of the mongo task classes
    bulk_tasks = [
        MongoCollectionIndexTask,
        MongoCollectionEventTask,
        MongoCatalogueTask,
        MongoTaxonomyTask,
        # MongoMultimediaTask,
        MongoSiteTask,
        UnpublishTask,
        MongoDeleteTask
    ]

    def _get_task_names(tasks):
        """
        We need to instantiate each task to get its family name, not just the
        class name, e.g. MongoDeleteTask => DeleteTask
        @param tasks: list of task classes
        @return: list of task family names
        """
        return [unicode(task(date=0).task_family) for task in tasks]

    full_export_date = int(config.get('keemu', 'full_export_date'))

    for date, update_marker in update_markers.iteritems():

        # If this is the full export date, MongoDeleteTask is not required
        if full_export_date and date == full_export_date:
            bulk_task_copy = list(bulk_tasks)
            bulk_task_copy.remove(MongoDeleteTask)
            bulk_task_names = _get_task_names(bulk_task_copy)
        else:
            bulk_task_names = _get_task_names(bulk_tasks)

        # Assert that for every date we have all the bulk tasks
        missing_tasks = list(set(bulk_task_names) - set(update_marker))
        assert missing_tasks == [], 'There are missing mongo tasks for date %s: %s' % (date, missing_tasks)

    # Get a list of all export files to process
    export_dates = [d for d in get_export_file_dates() if d not in update_markers.keys()]

    # Run setup_interface_logging to ensure luigi's logging is configured
    setup_interface_logging()

    sch = scheduler.CentralPlannerScheduler()

    w = BulkWorker(scheduler=sch)

    for export_date in export_dates:

        log.info('Processing date %s', export_date)
        # We only need to add MongoDeleteTask, as all the other tasks are requirements of it
        # NB: This doesn't delete anything from CKAN - if that's needed change this to DeleteTask
        w.add(MongoDeleteTask(date=export_date, force=True))
        w.run()
        w.stop()
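
The structure that main() expects back from mongo_get_update_markers() is never shown in the listing, so the sketch below illustrates the assumed shape (a dict keyed by export date whose values are lists of completed task family names) and re-runs the same completeness check in isolation. All dates and family names in it are invented for illustration.

# Illustrative stand-in for the dict assumed to come back from
# mongo_get_update_markers(); the dates and family names are invented
update_markers = {
    20170101: ['CollectionIndexTask', 'CatalogueTask', 'DeleteTask'],
    20170108: ['CollectionIndexTask'],  # an incomplete date
}

# The family names every date is expected to have (stand-in for _get_task_names())
bulk_task_names = ['CollectionIndexTask', 'CatalogueTask', 'DeleteTask']

for date, update_marker in update_markers.items():
    missing_tasks = list(set(bulk_task_names) - set(update_marker))
    # 20170108 fails here, mirroring the assert in main()
    if missing_tasks:
        print('There are missing mongo tasks for date %s: %s' % (date, missing_tasks))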
Example #2
    def ensure_export_date(self, date):
        """
        If cron fails to run for whatever reason and then reruns the next week, an export could be missed.
        So before processing this dataset, ensure that all preceding mongo exports have been processed.
        @param date: date to check
        @return: None
        """

        def filter_dates(d):
            return d < date

        # Get the export file dates and update marker dates prior to the date currently being processed
        export_file_dates = filter(filter_dates, get_export_file_dates())
        update_marker_dates = filter(filter_dates, mongo_get_update_markers().keys())
        assert export_file_dates == update_marker_dates, 'Outstanding previous export file dates need to be processed first: %s' % list(set(export_file_dates) - set(update_marker_dates))
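
A small, self-contained run of the precondition ensure_export_date() enforces. The integer dates below are invented YYYYMMDD values standing in for get_export_file_dates() and the mongo update marker keys; it prints the same message the assert would raise.

date = 20170115  # the date about to be processed

# Stand-ins for get_export_file_dates() and mongo_get_update_markers().keys()
export_file_dates = [d for d in [20170101, 20170108, 20170115] if d < date]
update_marker_dates = [d for d in [20170101] if d < date]

# 20170108 has an export file but no update marker, so the check fails
if export_file_dates != update_marker_dates:
    print('Outstanding previous export file dates need to be processed first: %s'
          % list(set(export_file_dates) - set(update_marker_dates)))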
Example #3
def get_export_file_date():
    """
    Get the oldest export file date that hasn't been run yet
    :return: the oldest unprocessed export file date, or None if all dates have run
    """

    update_markers = mongo_get_update_markers()
    completed_dates = []

    # Check that all tasks for a particular date have run correctly
    # If they have, add the date to completed_dates
    for date, tasks in update_markers.items():
        for task in MainTask.tasks:
            if task.task_family not in tasks:
                break
        else:
            # No break above: every task has an update marker for this date
            completed_dates.append(date)

    # Loop through all available export file dates, and return the
    # first one we don't have an update marker for
    export_file_dates = get_export_file_dates()

    for export_file_date in export_file_dates:
        if export_file_date not in completed_dates:
            return export_file_date
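
The loop in get_export_file_date() uses Python's for/else clause: the else body runs only when the inner loop finishes without hitting break, i.e. when every task family has an update marker for that date. The standalone sketch below shows the same pattern with invented task names and dates.

required = ['IndexTask', 'CatalogueTask', 'DeleteTask']  # invented family names
markers = {
    20170101: ['IndexTask', 'CatalogueTask', 'DeleteTask'],
    20170108: ['IndexTask'],
}

completed_dates = []
for date, tasks in markers.items():
    for name in required:
        if name not in tasks:
            break  # a task is missing, so the else clause is skipped
    else:
        completed_dates.append(date)  # runs only if no break occurred

print(completed_dates)  # [20170101]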