def update_registry(ignore_hashes):
    """
    Get all dataset metadata and update dataset data.
    """
    queue = rq.get_queue()
    datasets = fetch_dataset_list()
    print("Enqueuing %d datasets for update" % datasets.count())
    for dataset in datasets:
        queue.enqueue(update_dataset,
                      args=(dataset.name, ignore_hashes),
                      result_ttl=0)
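# "rq" in these snippets is presumably a Flask-RQ2 extension instance
# created elsewhere; the get_queue() calls match that API. A minimal
# sketch of the assumed setup (not the project's actual wiring):

def _example_rq_setup():
    from flask import Flask
    from flask_rq2 import RQ

    app = Flask(__name__)
    app.config['RQ_REDIS_URL'] = 'redis://localhost:6379/0'  # assumed URL
    return RQ(app)  # RQ(app).get_queue() returns the default rq queue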
def download_and_update_cmd(ignore_hashes):
    """
    Enqueue a download of all IATI data from
    IATI Data Dump, and then start an update.
    """
    queue = rq.get_queue()
    print("Enqueuing a download from IATI Data Dump")
    queue.enqueue(download_and_update,
                  args=(ignore_hashes, ),
                  result_ttl=0,
                  job_timeout=100000)
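# download_and_update itself is not among these snippets. A hedged sketch
# of the flow the docstring implies: download the full IATI Data Dump,
# then run the same registry update that update_registry performs.
# download_data_dump is a hypothetical name:

def _example_download_and_update(ignore_hashes):
    download_data_dump()            # hypothetical: fetch all IATI data
    update_registry(ignore_hashes)  # then enqueue per-dataset updates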
def about():
    # General status info
    count_activities = db.session.query(
        Stats.count).filter_by(label='activities').scalar()

    count_transactions = db.session.query(
        Stats.count).filter_by(label='transactions').scalar()

    count_budgets = db.session.query(
        Stats.count).filter_by(label='budgets').scalar()

    count_datasets = Dataset.query.count()

    # Check last updated times

    updated = db.session.query(
        sa.func.max(Resource.last_fetch).label('last_fetch'),
        sa.func.max(Resource.last_succ).label('last_succ'),
        sa.func.max(Resource.last_parsed).label('last_parsed')).first()
    now = datetime.now()
    # The API is considered healthy if the file was fetched (and
    # successfully fetched) less than 2 days ago and parsed less
    # than 1 day ago.
    if ((updated.last_fetch is not None) and (updated.last_succ is not None)
            and (updated.last_parsed is not None)):
        healthy = (((now - updated.last_fetch).days < 2)
                   and ((now - updated.last_succ).days < 2)
                   and ((now - updated.last_parsed).days < 1))
    else:
        healthy = False

    # Number of items on the queue
    items_on_queue = rq.get_queue().count

    return jsonify(ok=healthy,
                   status={
                       True: 'healthy',
                       False: 'unhealthy'
                   }[healthy],
                   status_data={
                       'last_fetch': updated.last_fetch,
                       'last_successful_fetch': updated.last_succ,
                       'last_parsed': updated.last_parsed
                   },
                   indexed_activities=count_activities,
                   indexed_transactions=count_transactions,
                   indexed_budgets=count_budgets,
                   num_datasets=count_datasets,
                   items_on_queue=items_on_queue)
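# A quick way to inspect the payload about() produces (the route path is
# an assumption; keys shown are the ones jsonify() is given above):

def _example_about(app):
    with app.test_client() as client:
        data = client.get('/api/about').get_json()
        # data looks like: {"ok": ..., "status": "healthy"/"unhealthy",
        #  "status_data": {...}, "indexed_activities": ...,
        #  "indexed_transactions": ..., "indexed_budgets": ...,
        #  "num_datasets": ..., "items_on_queue": ...}
        return data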
def update_cmd(ignore_hashes, dataset=None):
    """
    Add an update job to the Flask job queue: for a single dataset if one
    is named, otherwise a full registry update. See update_registry, then
    update_dataset, for the next actions.
    """
    queue = rq.get_queue()

    if dataset is not None:
        print("Enqueuing {0} for update".format(dataset))
        queue.enqueue(update_dataset,
                      args=(dataset, ignore_hashes),
                      result_ttl=0)
    else:
        print("Enqueuing a full registry update")
        queue.enqueue(update_registry, args=(ignore_hashes, ), result_ttl=0)
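# The *_cmd functions read like Flask CLI entry points. A sketch of one
# plausible registration using Flask's click-based CLI; the group name,
# command name, and option names are assumptions:

import click
from flask.cli import AppGroup

iati_cli = AppGroup('iati')

@iati_cli.command('update')
@click.option('--dataset', default=None, help='Update a single dataset.')
@click.option('--ignore-hashes', is_flag=True, default=False)
def _update_entry(dataset, ignore_hashes):
    update_cmd(ignore_hashes, dataset=dataset)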
def update_dataset(dataset_name, ignore_hashes):
    '''
    Takes the dataset name and determines whether an update is needed,
    based on whether the last successful update exists and whether the
    dataset has been fetched since its contained data was last updated.
    If ignore_hashes is True, an update is triggered regardless of
    whether there appears to be any change in the dataset hash compared
    with that stored in the database.

    :param dataset_name: name of the dataset to update
    :param ignore_hashes: if True, update regardless of stored hashes
    :return: None
    '''
    # clear up previous job queue log errors
    db.session.query(Log).filter(
        sa.and_(
            Log.logger == 'job iatilib.crawler.update_dataset',
            Log.dataset == dataset_name,
        )).delete(synchronize_session=False)
    db.session.commit()

    queue = rq.get_queue()
    dataset = Dataset.query.get(dataset_name)

    fetch_dataset_metadata(dataset)
    try:
        db.session.commit()
    except sa.exc.IntegrityError as exc:
        db.session.rollback()
        # the resource can't be added, so we should
        # give up.
        db.session.add(
            Log(dataset=dataset_name,
                resource=None,
                logger="update_dataset",
                msg="Failed to update dataset {0}, error was".format(
                    dataset_name, exc),
                level="error",
                trace=traceback.format_exc(),
                created_at=datetime.datetime.now()))
        db.session.commit()
        return

    resource = fetch_resource(dataset, ignore_hashes)
    db.session.commit()

    if resource.last_status_code == 200 and not resource.last_parsed:
        queue.enqueue(update_activities,
                      args=(dataset_name, ),
                      result_ttl=0,
                      job_timeout=100000)
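# A sketch of the hash check that ignore_hashes bypasses inside
# fetch_resource (inferred from the docstring above; the stored-hash
# attribute name and the hash algorithm are assumptions):

import hashlib

def _example_content_changed(resource, content, ignore_hashes):
    if ignore_hashes:
        return True  # update regardless of any stored hash
    new_hash = hashlib.md5(content).hexdigest()
    return new_hash != resource.last_hash  # 'last_hash' is an assumed column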
def status_cmd():
    """Show status of current jobs"""
    print("%d jobs on queue" % rq.get_queue().count)

    print(
        status_line(
            "datasets have no metadata",
            Dataset.query.filter_by(last_modified=None),
            Dataset.query,
        ))

    print(
        status_line(
            "datasets not seen in the last day",
            Dataset.query.filter(
                Dataset.last_seen < (datetime.datetime.utcnow() -
                                     datetime.timedelta(days=1))),
            Dataset.query,
        ))

    print(
        status_line(
            "resources have had no attempt to fetch",
            Resource.query.outerjoin(Dataset).filter(
                Resource.last_fetch == None),
            Resource.query,
        ))

    print(
        status_line(
            "resources not successfully fetched",
            Resource.query.outerjoin(Dataset).filter(
                Resource.last_succ == None),
            Resource.query,
        ))

    print(
        status_line(
            "resources not fetched since modification",
            Resource.query.outerjoin(Dataset).filter(
                sa.or_(Resource.last_succ == None,
                       Resource.last_succ < Dataset.last_modified)),
            Resource.query,
        ))

    print(
        status_line(
            "resources not parsed since mod",
            Resource.query.outerjoin(Dataset).filter(
                sa.or_(Resource.last_succ == None,
                       Resource.last_parsed < Dataset.last_modified)),
            Resource.query,
        ))

    print(
        status_line(
            "resources have no activites",
            db.session.query(Resource.url).outerjoin(Activity).group_by(
                Resource.url).having(
                    sa.func.count(Activity.iati_identifier) == 0),
            Resource.query))

    print("")

    total_activities = Activity.query.count()
    # an activity is out of date if it was created before the
    # resource was last parsed
    total_activities_fetched = Activity.query.join(Resource).filter(
        Activity.created < Resource.last_parsed).count()
    try:
        ratio = 1.0 * total_activities_fetched / total_activities
    except ZeroDivisionError:
        ratio = 0.0
    print("{nofetched_c}/{res_c} ({pct:6.2%}) activities out of date".format(
        nofetched_c=total_activities_fetched,
        res_c=total_activities,
        pct=ratio))
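# status_line is not defined in these snippets. A minimal sketch matching
# how it is called above (message, filtered query, total query) and the
# "{n}/{total} (pct)" format of the final print:

def status_line(msg, query, total_query):
    n = query.count()
    total = total_query.count()
    try:
        ratio = 1.0 * n / total
    except ZeroDivisionError:
        ratio = 0.0
    return "{n}/{total} ({pct:6.2%}) {msg}".format(
        n=n, total=total, pct=ratio, msg=msg)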
def empty():
    "Clear all jobs from queue"
    queue = rq.get_queue()
    queue.empty()
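# Usage note: emptying discards queued jobs without running them
# (rq's Queue.empty()). A quick check, assuming the same "rq" instance:

def _example_empty_queue():
    q = rq.get_queue()
    print("%d jobs before" % q.count)
    empty()
    print("%d jobs after" % q.count)  # expect 0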