Beispiel #1
0
 def test_deleted_activity_removal(self):
     """Parsing a resource that contains an activity clears its deletion record.

     A ``DeletedActivity`` row is stored for an identifier, a resource whose
     document carries that same identifier is parsed, and the row is then
     expected to be absent from the ``DeletedActivity`` table.
     """
     identifier = "test_deleted_activity"

     tombstone = DeletedActivity(
         iati_identifier='test_deleted_activity',
         deletion_date=datetime.datetime(2000, 1, 1),
     )
     db.session.add(tombstone)
     db.session.commit()

     resource = fac.ResourceFactory.create(
         url=u"http://test",
         document="""
             <iati-activities>
               <iati-activity>
                 <iati-identifier>test_deleted_activity</iati-identifier>
                 <title>test_deleted_activity</title>
                 <reporting-org ref="GB-CHC-202918" type="21">Oxfam GB</reporting-org>
               </iati-activity>
             </iati-activities>
         """,
     )

     # Precondition: the deletion record is present before the parse runs.
     recorded_before = [
         row.iati_identifier
         for row in db.session.query(DeletedActivity).all()
     ]
     self.assertIn(identifier, recorded_before)

     crawler.parse_resource(resource)
     db.session.commit()

     # Postcondition: the parse removed the deletion record.
     recorded_after = [
         row.iati_identifier for row in DeletedActivity.query.all()
     ]
     self.assertNotIn(identifier, recorded_after)
Beispiel #2
0
def parse_resource(resource):
    """Re-parse ``resource.document`` and replace the activities stored for it.

    The activities currently stored for ``resource.url`` are deleted and
    replaced with those yielded by ``parse.document``.  An activity whose raw
    XML hash matches the previously stored one keeps its old
    ``last_change_datetime``; any other activity is stamped with the current
    time.  Identifiers that were stored before but are absent from the new
    document are recorded as ``DeletedActivity`` rows, and identifiers present
    in the new document are removed from that table.

    :param resource: the resource to parse; it is added to the session and
        its ``version`` and ``last_parsed`` attributes are updated.
    :returns: the same ``resource`` object.
    """
    db.session.add(resource)
    current = Activity.query.filter_by(resource_url=resource.url)
    current_identifiers = {i.iati_identifier for i in current.all()}

    # identifier -> (last change datetime, hash of the stored raw XML)
    old_xml = {
        identifier: (changed, hash(raw_xml))
        for identifier, changed, raw_xml in db.session.query(
            Activity.iati_identifier, Activity.last_change_datetime,
            Activity.raw_xml).filter_by(resource_url=resource.url)
    }

    db.session.query(Activity).filter_by(resource_url=resource.url).delete()
    new_identifiers = set()
    activities = []
    for activity in parse.document(resource.document, resource):
        activity.resource = resource
        new_identifiers.add(activity.iati_identifier)
        previous = old_xml.get(activity.iati_identifier)
        if previous is not None and hash(activity.raw_xml) == previous[1]:
            # Unchanged XML: keep the previously recorded change time.
            activity.last_change_datetime = previous[0]
        else:
            # NOTE(review): this is local time, while deletion_date and
            # last_parsed below use UTC -- confirm which is intended.
            activity.last_change_datetime = datetime.datetime.now()
        activities.append(activity)
        db.session.add(activity)
        # Commit in batches once more than 50 objects are pending.
        if len(db.session.new) > 50:
            # The return value was never used by this function; the call is
            # kept, presumably for its side effects -- confirm.
            check_for_duplicates(activities)
            db.session.commit()
            activities = []
    # Every activity left in the final batch was already added in the loop.
    check_for_duplicates(activities)
    db.session.commit()

    resource.version = parse.document_metadata(resource.document)

    # add any identifiers that are no longer present to deleted_activity table
    now = datetime.datetime.utcnow()
    deleted = [
        DeletedActivity(iati_identifier=identifier, deletion_date=now)
        for identifier in current_identifiers - new_identifiers
    ]
    if deleted:
        db.session.add_all(deleted)

    # remove any new identifiers from the deleted_activity table
    if new_identifiers:
        db.session.query(DeletedActivity)\
            .filter(DeletedActivity.iati_identifier.in_(new_identifiers))\
            .delete(synchronize_session="fetch")

    log.info("Parsed %d activities from %s", len(resource.activities),
             resource.url)
    resource.last_parsed = now
    return resource
Beispiel #3
0
def delete_dataset(dataset):
    """Record a dataset's activities as deleted, then delete the dataset.

    Every activity joined to a resource whose ``dataset_id`` equals
    ``dataset`` is merged into the session as a ``DeletedActivity`` and
    committed.  The ``Dataset`` rows named ``dataset`` are then bulk-deleted.

    :param dataset: value compared against ``Dataset.name`` and
        ``Resource.dataset_id``.
    :returns: whatever the bulk ``delete()`` on the dataset query returns.
    """
    dataset_query = db.session.query(Dataset).filter(Dataset.name == dataset)

    doomed_activities = (
        db.session.query(Activity)
        .filter(Activity.resource_url == Resource.url)
        .filter(Resource.dataset_id == dataset)
    )

    timestamp = datetime.datetime.now()
    for activity in doomed_activities:
        tombstone = DeletedActivity(
            iati_identifier=activity.iati_identifier,
            deletion_date=timestamp,
        )
        # merge() rather than add(), as in the original implementation.
        db.session.merge(tombstone)
    db.session.commit()

    return dataset_query.delete(synchronize_session='fetch')
Beispiel #4
0
def delete_datasets(datasets):
    """Record the activities of several datasets as deleted, then delete them.

    A ``DeletedActivity`` is created for every activity joined to a resource
    whose ``dataset_id`` is in ``datasets``; these are added and committed.
    The ``Dataset`` rows whose name is in ``datasets`` are then bulk-deleted
    and the result is logged.

    :param datasets: values matched against ``Dataset.name`` and
        ``Resource.dataset_id`` with ``in_``.
    :returns: the value returned by the bulk ``delete()``.
    """
    dataset_query = db.session.query(Dataset).filter(
        Dataset.name.in_(datasets))

    doomed_activities = (
        db.session.query(Activity)
        .filter(Activity.resource_url == Resource.url)
        .filter(Resource.dataset_id.in_(datasets))
    )

    timestamp = datetime.datetime.now()
    tombstones = []
    for activity in doomed_activities:
        tombstones.append(
            DeletedActivity(iati_identifier=activity.iati_identifier,
                            deletion_date=timestamp))
    db.session.add_all(tombstones)
    db.session.commit()

    deleted = dataset_query.delete(synchronize_session='fetch')
    log.info("Deleted {0} datasets".format(deleted))
    return deleted
Beispiel #5
0
def delete_datasets(datasets):
    """Record the activities of several datasets as deleted, then delete them.

    The matching activities are read in pages of 100 rows; a
    ``DeletedActivity`` is created for each, and all of them are added and
    committed together.  The ``Dataset`` rows whose name is in ``datasets``
    are then bulk-deleted and the result is logged.

    :param datasets: values matched against ``Dataset.name`` and
        ``Resource.dataset_id`` with ``in_``.
    :returns: the value returned by the bulk ``delete()``.
    """
    deleted_datasets = db.session.query(Dataset).filter(
        Dataset.name.in_(datasets))

    # slice() pages with LIMIT/OFFSET, which is only reliable over a fixed
    # row order; without ORDER BY, pages may repeat or skip rows.
    # NOTE(review): assumes iati_identifier is unique per activity, so the
    # ordering is total -- confirm against the model.
    activities_to_delete = db.session.query(Activity). \
        filter(Activity.resource_url == Resource.url). \
        filter(Resource.dataset_id.in_(datasets)). \
        order_by(Activity.iati_identifier)

    now = datetime.datetime.now()
    page_size = 100
    deleted_activities = []
    for offset in range(0, activities_to_delete.count(), page_size):
        # Slice the query to make sure it doesn't use up all the memory
        page = activities_to_delete.slice(offset, offset + page_size)
        deleted_activities.extend(
            DeletedActivity(iati_identifier=a.iati_identifier,
                            deletion_date=now)
            for a in page)
    db.session.add_all(deleted_activities)
    db.session.commit()
    deleted = deleted_datasets.delete(synchronize_session='fetch')
    log.info("Deleted {0} datasets".format(deleted))
    return deleted
Beispiel #6
0
def parse_resource(resource):
    """Re-parse a resource and reconcile the deleted-activity table.

    The activities stored for ``resource.url`` are snapshotted (identifier,
    last change time, hash of the UTF-8 encoded raw XML) and deleted, then
    ``parse_activity`` is called with that snapshot and an empty identifier
    set.  Afterwards, identifiers that were stored before but are not in the
    new set are recorded as ``DeletedActivity`` rows, and identifiers in the
    new set are removed from that table.

    :param resource: the resource to parse; it is added to the session and
        its ``version`` and ``last_parsed`` attributes are updated.
    :returns: the same ``resource`` object.
    """
    db.session.add(resource)
    current_identifiers = {
        stored.iati_identifier
        for stored in Activity.query.filter_by(resource_url=resource.url).all()
    }

    # Snapshot of what is stored for this url, keyed by iati-identifier:
    # (last change datetime, hash of the existing xml).
    snapshot_rows = db.session.query(
        Activity.iati_identifier,
        Activity.last_change_datetime,
        Activity.raw_xml,
    ).filter_by(resource_url=resource.url)
    old_xml = {
        row[0]: (row[1], hash(row[2].encode('utf-8')))
        for row in snapshot_rows
    }

    db.session.query(Activity).filter_by(resource_url=resource.url).delete()

    # parse_activity is expected to fill this set with the identifiers it
    # finds -- confirm against its definition.
    new_identifiers = set()
    parse_activity(new_identifiers, old_xml, resource)

    resource.version = parse.document_metadata(resource.document)

    now = datetime.datetime.utcnow()

    # Identifiers that vanished from the resource go into deleted_activity.
    vanished = current_identifiers - new_identifiers
    deleted = [
        DeletedActivity(iati_identifier=identifier, deletion_date=now)
        for identifier in vanished
    ]
    if deleted:
        db.session.add_all(deleted)

    # Identifiers that are present again must not stay marked as deleted.
    if new_identifiers:
        stale_records = db.session.query(DeletedActivity).filter(
            DeletedActivity.iati_identifier.in_(new_identifiers))
        stale_records.delete(synchronize_session="fetch")

    log.info("Parsed %d activities from %s", resource.activities.count(),
             resource.url)
    resource.last_parsed = now
    return resource