def parse_activity(new_identifiers, old_xml, resource):
    """Parse all activities out of ``resource.document`` and stage them
    on the db session, skipping identifiers already seen.

    :param new_identifiers: set of IATI identifiers seen so far in this
        import run; mutated in place as activities are accepted.
    :param old_xml: dict mapping iati_identifier ->
        (last_change_datetime, hash(raw_xml)) from the previous import.
    :param resource: the Resource whose document is being parsed.
    """
    for activity in parse.document(resource.document, resource):
        activity.resource = resource
        if activity.iati_identifier not in new_identifiers:
            new_identifiers.add(activity.iati_identifier)
            try:
                # If the raw XML hash is unchanged from the previous
                # import, keep the old change timestamp; otherwise the
                # activity counts as changed now.
                if hash(activity.raw_xml) == old_xml[activity.iati_identifier][1]:
                    activity.last_change_datetime = old_xml[activity.iati_identifier][0]
                else:
                    # NOTE(review): naive local datetime.now() here, while
                    # other importer code uses utcnow() -- confirm intended.
                    activity.last_change_datetime = datetime.datetime.now()
            except KeyError:
                # Identifier was not in the previous import: new activity.
                activity.last_change_datetime = datetime.datetime.now()
            db.session.add(activity)
            check_for_duplicates([activity])
        else:
            # Same identifier appears twice in one document: keep the
            # first occurrence and log the duplicate.
            parse.log.warn(
                _("Duplicate identifier {0} in same resource document".format(
                    activity.iati_identifier),
                    logger='activity_importer',
                    dataset=resource.dataset_id,
                    resource=resource.url),
                exc_info=''
            )
    db.session.flush()
    db.session.commit()
def parse_activity(new_identifiers, old_xml, resource):
    """Parse all activities out of ``resource.document`` and stage them
    on the db session, skipping identifiers already seen.

    :param new_identifiers: set of IATI identifiers seen so far in this
        import run; mutated in place as activities are accepted.
    :param old_xml: dict mapping iati_identifier ->
        (last_change_datetime, hash(raw_xml)) from the previous import.
    :param resource: the Resource whose document is being parsed.
    """
    for activity in parse.document(resource.document, resource):
        activity.resource = resource
        if activity.iati_identifier not in new_identifiers:
            new_identifiers.add(activity.iati_identifier)
            try:
                # Unchanged raw XML keeps its previous change timestamp;
                # changed XML is stamped with the current time.
                if hash(activity.raw_xml) == old_xml[
                        activity.iati_identifier][1]:
                    activity.last_change_datetime = old_xml[
                        activity.iati_identifier][0]
                else:
                    activity.last_change_datetime = datetime.datetime.now()
            except KeyError:
                # Identifier absent from the previous import: new activity.
                activity.last_change_datetime = datetime.datetime.now()
            db.session.add(activity)
            check_for_duplicates([activity])
        else:
            # Duplicate identifier within one document: first wins, log it.
            parse.log.warn(_(
                "Duplicate identifier {0} in same resource document".format(
                    activity.iati_identifier),
                logger='activity_importer',
                dataset=resource.dataset_id,
                resource=resource.url),
                exc_info='')
    db.session.flush()
    db.session.commit()
def test_policy_markers(self):
    """Policy markers in CD.xml parse with the expected count and codes."""
    # list() over a pass-through comprehension; assertEqual over the
    # deprecated assertEquals alias.
    activities = list(parse.document(fixture_filename("CD.xml")))
    self.assertEqual(8, len(activities[1].policy_markers))
    self.assertEqual(cl.PolicyMarker.gender_equality,
                     activities[1].policy_markers[0].code)
    self.assertEqual(cl.PolicyMarker.aid_to_environment,
                     activities[1].policy_markers[1].code)
    self.assertEqual(
        cl.PolicyMarker.participatory_developmentgood_governance,
        activities[1].policy_markers[2].code)
    self.assertEqual(cl.PolicyMarker.trade_development,
                     activities[1].policy_markers[3].code)
def parse_resource(resource):
    """Re-import all activities for ``resource``: delete the previous
    rows for its URL, parse the document in batches, and record which
    identifiers appeared or disappeared.

    Returns the (mutated) resource.
    """
    db.session.add(resource)
    now = datetime.datetime.utcnow()
    current = Activity.query.filter_by(resource_url=resource.url)
    # Identifiers present before this import, used later to detect deletions.
    current_identifiers = set([i.iati_identifier for i in current.all()])
    # Snapshot of the previous import keyed by identifier:
    # iati_identifier -> (last_change_datetime, hash(raw_xml)).
    old_xml = dict([(i[0], (i[1], hash(i[2]))) for i in db.session.query(
        Activity.iati_identifier,
        Activity.last_change_datetime,
        Activity.raw_xml).filter_by(resource_url=resource.url)])
    db.session.query(Activity).filter_by(resource_url=resource.url).delete()
    new_identifiers = set()
    activities = []
    # NOTE(review): unlike the duplicate-guarded variant of this function,
    # this version does not skip repeated identifiers within one document.
    for activity in parse.document(resource.document, resource):
        activity.resource = resource
        new_identifiers.add(activity.iati_identifier)
        try:
            # Unchanged raw XML keeps its previous change timestamp.
            if hash(activity.raw_xml) == old_xml[activity.iati_identifier][1]:
                activity.last_change_datetime = old_xml[
                    activity.iati_identifier][0]
            else:
                activity.last_change_datetime = datetime.datetime.now()
        except KeyError:
            # Identifier absent from the previous import: new activity.
            activity.last_change_datetime = datetime.datetime.now()
        activities.append(activity)
        db.session.add(activity)
        # Commit in batches to bound memory / transaction size.
        if len(db.session.new) > 50:
            activities = check_for_duplicates(activities)
            db.session.commit()
            activities = []
    # Flush the final partial batch.
    db.session.add_all(activities)
    activities = check_for_duplicates(activities)
    db.session.commit()
    resource.version = parse.document_metadata(resource.document)
    # Add any identifiers that are no longer present to deleted_activity table.
    diff = current_identifiers - new_identifiers
    now = datetime.datetime.utcnow()
    deleted = [
        DeletedActivity(iati_identifier=deleted_activity, deletion_date=now)
        for deleted_activity in diff
    ]
    if deleted:
        db.session.add_all(deleted)
    # Remove any new identifiers from the deleted_activity table.
    if new_identifiers:
        db.session.query(DeletedActivity)\
            .filter(DeletedActivity.iati_identifier.in_(new_identifiers))\
            .delete(synchronize_session="fetch")
    log.info("Parsed %d activities from %s",
             len(resource.activities), resource.url)
    resource.last_parsed = now
    return resource  #, new_identifiers
def parse_resource(resource):
    """Replace every stored activity for ``resource`` with a fresh parse
    of its document, stamp the parse time, and return the resource."""
    db.session.add(resource)
    # Clear out the rows from any previous import of this URL first.
    Activity.query.filter_by(resource_url=resource.url).delete()
    parsed = [activity for activity in
              parse.document(resource.document, resource)]
    resource.activities = parsed
    log.info(
        "Parsed %d activities from %s",
        len(parsed), resource.url)
    resource.last_parsed = datetime.datetime.utcnow()
    return resource
def test_dates(self):
    """Planned/actual start and end dates parse from CD.xml."""
    # assertEqual over the deprecated assertEquals alias.
    activities = list(parse.document(fixture_filename("CD.xml")))
    self.assertEqual(datetime.date(2004, 1, 1), activities[0].start_planned)
    self.assertEqual(datetime.date(2004, 1, 1), activities[0].start_actual)
    self.assertEqual(datetime.date(2010, 12, 31), activities[0].end_planned)
    self.assertEqual(datetime.date(2010, 12, 31), activities[0].end_actual)
def test_missing_id(self):
    """An activity without an iati-identifier is not parsed at all."""
    # assertEqual over the deprecated assertEquals alias.
    activities = parse.document(ET.XML(
        u'''
        <iati-activities>
          <iati-activity default-currency="GBP" xml:lang="en">
            <reporting-org ref="GB-2" type="15">CDC Group plc</reporting-org>
            <activity-status code="2">Implementation</activity-status>
          </iati-activity>
        </iati-activities>
        '''))
    self.assertEqual(0, len(list(activities)))
def test_missing_reporting_org(self):
    """An activity with no reporting-org still parses."""
    # assertEqual over the deprecated assertEquals alias.
    activities = list(parse.document(ET.XML(
        u'''
        <iati-activities>
          <iati-activity default-currency="GBP" xml:lang="en">
            <iati-identifier>AAA-AA</iati-identifier>
            <activity-status code="2">Implementation</activity-status>
          </iati-activity>
        </iati-activities>
        ''')))
    self.assertEqual(1, len(activities))
    self.assertEqual(u"AAA-AA", activities[0].iati_identifier)
def test_missing_reporting_org(self):
    """An activity with no reporting-org still parses."""
    # assertEqual over the deprecated assertEquals alias.
    activities = list(
        parse.document(
            ET.XML(u'''
            <iati-activities>
              <iati-activity default-currency="GBP" xml:lang="en">
                <iati-identifier>AAA-AA</iati-identifier>
                <activity-status code="2">Implementation</activity-status>
              </iati-activity>
            </iati-activities>
            ''')))
    self.assertEqual(1, len(activities))
    self.assertEqual(u"AAA-AA", activities[0].iati_identifier)
def parse_file(filenames, verbose=False, fail_xml=False, fail_spec=False): for filename in filenames: if verbose: print "Parsing", filename try: db.session.add_all(parse.document(filename)) db.session.commit() except parse.ParserError, exc: logging.error("Could not parse file %r", filename) db.session.rollback() if isinstance(exc, parse.XMLError) and fail_xml: raise if isinstance(exc, parse.SpecError) and fail_spec: raise
def test_save_complex_example(self):
    """The complex DFID example parses and commits without error."""
    parsed = parse.document(
        fixture("complex_example_dfid.xml", encoding=None))
    db.session.add_all(parsed)
    db.session.commit()
def test_default_language(self):
    """The activity's default language is read from the fixture."""
    # list() over a pass-through comprehension; assertEqual over assertEquals.
    activities = list(parse.document(fixture_filename("default_currency.xml")))
    self.assertEqual(cl.Language.english, activities[0].default_language)
def test_default_tied_status(self):
    """default-tied-status parses to the untied codelist value."""
    # list() over a pass-through comprehension; assertEqual over assertEquals.
    activities = list(parse.document(fixture_filename("CD.xml")))
    self.assertEqual(cl.TiedStatus.untied,
                     activities[1].default_tied_status)
def test_default_flow_type(self):
    """default-flow-type parses to the ODA codelist value."""
    # list() over a pass-through comprehension; assertEqual over assertEquals.
    activities = list(parse.document(fixture_filename("CD.xml")))
    self.assertEqual(cl.FlowType.oda, activities[1].default_flow_type)
def test_collaboration_type(self):
    """collaboration-type parses to the bilateral codelist value."""
    # list() over a pass-through comprehension; assertEqual over assertEquals.
    activities = list(parse.document(fixture_filename("CD.xml")))
    self.assertEqual(cl.CollaborationType.bilateral,
                     activities[1].collaboration_type)
def test_related_activity(self):
    """related-activity elements parse with count and ref intact."""
    # list() over a pass-through comprehension; assertEqual over assertEquals.
    activities = list(parse.document(fixture_filename("CD.xml")))
    self.assertEqual(4, len(activities[0].related_activities))
    self.assertEqual("GB-1-105838-101",
                     activities[0].related_activities[0].ref)
def test_sector_percentage_count(self):
    """The first activity in the DFID example has five sector percentages."""
    # assertEqual over the deprecated assertEquals alias.
    act = next(parse.document(
        fixture("complex_example_dfid.xml", encoding=None)))
    self.assertEqual(5, len(act.sector_percentages))
def test_default_hierarchy(self):
    """hierarchy parses to the parent related-activity-type value."""
    # list() over a pass-through comprehension; assertEqual over assertEquals.
    activities = list(parse.document(fixture_filename("default_currency.xml")))
    self.assertEqual(cl.RelatedActivityType.parent, activities[0].hierarchy)
def test_default_aid_type(self):
    """default-aid-type parses to project-type interventions."""
    # list() over a pass-through comprehension; assertEqual over assertEquals.
    activities = list(parse.document(fixture_filename("CD.xml")))
    self.assertEqual(cl.AidType.projecttype_interventions,
                     activities[1].default_aid_type)
def test_default_finance_type(self):
    """default-finance-type parses to the grant codelist value."""
    # list() over a pass-through comprehension; assertEqual over assertEquals.
    activities = list(parse.document(fixture_filename("CD.xml")))
    self.assertEqual(
        cl.FinanceType.aid_grant_excluding_debt_reorganisation,
        activities[1].default_finance_type)
def test_different_roles(self):
    """Committing activities where the same orgs hold different roles
    does not raise."""
    parsed = parse.document(
        fixture_filename("same_orgs_different_roles.xml"))
    db.session.add_all(parsed)
    db.session.commit()
def test_no_start_actual(self):
    """A missing actual start date parses as None."""
    # assertIsNone gives a clearer failure message than
    # the deprecated assertEquals(None, ...).
    activities = parse.document(fixture_filename("missing_dates.xml"))
    act = {a.iati_identifier: a for a in activities}
    self.assertIsNone(act[u"GB-CHC-272465-680"].start_actual)
def test_save_repeated_participation(self):
    """Committing activities with repeated participation does not raise."""
    parsed = parse.document(
        fixture_filename("repeated_participation.xml"))
    db.session.add_all(parsed)
    db.session.commit()
def test_activity_status(self):
    """activity-status parses to the implementation codelist value."""
    # list() over a pass-through comprehension; assertEqual over assertEquals.
    activities = list(parse.document(fixture_filename("default_currency.xml")))
    self.assertEqual(cl.ActivityStatus.implementation,
                     activities[0].activity_status)
def test_big_values(self):
    """Committing activities with very large values does not raise."""
    parsed = parse.document(fixture_filename("big_value.xml"))
    db.session.add_all(parsed)
    db.session.commit()
def setUp(self):
    """Parse the annotated 2.01 example once; expose all activities and
    the first one as fixtures for each test."""
    super(TestParse2xxActivity, self).setUp()
    parsed = parse.document(
        fixture_filename("2.01-example-annotated.xml"))
    self.activities = [activity for activity in parsed]
    self.act = self.activities[0]
def test_save_parsed_201(self):
    """Committing the parsed 2.01 annotated example does not raise."""
    parsed = parse.document(
        fixture_filename("2.01-example-annotated.xml"))
    db.session.add_all(parsed)
    db.session.commit()
def load_fix(fix_name):
    """Parse the named fixture file and commit its activities to the db."""
    for activity in parse.document(fixture_filename(fix_name)):
        db.session.add(activity)
    db.session.commit()
def parse_resource(resource):
    """Re-import all activities for ``resource``: delete the previous
    rows for its URL, parse the document in batches (skipping duplicate
    identifiers within the document), and record which identifiers
    appeared or disappeared in the deleted_activity table.

    Returns the (mutated) resource.
    """
    db.session.add(resource)
    now = datetime.datetime.utcnow()
    current = Activity.query.filter_by(resource_url=resource.url)
    # Identifiers present before this import, used later to detect deletions.
    current_identifiers = set([
        i.iati_identifier for i in current.all()
    ])
    # Snapshot of the previous import keyed by identifier:
    # iati_identifier -> (last_change_datetime, hash(raw_xml)).
    old_xml = dict([
        (i[0], (i[1], hash(i[2]))) for i in db.session.query(
            Activity.iati_identifier,
            Activity.last_change_datetime,
            Activity.raw_xml).filter_by(resource_url=resource.url)
    ])
    db.session.query(Activity).filter_by(resource_url=resource.url).delete()
    new_identifiers = set()
    activities = []
    for activity in parse.document(resource.document, resource):
        activity.resource = resource
        if activity.iati_identifier not in new_identifiers:
            new_identifiers.add(activity.iati_identifier)
            try:
                # Unchanged raw XML keeps its previous change timestamp;
                # changed XML is stamped with the current time.
                # NOTE(review): naive datetime.now() here vs utcnow()
                # elsewhere in this function -- confirm intended.
                if hash(activity.raw_xml) == old_xml[activity.iati_identifier][1]:
                    activity.last_change_datetime = old_xml[activity.iati_identifier][0]
                else:
                    activity.last_change_datetime = datetime.datetime.now()
            except KeyError:
                # Identifier absent from the previous import: new activity.
                activity.last_change_datetime = datetime.datetime.now()
            activities.append(activity)
            db.session.add(activity)
            # Commit in batches to bound memory / transaction size.
            if len(db.session.new) > 50:
                activities = check_for_duplicates(activities)
                db.session.commit()
                activities = []
        else:
            # Duplicate identifier within one document: first wins, log it.
            parse.log.warn(
                _("Duplicate identifier {0} in same resource document".format(
                    activity.iati_identifier),
                    logger='activity_importer',
                    dataset=resource.dataset_id,
                    resource=resource.url),
                exc_info=''
            )
    # Flush the final partial batch.
    db.session.add_all(activities)
    activities = check_for_duplicates(activities)
    db.session.commit()
    resource.version = parse.document_metadata(resource.document)
    # Add any identifiers that are no longer present to deleted_activity table.
    diff = current_identifiers - new_identifiers
    now = datetime.datetime.utcnow()
    deleted = [
        DeletedActivity(iati_identifier=deleted_activity, deletion_date=now)
        for deleted_activity in diff
    ]
    if deleted:
        db.session.add_all(deleted)
    # Remove any new identifiers from the deleted_activity table.
    if new_identifiers:
        db.session.query(DeletedActivity)\
            .filter(DeletedActivity.iati_identifier.in_(new_identifiers))\
            .delete(synchronize_session="fetch")
    log.info(
        "Parsed %d activities from %s",
        len(resource.activities), resource.url)
    resource.last_parsed = now
    return resource  #, new_identifiers