def __init__(self):
    """Set up configuration paths, empty collections and run state.

    Every collection starts empty and is filled while the IATI XML files
    are processed.
    """
    # Locations read from the project settings.
    self.undp_export = settings.UNDP_EXPORT
    self.api_path = settings.API_PATH

    # Collections, created in the same order as before.
    self.projects = Projects()
    self.projectsummaries = ProjectSummaries()
    self.outputs = Outputs()
    self.subnationals = Subnationals()
    self.units = Units()
    self.crsindex = CrsIndex()
    self.donorindex = DonorIndex()
    self.countrydonorindex = CountryDonorIndex()
    self.topdonor_gross = TopDonorGrossIndex()
    self.topdonor_local = TopDonorLocalIndex()
    self.donor_ids = DonorIDs()
    self.region_index = RegionIndex()
    self.core_donors = CoreDonors()
    self.operating_unit_index = OperatingUnitIndex()

    # Lazily loaded CSV data.
    self.geo = None
    self.country_donors = None

    # Fiscal years seen so far. `_years` must exist before `years` is
    # assigned.
    self._years = set()
    # Adding 2010 because the xml files are starting from 2011 but the
    # legacy site expects to see 2010
    self.years = 2010
class ProjectsController(Controller):
    """Process UNDP IATI XML exports and write the JSON/JS API files.

    Main methods:
        generate - parses every annual XML file and writes all outputs

    Helper methods such as ``get_filenames``, ``get_and_sort``,
    ``get_and_list`` and ``log`` are not defined here; they are expected to
    come from ``Controller``.
    """

    # Bureau codes that are reported as regions of their own; every other
    # bureau is reported under the synthetic 'global' region.
    _REGION_BUREAUS = ('PAPP', 'RBA', 'RBAP', 'RBAS', 'RBEC', 'RBLAC')

    def __init__(self):
        """Create the empty collections and read paths from settings."""
        self.undp_export = settings.UNDP_EXPORT
        self.projects = Projects()
        self.projectsummaries = ProjectSummaries()
        self.outputs = Outputs()
        self.subnationals = Subnationals()
        self.units = Units()
        self.crsindex = CrsIndex()
        self.donorindex = DonorIndex()
        self.countrydonorindex = CountryDonorIndex()
        self.topdonor_gross = TopDonorGrossIndex()
        self.topdonor_local = TopDonorLocalIndex()
        self.donor_ids = DonorIDs()
        self.region_index = RegionIndex()
        self.core_donors = CoreDonors()
        self.operating_unit_index = OperatingUnitIndex()
        self.api_path = settings.API_PATH
        self._years = set()
        # Filled by _generate_hdi(); _populate_operating_unit_index() needs it.
        self.geo = None
        # Adding 2010 because the xml files are starting from 2011 but the
        # legacy site expects to see 2010
        self.years = 2010
        # Cache for country_donors_updated.csv, see _populate_donor_index().
        self.country_donors = None

    @property
    def years(self):
        """Return the set of fiscal years collected so far."""
        return self._years

    @years.setter
    def years(self, value):
        # Assigning a year ADDS it to the set instead of replacing the set.
        self._years.add(value)

    def generate(self):
        """Main method. Parse every annual XML file and write all outputs.

        The order of the steps matters: the summaries need the parsed
        projects, the top donor indexes need the donor ids collected while
        building the summaries, and the operating unit index needs both
        the summaries and the geo data loaded by ``_generate_hdi``.
        """
        xml_files = sorted(self.get_filenames(settings.IATI_XML_ANNUAL),
                           reverse=True)
        for xml_file in xml_files:
            self._prepare(xml_file, 'iati-activity', 'projects')
            self._prepare(xml_file, 'iati-activity', 'outputs')

        # Generating useful info for console
        counter = sum(len(i) for i in self.outputs.collection.values())
        self.log('Total outputs processed: %s' % counter)
        self.log('Total projects processed: %s' % len(self.projects.pks))
        self.log('Total Donor Index processed: %s' % len(self.donorindex.pks))
        self.log('Total Country Donor Index processed: %s'
                 % len(self.countrydonorindex.pks))

        # Save Project Json files
        self.projects.save_json(self.outputs, self.subnationals, self.api_path)

        # Save Unit Json files
        self.units.save_json(self.subnationals, self.api_path)

        # Generate Core Donors
        self._populate_core_donors()
        self.core_donors.save_json(self.api_path, 'core-donors.json')

        # Save Summary files
        self._generate_project_summary(self.projects)
        self.projectsummaries.save_json(self.api_path)

        # Save Other Jsons
        self.crsindex.save_json(self.api_path, 'crs-index.json')
        self.donorindex.save_json(self.api_path, 'donor-index.json')
        self.countrydonorindex.save_json(self.api_path,
                                         'donor-country-index.json')
        self._generate_year_index()

        # Top Donor Gross Index
        self._populate_top_donor_gross_index()
        self.topdonor_gross.save_json(self.api_path,
                                      'top-donor-gross-index.json')

        # Top Donor Local Index
        self._populate_top_donor_local_index()
        self.topdonor_local.save_json(self.api_path,
                                      'top-donor-local-index.json')

        # Region Index
        self._populate_region_index()
        self.region_index.save_json(self.api_path, 'region-index.json')

        # Focus Area Index
        focus = FocusAreaIndex()
        focus.save_json(self.api_path, 'focus-area-index.json')

        # Generating HDI
        self._generate_hdi()

        # Save Operating Unit Index
        self._populate_operating_unit_index()
        self.operating_unit_index.save_json(self.api_path,
                                            'operating-unit-index.json')

    def _prepare(self, xml_file, tag, op_type):
        """Parse one XML file and hand it to the matching populate method.

        Arguments:
        xml_file -- full path to the xml file
        tag -- one choice is available: iati-activity
        op_type -- only two choices available: outputs - projects

        The file is skipped silently when no integer year can be read from
        its name.
        """
        # Get IATI activities XML
        iter_obj = iter(etree.iterparse(xml_file, tag=tag))

        # Extract year
        try:
            year = int(self.extract_years([xml_file])[0])
            self.years = year
        except ValueError:
            return

        func = getattr(self, '_populate_%s' % op_type)
        func(iter_obj, year)

    def _populate_operating_unit_index(self):
        """Build one OperatingUnit entry per geo country that has projects.

        Budget, expenditure, project count and funding sources are summed
        over the project summaries of the latest collected year. Requires
        ``self.geo``, which ``_generate_hdi`` loads.
        """
        current_year = max(self.years)
        country_isos = self.get_and_sort(
            '%s/country_iso.csv' % settings.UNDP_EXPORT, 'iso3')
        units = self.get_and_sort(self.undp_export + '/report_units.csv',
                                  'operating_unit')

        # iso3 -> numeric ISO code, with non-ASCII characters dropped.
        iso3 = dict(
            (i['iso3'].decode('utf-8').encode('ascii', 'ignore'),
             i['iso_num'].decode('utf-8').encode('ascii', 'ignore'))
            for i in country_isos)
        units_index = dict((i['operating_unit'], i['fund_type'])
                           for i in units)

        for country in self.geo:
            if country['iso3'] not in self.units.pks:
                continue

            obj = OperatingUnit()
            obj.id.value = country['iso3']
            obj.fund_type.value = units_index[obj.id.value]
            obj.name.value = country[obj.name.key]
            if country[obj.lat.key] != '':
                obj.lat.value = country[obj.lat.key]
                obj.lon.value = country[obj.lon.key]
            if obj.id.value in iso3:
                obj.iso_num.value = iso3[obj.id.value]

            # Looping through project summaries to get total budgets
            funding_source = set()
            for project in self.projectsummaries.collection[current_year]:
                if project.operating_unit.value != obj.id.value:
                    continue
                obj.project_count.value += 1
                obj.budget_sum.value += round(project.budget.value, 2)
                obj.expenditure_sum.value += round(
                    project.expenditure.value, 2)
                funding_source.update(project.donors.value)

                # Contact details come from the project; the last matching
                # project wins.
                project_obj = self.projects.collection[project.id.value]
                obj.email.value = project_obj.operating_unit_email.value
                obj.web.value = project_obj.operating_unit_website.value

            obj.funding_sources_count.value = len(funding_source)
            self.operating_unit_index.add(obj.id.value, obj)

    def _populate_core_donors(self):
        """Load core_fund.csv into the core donors collection."""
        cores = self.get_and_sort(settings.DONOR_DATA + '/core_fund.csv',
                                  'Donor')
        for core in cores:
            obj = CoreDonor()
            obj.donor_id.value = core['Donor']
            obj.description.value = core['Donor Desc']
            obj.short_description.value = core['Donor Level 3']

            # Adding extra zeros to the beginning of donor ids to make them
            # 5 characters
            additional_zeros = 5 - len(obj.donor_id.value)
            obj.donor_id.value = '%s%s' % (('0' * additional_zeros),
                                           obj.donor_id.value)
            self.core_donors.add(obj.donor_id.value, obj)

    def _populate_region_index(self):
        """Build the region index from report_units.csv plus 'global'."""
        units = self.get_and_sort(self.undp_export + '/report_units.csv',
                                  'bureau')
        for unit in units:
            bureau = unit['bureau']
            is_region = ((bureau in self._REGION_BUREAUS
                          and unit['hq_co'] == 'HQ')
                         or bureau == 'PAPP')
            if not is_region:
                continue
            if unit['ou_descr'] == 'Regional Center - Addis Ababa':
                continue

            obj = Region()
            obj.name.value = unit['ou_descr']
            obj.id.value = bureau
            try:
                self.region_index.add(obj.id.value, obj)
            except ObjectExists:
                # First unit seen for a bureau names the region.
                pass

        obj = Region()
        obj.name.value = 'Global'
        obj.id.value = 'global'
        self.region_index.add(obj.id.value, obj)

    def _populate_top_donor_local_index(self):
        """Load donor_local.csv into the top donor (local) index.

        The donor id is looked up in ``self.donor_ids``, which is filled by
        ``_generate_project_summary``; it is None when there is no match.
        """
        local = self.get_and_sort(self.undp_export + '/donor_local.csv',
                                  'donor')
        for item in local:
            obj = TopDonorLocal()
            obj.name.value = item[obj.name.key]
            obj.country.value = item[obj.country.key]
            obj.amount.value = item[obj.amount.key]
            obj.donor_id.value = self.donor_ids.collection.get(
                item['donor'], None)
            self.topdonor_local.add(obj.donor_id.value, obj)

    def _populate_top_donor_gross_index(self):
        """Load donor_gross.csv into the top donor (gross) index.

        The donor id is looked up in ``self.donor_ids``, which is filled by
        ``_generate_project_summary``; it is None when there is no match.
        """
        gross = self.get_and_sort(self.undp_export + '/donor_gross.csv',
                                  'donor')
        for item in gross:
            obj = TopDonor()
            obj.name.value = item[obj.name.key]
            obj.country.value = item[obj.country.key]
            obj.regular.value = item[obj.regular.key]
            obj.other.value = item[obj.other.key]
            obj.total.value = item[obj.total.key]
            obj.donor_id.value = self.donor_ids.collection.get(
                item['donor'], None)
            self.topdonor_gross.add(obj.donor_id.value, obj)

    @staticmethod
    def _donor_country_code(item):
        """Return the country code used for a donor CSV row.

        Country donors use their level-3 type, multi-agency donors use the
        level-1 type, everything else is 'OTH'. Spaces are removed.
        """
        level1 = item['donor_type_lvl1']
        if level1 == 'PROG CTY' or level1 == 'NON_PROG CTY':
            return item['donor_type_lvl3'].replace(" ", "")
        if level1 == 'MULTI_AGY':
            return level1.replace(" ", "")
        return 'OTH'

    def _generate_project_summary(self, projects):
        """Create one ProjectSummary per project and fiscal year.

        Donor budgets and expenditures come from report_donors.csv, grouped
        by award id. While reading that file, ``self.donor_ids`` is filled
        as well.

        Arguments:
        projects -- the projects collection to summarise
        """
        donors = self.get_and_sort(self.undp_export + '/report_donors.csv',
                                   'awardID')
        report_donors = ReportDonors()

        # Create an index of donors based on awardID
        for item in donors:
            report_donors.add_update_list(item['awardID'], item)
            try:
                self.donor_ids.add(item['donor_type_lvl3_descr'],
                                   item['donorID'])
            except ObjectExists:
                pass

        # Looping through years of projects
        counter = 0
        for project in projects.collection.values():
            for year in project.fiscal_year.value:
                # Should create a new model instance for each year of the
                # project as they are stored in separate summary files
                obj = ProjectSummary()

                # set region
                if project.region_id.value not in self._REGION_BUREAUS:
                    obj.region.value = 'global'
                else:
                    obj.region.value = project.region_id.value

                obj.operating_unit.value = project.operating_unit_id.value
                obj.name.value = project.project_title.value
                obj.id.value = project.project_id.value
                obj.fiscal_year.value = year

                # Fill out fields from report donors list
                try:
                    country = defaultdict(lambda: defaultdict(float))
                    rows = report_donors.collection[project.project_id.value]
                    for item in rows:
                        if (int(item['fiscal_year']) == int(year)
                                and item['donorID']):
                            entry = country[item['donorID']]
                            entry['budget'] += float(item['budget'])
                            entry['expenditure'] += float(item['expenditure'])
                            entry['type'] = item['donor_type_lvl1'].replace(
                                " ", "")
                            entry['name'] = self._donor_country_code(item)

                            # Donor 00012 marks the project as core funded.
                            if item['donorID'] == '00012':
                                obj.core.value = True

                    for key, value in country.items():
                        obj.donor_countries.value.append(value['name'])
                        obj.donor_budget.value.append(value['budget'])
                        obj.donor_expend.value.append(value['expenditure'])
                        obj.donor_types.value.append(value['type'])
                        obj.donors.value.append(key)
                except KeyError:
                    # There are a few project ids that do not appear in the
                    # donor list. This catch resolves them.
                    pass

                obj.expenditure.value = sum(obj.donor_expend.value)
                obj.budget.value = sum(obj.donor_budget.value)

                # Get other information from outputs
                for output in project.outputs.value:
                    obj.crs.value.add(output['crs'])
                    obj.focus_area.value.add(output['focus_area'])

                self.projectsummaries.add_update_list(year, obj)
                counter += 1

        self.log('%s summary projects processed' % counter)

    def _generate_year_index(self):
        """ Generates year-index.js """
        writeout = 'var FISCALYEARS = %s' % sorted(map(str, self.years),
                                                   reverse=True)
        # The context manager closes the file even if the write fails.
        with open('%s/year-index.js' % self.api_path, 'wb') as f_out:
            f_out.write(writeout)
        self.log('Year Index Generated')

    def _populate_units(self, project_obj):
        """Add the project to its operating unit, creating the unit if new.

        Arguments:
        project_obj -- a populated Project model object
        """
        unit_project = UnitProject()
        unit_project.title.value = project_obj.project_title.value
        unit_project.id.value = project_obj.project_id.value

        unit_id = project_obj.operating_unit_id.value
        if unit_id in self.units.pks:
            self.units.collection[unit_id].projects.value.append(
                unit_project.to_dict())
        else:
            unit = Unit()
            unit.op_unit.value = unit_id
            unit.projects.value.append(unit_project.to_dict())
            self.units.add(unit_id, unit)

    def _populate_projects(self, iter_obj, yr):
        """Read hierarchy-1 activities and store them as Project objects.

        A project id that is already in the collection is skipped, so the
        first file processed for a project wins.

        Sample of the collected fields:
        {'end': '2012-12-31',
         'operating_unit_email': '*****@*****.**',
         'inst_id': '',
         'operating_unit': 'Lithuania, Republic of',
         'iati_op_id': 'LT',
         'inst_descr': '',
         'start': '2005-01-01',
         'operating_unit_id': 'LTU',
         'operating_unit_website': 'http://www.undp.lt/',
         'project_id': '00038726',
         'inst_type_id': '',
         'document_name': u'http://www.undp.org/content/dam/undp/documents/projects/LTU/00038726/RC fund.pdf'}

        Arguments:
        iter_obj -- an iterable of (event, element) pairs from iterparse
        yr -- fiscal year of the file being processed
        """
        counter = 0

        # Get sorted units
        report_units = self.get_and_sort(
            self.undp_export + '/report_units.csv', 'operating_unit')

        # sorting table for documents by importance
        docs_sort = [
            'A02', 'A03', 'A04', 'A05', 'A01', 'A07', 'A08', 'A09', 'A06',
            'A11', 'A10'
        ]

        # Loop through each IATI activity in the XML
        for event, p in iter_obj:
            # IATI hierarchy 1 is a project; anything else is handled
            # elsewhere.
            if p.attrib['hierarchy'] != '1':
                continue

            obj = Project()
            obj.project_id.value = self._grab_award_id(p[1].text)

            # Check if the project_id is unique
            if obj.project_id.value in self.projects.pks:
                continue

            obj.fiscal_year.value.append(yr)
            obj.project_title.value = p.find(
                obj.project_title.xml_key).text.lower()
            obj.project_descr.value = p.find(obj.project_descr.xml_key).text

            documents = p.findall('./document-link')
            if documents:
                names = []
                links = []
                formats = []
                places = []
                for doc in documents:
                    url = urllib2.unquote(doc.get('url'))
                    try:
                        links.append(url.encode('utf-8').decode('utf-8'))
                    except UnicodeDecodeError:
                        links.append(url.decode('utf-8'))

                    # File extension derived from the MIME type, '' when
                    # unknown.
                    extension = None
                    if 'application/' in doc.get('format'):
                        extension = mimetypes.guess_extension(
                            doc.get('format'), False)
                    if extension is None:
                        formats.append('')
                    else:
                        formats.append(extension.lstrip('.'))

                    for d in doc.iterchildren(tag=obj.document_name.key):
                        names.append(d.text)

                    # default place is last
                    place = 100
                    for t in doc.iterchildren(tag='category'):
                        try:
                            tp = docs_sort.index(t.get('code'))
                        except ValueError:
                            tp = 100
                        if tp < place:
                            place = tp
                    places.append(place)

                obj.document_name.value.extend(
                    [names, links, formats, places])

            # Find start and end dates
            obj.start.value = p.find(obj.start.xml_key).text
            obj.end.value = p.find(obj.end.xml_key).text

            contact = p.findall('./contact-info')

            # Find operating_unit
            # If recipient country didn't exist look for recipient region
            try:
                obj.iati_op_id.value = (p.find(
                    obj.iati_op_id.xml_key).attrib.get('code'))
                obj.operating_unit.value = p.find(
                    obj.operating_unit.xml_key).text
                for r in report_units:
                    if (obj.iati_op_id.value == r['iati_operating_unit']
                            or obj.iati_op_id.value == r['operating_unit']):
                        obj.operating_unit_id.value = r['operating_unit']
                        obj.region_id.value = r[obj.region_id.key]
            except Exception:
                region_unit = p.findall("./recipient-region")
                for ru in region_unit:
                    for r in report_units:
                        # Only compare values of the same string type.
                        if (type(ru.text) is type(r['ou_descr'])
                                and ru.text == r['ou_descr']):
                            obj.operating_unit_id.value = r['operating_unit']
                            obj.operating_unit.value = r['ou_descr']
                obj.iati_op_id.value = '998'

            # find contact info
            # The last e-mail listed wins. A project without any contact
            # e-mail keeps the field's default instead of raising
            # IndexError and aborting the whole run.
            emails = [
                e.text for info in contact
                for e in info.iterchildren(tag=obj.operating_unit_email.key)
            ]
            if emails:
                obj.operating_unit_email.value = emails[-1]

            website = p.find(obj.operating_unit_website.xml_key)
            if website is not None:
                obj.operating_unit_website.value = website.text

            # Check for implementing organization
            inst = p.find("./participating-org[@role='Implementing']")
            if inst is not None:
                obj.inst_id.value = inst.attrib.get(obj.inst_id.key)
                obj.inst_type_id.value = inst.attrib.get(
                    obj.inst_type_id.key)
                obj.inst_descr.value = inst.text

            # Populate the Unit Collection
            self._populate_units(obj)

            counter += 1
            self.log('Processing: %s' % counter, True)
            self.projects.add(obj.project_id.value, obj)

        self.log('%s - Project Annuals: %s rows processed' % (yr, counter))

    def _populate_outputs(self, iter_obj, yr):
        """Read hierarchy-2 activities and store them as Output objects.

        Also fills the CRS index, the donor indexes and the subnational
        locations. An output id that was already collected is skipped.

        Arguments:
        iter_obj -- an iterable of (event, element) pairs from iterparse
        yr -- fiscal year of the file being processed
        """
        counter = 0

        # Get sorted country donors
        sorted_donors = self.get_and_sort(
            self.undp_export + '/country_donors_updated.csv', 'id')

        # Get South-South projects
        ss_list = self.get_and_list(
            self.undp_export + '/SSCprojects_IDlist.csv', 'projectid')

        for event, o in iter_obj:
            if o.attrib['hierarchy'] != '2':
                continue

            obj = Output()
            crs = Crs()
            obj.output_id.value = self._grab_award_id(o[1].text)

            # Check if the output_id is unique
            if obj.output_id.value in self.outputs.output_ids:
                continue

            obj.output_title.value = o.find(obj.output_title.xml_key).text
            obj.output_descr.value = o.find(obj.output_descr.xml_key).text

            try:
                obj.gender_id.value = o.find(
                    obj.gender_descr.xml_key).attrib.get(obj.gender_id.key)
                obj.gender_descr.value = o.find(
                    obj.gender_descr.xml_key).text
            except Exception:
                obj.gender_id.value = "0"
                obj.gender_descr.value = "None"

            # NOTE(review): crs.name receives the attribute value and
            # crs.id the element text - confirm this mapping is intended.
            try:
                obj.crs.value = o.find(obj.crs_descr.xml_key).get(
                    obj.crs.key)
                crs.name.value = obj.crs.value
            except AttributeError:
                pass

            try:
                obj.crs_descr.value = o.find(obj.crs_descr.xml_key).text
                crs.id.value = obj.crs_descr.value
            except AttributeError:
                pass

            try:
                self.crsindex.add(crs.id.value, crs)
            except ObjectExists:
                pass

            # Fall back to the related activity when the award element is
            # missing.
            try:
                obj.award_id.value = self._grab_award_id(
                    o.find(obj.award_id.xml_key).get('ref'))
            except Exception:
                obj.award_id.value = self._grab_award_id(
                    o.find("./related-activity[@type='2']").get('ref'))

            try:
                if obj.award_id.value in ss_list:
                    obj.focus_area.value = '8'
                    obj.focus_area_descr.value = 'South-South'
                else:
                    focus_node = o.find(obj.focus_area_descr.xml_key)
                    obj.focus_area.value = focus_node.get(
                        obj.focus_area.key)
                    if not focus_node.text:
                        obj.focus_area_descr.value = "-"
                    else:
                        obj.focus_area_descr.value = focus_node.text
            except Exception:
                obj.focus_area.value = "-"
                obj.focus_area_descr.value = "-"

            for donor in o.findall("./participating-org[@role='Funding']"):
                ref = donor.get('ref')
                obj.donor_id.value.add(ref)

                if ref == '00012':
                    obj.donor_name.value.append('Voluntary Contributions')
                else:
                    obj.donor_name.value.append(donor.text)

                # Check IDs from the CSV against the cntry_donors_sort.
                # This provides funding country names not in XML
                for d in sorted_donors:
                    if d['id'] == ref:
                        # for outputs
                        obj.donor_short.value.append(d[obj.donor_short.key])

            # Find budget information to later append to projectFY array
            budget_expend = defaultdict(lambda: defaultdict(float))

            obj.budget.temp = o.findall(obj.budget.xml_key)
            for budget in obj.budget.temp:
                for b in budget.iterchildren(tag='value'):
                    year = int(b.get('value-date').split('-', 3)[0])
                    budget_expend[year]['budget'] = float(b.text)

            # Use transaction data to get expenditure
            for tx in o.findall('transaction'):
                for expen in tx.findall(obj.expenditure.xml_key):
                    for sib in expen.itersiblings():
                        if sib.tag == 'value':
                            year = int(sib.get('value-date').split('-', 3)[0])
                            budget_expend[year]['expenditure'] = float(
                                sib.text)

            for key, value in budget_expend.items():
                obj.fiscal_year.value.append(key)
                obj.budget.value.append(value['budget'])
                obj.expenditure.value.append(value['expenditure'])

            # Run subnationals
            locations = o.findall('location')
            if locations:
                self._populate_subnationals(obj.award_id.value, obj, o,
                                            locations)

            # Populate Donor Index
            self._populate_donor_index(o)

            counter += 1
            self.log('Processing: %s' % counter, True)
            self.outputs.add_update_list(obj.award_id.value, obj)

        self.log('%s - output Annuals: %s rows processed' % (yr, counter))

    def _populate_subnationals(self, project_id, output_obj, node, locations):
        """Populate subnational objects for one output.

        This is dependent on _populate_outputs and cannot be executed
        separately.

        Arguments:
        project_id -- the related project (award) id
        output_obj -- output model object
        node -- output xml object (not used here)
        locations -- the output's location elements

        Returns nothing; fills the subnationals collection.
        """
        for counter, location in enumerate(locations, 1):
            obj = Subnational()
            obj.awardID.value = project_id
            obj.outputID.value = output_obj.output_id.value
            obj.output_locID.value = "%s-%d" % (obj.outputID.value, counter)

            # Focus areas
            obj.focus_area.value = output_obj.focus_area.value
            obj.focus_area_descr.value = output_obj.focus_area_descr.value

            for item in location.iterchildren():
                tag = item.tag
                if tag == 'coordinates':
                    obj.lat.value = item.get(obj.lat.key)
                    obj.lon.value = item.get(obj.lon.key)
                    obj.precision.value = item.get(obj.precision.key)
                elif tag == 'name':
                    obj.name.value = item.text
                elif tag == 'location-type':
                    obj.type.value = item.get(obj.type.key)
                elif tag == 'point':
                    # IATI 1.04: first child holds "<lat> <lon>"
                    pos = list(item)
                    lat_lon = pos[0].text.split(' ')
                    obj.lat.value = lat_lon[0]
                    obj.lon.value = lat_lon[1]
                elif tag == 'exactness':
                    # IATI 1.04
                    obj.precision.value = item.get('code')
                elif tag == 'feature-designation':
                    # IATI 1.04
                    obj.type.value = item.get(obj.type.key)

            self.subnationals.add_update_list(project_id, obj)

    def _populate_donor_index(self, output_obj):
        """Populates both donor-index and donor-country-index.

        Arguments:
        output_obj -- the output XML element whose funding organisations
                      are read (an element, not an Output model object)
        """
        # The CSV is loaded once and reused for every output.
        if not self.country_donors:
            self.country_donors = self.get_and_sort(
                self.undp_export + '/country_donors_updated.csv', 'id')

        for donor in output_obj.findall(
                "./participating-org[@role='Funding']"):
            obj = Donor()
            country_obj = CountryDonor()
            ref = donor.get(obj.id.key)
            if not ref:
                continue

            for item in self.country_donors:
                if ref != item['id']:
                    continue

                # Skip the donor if the ref already is added
                if ref not in self.donorindex.pks:
                    obj.id.value = ref
                    obj.name.value = donor.text or "Unknown"
                    obj.country.value = self._donor_country_code(item)
                    self.donorindex.add(obj.id.value, obj)

                if item['donor_type_lvl3'] not in self.countrydonorindex.pks:
                    country_obj.id.value = item['donor_type_lvl3']
                    country_obj.name.value = item['donor_type_lvl3_descr']
                    self.countrydonorindex.add(item['donor_type_lvl3'],
                                               country_obj)

    @staticmethod
    def _search_list_dict(_list, key, search):
        """Return the dicts in _list whose ``key`` equals ``search``.

        Returns False when nothing matches. Declared static because the
        function takes no ``self``; without the decorator it could not be
        called through an instance.
        """
        result = [item for item in _list if item[key] == search]
        if result:
            return result
        return False

    def _generate_hdi(self):
        """Write hdi.json and hdi.js and load the country centroids.

        ``self.geo`` is set here and reused by
        ``_populate_operating_unit_index``.
        """
        hdi = self.get_and_sort('%s/hdi-csv-clean.csv' % settings.HDI,
                                'hdi2013')
        self.geo = self.get_and_sort(
            '%s/country-centroids.csv' % settings.PROCESS_FILES, 'iso3')

        # Years available in the HDI csv
        years = [
            1980, 1985, 1990, 1995, 2000, 2005, 2006, 2007, 2008, 2011, 2012,
            2013
        ]
        # Set current year to the latest year of HDI Data
        current_year = 2013
        current_key = 'hdi%d' % current_year

        rank = 0
        hdi_index = []
        hdi_dict = {}
        for val in hdi:
            hdi_total = []
            hdi_health = []
            hdi_ed = []
            hdi_inc = []
            change = []
            for y in years:
                if val['hdi%d' % y] == '':
                    continue
                if (val['ed%d' % y] == "" or val['health%d' % y] == ""
                        or val['income%d' % y] == ""):
                    continue

                hdi_total.append([y, round(float(val['hdi%d' % y]), 3)])
                hdi_health.append([y, round(float(val['health%d' % y]), 3)])
                hdi_ed.append([y, round(float(val['ed%d' % y]), 3)])
                hdi_inc.append([y, round(float(val['income%d' % y]), 3)])

                # Only the change against the earliest complete year is
                # kept.
                if y != current_year and not change:
                    change.append(
                        round(float(val[current_key]), 3)
                        - round(float(val['hdi%d' % y]), 3))
            if not change:
                change.append("")

            for ctry in self.geo:
                if ctry['name'] != val['country']:
                    continue

                if val[current_key] == "":
                    # No current HDI value: empty series and no rank.
                    g = {
                        "id": ctry['iso3'],
                        "name": val['country'],
                        "hdi": "",
                        "health": "",
                        "income": "",
                        "education": "",
                        "change": change[0],
                        "rank": "n.a."
                    }
                else:
                    # Ids starting with "A-" are listed without a rank.
                    if ctry['iso3'].startswith("A-"):
                        entry_rank = "n.a."
                    else:
                        rank = rank + 1
                        entry_rank = rank
                    g = {
                        "id": ctry['iso3'],
                        "name": val['country'],
                        "hdi": hdi_total,
                        "health": hdi_health,
                        "income": hdi_inc,
                        "education": hdi_ed,
                        "change": change[0],
                        "rank": entry_rank
                    }
                hdi_index.append(g)

                uid = ctry['iso3']
                hdi_dict[uid] = copy.deepcopy(g)
                hdi_dict[uid].pop('id')
                hdi_dict[uid].pop('name')

        hdi_dict['total'] = rank

        # Numeric ranks first, then the "n.a." entries in their original
        # order. The tuple key avoids comparing int with str directly.
        hdi_index_sort = sorted(
            hdi_index,
            key=lambda entry: (not isinstance(entry['rank'], int),
                               entry['rank']))
        hdi_writeout = json.dumps(hdi_index_sort, sort_keys=True,
                                  separators=(',', ':'))
        with open('%s/hdi.json' % self.api_path, 'wb') as hdi_out:
            hdi_out.write(hdi_writeout)

        jsvalue = "var HDI = "
        jsondump = json.dumps(hdi_dict, sort_keys=True,
                              separators=(',', ':'))
        writeout = jsvalue + jsondump
        with open('%s/hdi.js' % self.api_path, 'wb') as f_out:
            f_out.write(writeout)

        self.log('HDI json generated')

    def extract_years(self, filenames):
        """Extract years from filenames.

        Filenames must be in this format: atlas_projects_2011.xml
        The four characters before the 4-character extension are returned
        as strings.

        Arguments:
        filenames -- an array of filenames
        """
        return [f[-8:-4] for f in filenames]

    def _grab_award_id(self, text):
        """Grab the award id from the xml text.

        @example
        Text: XM-DAC-41114-PROJECT-00068618
        Return: 00068618
        """
        return text.split('-')[-1]
class ProjectsController(Controller): """Main Process class that includes all the functions needed for processing UNDP xml data Main methods: run - Runs the whole class and generate everything """ def __init__(self): self.undp_export = settings.UNDP_EXPORT self.projects = Projects() self.projectsummaries = ProjectSummaries() self.outputs = Outputs() self.subnationals = Subnationals() self.units = Units() self.crsindex = CrsIndex() self.donorindex = DonorIndex() self.countrydonorindex = CountryDonorIndex() self.topdonor_gross = TopDonorGrossIndex() self.topdonor_local = TopDonorLocalIndex() self.donor_ids = DonorIDs() self.region_index = RegionIndex() self.core_donors = CoreDonors() self.operating_unit_index = OperatingUnitIndex() self.api_path = settings.API_PATH self._years = set() self.geo = None # Adding 2010 because the xmls files are starting from 2011 but the legacy site expect to see 2010 self.years = 2010 self.country_donors = None @property def years(self): return self._years @years.setter def years(self, value): self._years.add(value) def generate(self): """ Main method. 
Execute necessary functions and generate json files """ for files in reversed(self.get_filenames(settings.IATI_XML_ANNUAL)): self._prepare(files, 'iati-activity', 'projects') self._prepare(files, 'iati-activity', 'outputs') # Generating useful info for console counter = 0 for i in self.outputs.collection.values(): counter += len(i) self.log('Total outputs processed: %s' % counter) self.log('Total projects processed: %s' % len(self.projects.pks)) self.log('Total Donor Index processed: %s' % len(self.donorindex.pks)) self.log('Total Country Donor Index processed: %s' % len(self.countrydonorindex.pks)) # Save Project Json files self.projects.save_json(self.outputs, self.subnationals, self.api_path) # Save Unit Json files self.units.save_json(self.subnationals, self.api_path) # Generate Core Donors self._populate_core_donors() self.core_donors.save_json(self.api_path, 'core-donors.json') # Save Summary files self._generate_project_summary(self.projects) self.projectsummaries.save_json(self.api_path) # Save Other Jsons self.crsindex.save_json(self.api_path, 'crs-index.json') self.donorindex.save_json(self.api_path, 'donor-index.json') self.countrydonorindex.save_json(self.api_path, 'donor-country-index.json') self._generate_year_index() # Top Donor Gross Index self._populate_top_donor_gross_index() self.topdonor_gross.save_json(self.api_path, 'top-donor-gross-index.json') # Top Donor Local Index self._populate_top_donor_local_index() self.topdonor_local.save_json(self.api_path, 'top-donor-local-index.json') # Region Index self._populate_region_index() self.region_index.save_json(self.api_path, 'region-index.json') # Focus Area Index focus = FocusAreaIndex() focus.save_json(self.api_path, 'focus-area-index.json') # Generating HDI self._generate_hdi() # Save Operating Unit Index self._populate_operating_unit_index() self.operating_unit_index.save_json(self.api_path, 'operating-unit-index.json') def _prepare(self, xml_file, tag, op_type): """Prepares and executes other 
methods to prepare the data. Arguments: xml_file - full path to the xml file tag -- one choice is available: iati-activity op_type -- only two choices available: outputs - projects """ # Get IATI activities XML iter_obj = iter(etree.iterparse(xml_file, tag=tag)) # Extract year try: year = int(self.extract_years([xml_file])[0]) self.years = year except ValueError: return func = getattr(self, '_populate_%s' % op_type) func(iter_obj, year) def _populate_operating_unit_index(self): current_year = sorted(list(self.years), reverse=True)[0] country_isos = self.get_and_sort('%s/country_iso.csv' % settings.UNDP_EXPORT, 'iso3') units = self.get_and_sort(self.undp_export + '/report_units.csv', 'operating_unit') iso3 = dict([(i['iso3'].decode('utf-8').encode('ascii', 'ignore'), i['iso_num'].decode('utf-8').encode('ascii', 'ignore')) for i in country_isos]) units_index = dict([(i['operating_unit'], i['fund_type']) for i in units]) # import pdb # pdb.set_trace() for country in self.geo: if country['iso3'] in self.units.pks: obj = OperatingUnit() obj.id.value = country['iso3'] obj.fund_type.value = units_index[obj.id.value] obj.name.value = country[obj.name.key] if country[obj.lat.key] != '': obj.lat.value = country[obj.lat.key] obj.lon.value = country[obj.lon.key] if obj.id.value in iso3: obj.iso_num.value = iso3[obj.id.value] # Looping through project summaries to get total budgets funding_source = set() for project in self.projectsummaries.collection[current_year]: if project.operating_unit.value == obj.id.value: obj.project_count.value += 1 obj.budget_sum.value += round(project.budget.value, 2) obj.expenditure_sum.value += round(project.expenditure.value, 2) for item in project.donors.value: funding_source.add(item) project_obj = self.projects.collection[project.id.value] obj.email.value = project_obj.operating_unit_email.value obj.web.value = project_obj.operating_unit_website.value obj.funding_sources_count.value = len(funding_source) 
self.operating_unit_index.add(obj.id.value, obj)


def _populate_core_donors(self):
    """Fill ``self.core_donors`` from ``core_fund.csv``.

    One ``CoreDonor`` is created per row returned by
    ``self.get_and_sort(settings.DONOR_DATA + '/core_fund.csv', 'Donor')``
    and registered under its donor id, left-padded with zeros to five
    characters.
    """
    cores = self.get_and_sort(settings.DONOR_DATA + '/core_fund.csv', 'Donor')

    for core in cores:
        obj = CoreDonor()
        obj.donor_id.value = core['Donor']
        obj.description.value = core['Donor Desc']
        obj.short_description.value = core['Donor Level 3']

        # Left-pad donor ids with zeros so they are 5 characters long;
        # ids that are already 5 characters or longer are left unchanged.
        obj.donor_id.value = obj.donor_id.value.rjust(5, '0')

        self.core_donors.add(obj.donor_id.value, obj)


def _populate_region_index(self):
    """Fill ``self.region_index`` from ``report_units.csv`` plus 'global'.

    A row qualifies when its bureau is one of the known regional bureaus
    and it is flagged ``hq_co == 'HQ'``, or when its bureau is 'PAPP'.
    The 'Regional Center - Addis Ababa' row is always skipped.  An
    ``ObjectExists`` raised while adding is ignored, so a bureau that is
    already registered is left as it is.  A synthetic 'global' region is
    added at the end.
    """
    units = self.get_and_sort(self.undp_export + '/report_units.csv', 'bureau')
    choices = ['PAPP', 'RBA', 'RBAP', 'RBAS', 'RBEC', 'RBLAC']

    for unit in units:
        is_region_hq = unit['bureau'] in choices and unit['hq_co'] == 'HQ'
        if not (is_region_hq or unit['bureau'] == 'PAPP'):
            continue
        if unit['ou_descr'] == 'Regional Center - Addis Ababa':
            continue

        obj = Region()
        obj.name.value = unit['ou_descr']
        obj.id.value = unit['bureau']

        try:
            self.region_index.add(obj.id.value, obj)
        except ObjectExists:
            pass

    obj = Region()
    obj.name.value = 'Global'
    obj.id.value = 'global'
    self.region_index.add(obj.id.value, obj)


def _populate_top_donor_local_index(self):
    """Fill ``self.topdonor_local`` from ``donor_local.csv``.

    The donor id is looked up in ``self.donor_ids`` by the row's 'donor'
    column; ``self.donor_ids`` is filled by ``_generate_project_summary``,
    so that method has to run first.  Rows whose donor is unknown get a
    donor id of ``None``.
    """
    local = self.get_and_sort(self.undp_export + '/donor_local.csv', 'donor')

    for item in local:
        obj = TopDonorLocal()
        obj.name.value = item[obj.name.key]
        obj.country.value = item[obj.country.key]
        obj.amount.value = item[obj.amount.key]
        obj.donor_id.value = self.donor_ids.collection.get(item['donor'])

        self.topdonor_local.add(obj.donor_id.value, obj)


def _populate_top_donor_gross_index(self):
    """Fill ``self.topdonor_gross`` from ``donor_gross.csv``.

    The donor id is looked up in ``self.donor_ids`` by the row's 'donor'
    column; ``self.donor_ids`` is filled by ``_generate_project_summary``,
    so that method has to run first.  Rows whose donor is unknown get a
    donor id of ``None``.
    """
    gross = self.get_and_sort(self.undp_export + '/donor_gross.csv', 'donor')

    for item in gross:
        obj = TopDonor()
        obj.name.value = item[obj.name.key]
        obj.country.value = item[obj.country.key]
        obj.regular.value = item[obj.regular.key]
        obj.other.value = item[obj.other.key]
        obj.total.value = item[obj.total.key]
        obj.donor_id.value = self.donor_ids.collection.get(item['donor'])

        self.topdonor_gross.add(obj.donor_id.value, obj)


def _generate_project_summary(self, projects):
    """Build one ``ProjectSummary`` per project and fiscal year.

    Donor budgets and expenditures come from ``report_donors.csv``,
    indexed by awardID.  While reading that file ``self.donor_ids`` is
    also filled (donor description -> donorID).  Each summary is stored in
    ``self.projectsummaries`` under its year.

    Arguments:
    projects - collection whose ``collection`` dict holds the project
               objects to summarise
    """
    donors = self.get_and_sort(self.undp_export + '/report_donors.csv', 'awardID')
    report_donors = ReportDonors()

    # Create an index of donors based on awardID
    for item in donors:
        report_donors.add_update_list(item['awardID'], item)
        try:
            self.donor_ids.add(item['donor_type_lvl3_descr'], item['donorID'])
        except ObjectExists:
            pass

    regions_list = ['PAPP', 'RBA', 'RBAP', 'RBAS', 'RBEC', 'RBLAC']

    # Looping through years of projects
    counter = 0
    for project in projects.collection.values():
        for year in project.fiscal_year.value:
            # A new model instance is needed for each year of the project
            # because every year is stored in a separate summary file
            obj = ProjectSummary()

            # Anything outside the known regional bureaus counts as global
            if project.region_id.value not in regions_list:
                obj.region.value = 'global'
            else:
                obj.region.value = project.region_id.value

            obj.operating_unit.value = project.operating_unit_id.value
            obj.name.value = project.project_title.value
            obj.id.value = project.project_id.value
            obj.fiscal_year.value = year

            # Fill out fields from report donors list
            try:
                country = defaultdict(lambda: defaultdict(float))
                for item in report_donors.collection[project.project_id.value]:
                    if int(item['fiscal_year']) == int(year) and item['donorID']:
                        entry = country[item['donorID']]
                        entry['budget'] += float(item['budget'])
                        entry['expenditure'] += float(item['expenditure'])
                        entry['type'] = item['donor_type_lvl1'].replace(" ", "")

                        donor_type = item['donor_type_lvl1']
                        if donor_type == 'PROG CTY' or donor_type == 'NON_PROG CTY':
                            entry['name'] = item['donor_type_lvl3'].replace(" ", "")
                        elif donor_type == 'MULTI_AGY':
                            entry['name'] = item['donor_type_lvl1'].replace(" ", "")
                        else:
                            entry['name'] = 'OTH'

                        # Donor 00012 marks the project as core funded
                        if item['donorID'] == '00012':
                            obj.core.value = True

                for key, value in country.items():
                    obj.donor_countries.value.append(value['name'])
                    obj.donor_budget.value.append(value['budget'])
                    obj.donor_expend.value.append(value['expenditure'])
                    obj.donor_types.value.append(value['type'])
                    obj.donors.value.append(key)

            except KeyError:
                # A few project ids do not appear in the donor list; they
                # simply get no donor information
                pass

            obj.expenditure.value = sum(obj.donor_expend.value)
            obj.budget.value = sum(obj.donor_budget.value)

            # Get other information from outputs
            for output in project.outputs.value:
                obj.crs.value.add(output['crs'])
                obj.focus_area.value.add(output['focus_area'])

            self.projectsummaries.add_update_list(year, obj)
            counter += 1

    self.log('%s summary projects processed' % counter)


def _generate_year_index(self):
    """Generate year-index.js from ``self.years`` (newest year first)."""
    writeout = 'var FISCALYEARS = %s' % sorted(map(str, self.years), reverse=True)

    # The context manager closes the file even if the write fails
    with open('%s/year-index.js' % self.api_path, 'wb') as f_out:
        f_out.write(writeout)

    self.log('Year Index Generated')


def _populate_units(self, project_obj):
    """Register ``project_obj`` under its operating unit in ``self.units``.

    The project is stored as the dict form of a ``UnitProject`` (title and
    id).  The ``Unit`` is created the first time an operating unit is seen.
    """
    unit_project = UnitProject()
    unit_project.title.value = project_obj.project_title.value
    unit_project.id.value = project_obj.project_id.value

    op_unit_id = project_obj.operating_unit_id.value

    if op_unit_id in self.units.pks:
        self.units.collection[op_unit_id].projects.value.append(unit_project.to_dict())
    else:
        unit = Unit()
        unit.op_unit.value = op_unit_id
        unit.projects.value.append(unit_project.to_dict())
        self.units.add(op_unit_id, unit)


def _populate_projects(self, iter_obj, yr):
    """Loop through iter_obj and collect one Project per unique project id.

    Only activities with hierarchy '1' are projects.  A project id that is
    already in ``self.projects`` is skipped.

    Sample of the data gathered for one project:
    {'end': '2012-12-31',
     'operating_unit_email': '*****@*****.**',
     'inst_id': '',
     'operating_unit': 'Lithuania, Republic of',
     'iati_op_id': 'LT',
     'inst_descr': '',
     'start': '2005-01-01',
     'operating_unit_id': 'LTU',
     'operating_unit_website': 'http://www.undp.lt/',
     'project_id': '00038726',
     'inst_type_id': '',
     'document_name': u'http://www.undp.org/content/dam/undp/documents/projects/LTU/00038726/RC fund.pdf'}

    Arguments:
    iter_obj - an iterable of (event, element) pairs
    yr - the year being processed; appended to the project's fiscal years
    """
    counter = 0

    # Get sorted units
    report_units = self.get_and_sort(self.undp_export + '/report_units.csv', 'operating_unit')

    # Loop through each IATI activity in the XML
    for event, p in iter_obj:
        # Hierarchy '1' marks a project; everything else is skipped here
        if p.attrib['hierarchy'] != '1':
            continue

        obj = Project()
        obj.project_id.value = p[1].text.split('-', 2)[2]

        # Check if the project_id is unique
        if obj.project_id.value in self.projects.pks:
            continue

        obj.fiscal_year.value.append(yr)
        obj.project_title.value = p.find(obj.project_title.xml_key).text.lower()
        obj.project_descr.value = p.find(obj.project_descr.xml_key).text

        documents = p.findall('./document-link')
        if documents:
            names = []
            links = []
            for doc in documents:
                links.append(urllib2.unquote(doc.get('url')).decode('utf-8'))
                for d in doc.iterchildren(tag=obj.document_name.key):
                    names.append(d.text)
            obj.document_name.value.extend([names, links])

        # Find start and end dates
        obj.start.value = p.find(obj.start.xml_key).text
        obj.end.value = p.find(obj.end.xml_key).text

        contact = p.findall('./contact-info')
        emails = [e.text
                  for info in contact
                  for e in info.iterchildren(tag=obj.operating_unit_email.key)]
        # A project without any contact e-mail keeps the field's default
        # instead of aborting the whole run with an IndexError
        if emails:
            obj.operating_unit_email.value = emails[0]

        # Find operating_unit
        # If the recipient country does not exist, look for the recipient region
        try:
            obj.iati_op_id.value = (p.find(obj.iati_op_id.xml_key).attrib.get('code'))
            obj.operating_unit.value = p.find(obj.operating_unit.xml_key).text
            for r in report_units:
                if (obj.iati_op_id.value == r['iati_operating_unit']
                        or obj.iati_op_id.value == r['operating_unit']):
                    obj.operating_unit_id.value = r['operating_unit']
                    obj.region_id.value = r[obj.region_id.key]
        except Exception:
            # ``Exception`` rather than a bare except, so Ctrl-C and
            # SystemExit are no longer swallowed by the fallback
            region_unit = p.findall("./recipient-region")
            for ru in region_unit:
                for r in report_units:
                    if ru.text == r['ou_descr']:
                        obj.operating_unit_id.value = r['operating_unit']
                        obj.operating_unit.value = r['ou_descr']
            # NOTE(review): the indentation of this assignment was lost in
            # the source; it is applied to every region-level project here.
            # Confirm it should not depend on a matching unit being found.
            obj.iati_op_id.value = '998'

        # Find contact info (best effort: missing pieces are left at their defaults)
        try:
            for info in contact:
                for e in info.iterchildren(tag=obj.operating_unit_email.key):
                    obj.operating_unit_email.value = e.text
            obj.operating_unit_website.value = p.find(obj.operating_unit_website.xml_key).text
        except Exception:
            pass

        # Check for implementing organization (best effort as well)
        try:
            inst = p.find("./participating-org[@role='Implementing']")
            obj.inst_id.value = inst.attrib.get(obj.inst_id.key)
            obj.inst_type_id.value = inst.attrib.get(obj.inst_type_id.key)
            obj.inst_descr.value = inst.text
        except Exception:
            pass

        # Populate the Unit Collection
        self._populate_units(obj)

        counter += 1
        self.log('Processing: %s' % counter, True)
        self.projects.add(obj.project_id.value, obj)

    self.log('%s - Project Annuals: %s rows processed' % (yr, counter))


def _populate_outputs(self, iter_obj, yr):
    """Loop through iter_obj and collect one Output per unique output id.

    Only activities with hierarchy '2' are outputs.  For every new output
    this also feeds the CRS index, the subnational locations and the donor
    indexes.  Outputs are stored in ``self.outputs`` under their award id.

    Arguments:
    iter_obj - an iterable of (event, element) pairs
    yr - the year being processed (only used in the log message)
    """
    counter = 0

    # Get sorted country donors
    sorted_donors = self.get_and_sort(self.undp_export + '/country_donors_updated.csv', 'id')

    for event, o in iter_obj:
        # Hierarchy '2' marks an output; everything else is skipped here
        if o.attrib['hierarchy'] != '2':
            continue

        obj = Output()
        crs = Crs()

        obj.output_id.value = o[1].text.split('-', 2)[2]

        # Check if the output_id is unique
        if obj.output_id.value in self.outputs.output_ids:
            continue

        obj.output_title.value = o.find(obj.output_title.xml_key).text
        obj.output_descr.value = o.find(obj.output_descr.xml_key).text

        try:
            obj.gender_id.value = o.find(obj.gender_descr.xml_key).attrib.get(obj.gender_id.key)
            obj.gender_descr.value = o.find(obj.gender_descr.xml_key).text
        except Exception:
            # No usable gender marker: fall back to the placeholders
            obj.gender_id.value = "0"
            obj.gender_descr.value = "None"

        try:
            obj.crs.value = o.find(obj.crs_descr.xml_key).get(obj.crs.key)
            crs.name.value = obj.crs.value
        except AttributeError:
            pass

        try:
            obj.crs_descr.value = o.find(obj.crs_descr.xml_key).text
            crs.id.value = obj.crs_descr.value
        except AttributeError:
            pass

        try:
            self.crsindex.add(crs.id.value, crs)
        except ObjectExists:
            pass

        try:
            obj.award_id.value = (o.find(obj.award_id.xml_key).get('ref').split('-', 2)[2])
        except Exception:
            # Fall back to the related activity of type 2
            obj.award_id.value = (o.find("./related-activity[@type='2']").get('ref').split('-', 2)[2])

        try:
            obj.focus_area.value = o.find(obj.focus_area_descr.xml_key).get(obj.focus_area.key)
            obj.focus_area_descr.value = o.find(obj.focus_area_descr.xml_key).text
        except Exception:
            obj.focus_area.value = "-"
            obj.focus_area_descr.value = "-"

        for donor in o.findall("./participating-org[@role='Funding']"):
            ref = donor.get('ref')
            obj.donor_id.value.add(ref)

            if ref == '00012':
                obj.donor_name.value.append('Voluntary Contributions')
            else:
                obj.donor_name.value.append(donor.text)

            for d in sorted_donors:
                # Match the ids from the CSV against this donor; the CSV
                # provides funding country names that are not in the XML
                if d['id'] == ref:
                    obj.donor_short.value.append(d[obj.donor_short.key])

        # Collect budget and expenditure per year
        budget_expend = defaultdict(lambda: defaultdict(float))

        obj.budget.temp = o.findall(obj.budget.xml_key)
        for budget in obj.budget.temp:
            for b in budget.iterchildren(tag='value'):
                year = int(b.get('value-date').split('-', 3)[0])
                budget_expend[year]['budget'] = float(b.text)

        # Use transaction data to get expenditure
        for tx in o.findall('transaction'):
            for expen in tx.findall(obj.expenditure.xml_key):
                for sib in expen.itersiblings():
                    if sib.tag == 'value':
                        year = int(sib.get('value-date').split('-', 3)[0])
                        budget_expend[year]['expenditure'] = float(sib.text)

        for key, value in budget_expend.items():
            obj.fiscal_year.value.append(key)
            obj.budget.value.append(value['budget'])
            obj.expenditure.value.append(value['expenditure'])

        # Run subnationals
        locations = o.findall('location')
        if locations:
            self._populate_subnationals(obj.award_id.value, obj, o, locations)

        # Populate Donor Index
        self._populate_donor_index(o)

        counter += 1
        self.log('Processing: %s' % counter, True)
        self.outputs.add_update_list(obj.award_id.value, obj)

    self.log('%s - output Annuals: %s rows processed' % (yr, counter))


def _populate_subnationals(self, project_id, output_obj, node, locations):
    """Populate the subnational collection for one output.

    This depends on _populate_outputs and cannot be executed separately.

    Arguments:
    project_id - the related project (award) id
    output_obj - output model object
    node - output xml object (not used by this method)
    locations - the output's location elements

    Returns:
    Nothing; entries are added to ``self.subnationals`` under project_id
    """
    for counter, location in enumerate(locations, 1):
        # One object per location: a single shared instance would make
        # every stored entry reflect only the last location processed
        obj = Subnational()

        obj.awardID.value = project_id
        obj.outputID.value = output_obj.output_id.value
        obj.output_locID.value = "%s-%d" % (obj.outputID.value, counter)

        # Focus areas
        obj.focus_area.value = output_obj.focus_area.value
        obj.focus_area_descr.value = output_obj.focus_area_descr.value

        for item in location.iterchildren():
            if item.tag == 'coordinates':
                obj.lat.value = item.get(obj.lat.key)
                obj.lon.value = item.get(obj.lon.key)
                obj.precision.value = item.get(obj.precision.key)
            elif item.tag == 'name':
                obj.name.value = item.text
            elif item.tag == 'location-type':
                obj.type.value = item.get(obj.type.key)

        self.subnationals.add_update_list(project_id, obj)


def _populate_donor_index(self, output_obj):
    """Populate both donor-index and donor-country-index.

    Arguments:
    output_obj - the output's xml element; its funding
                 participating-org children are the donors
    """
    # The donor CSV is loaded once and reused for later outputs
    if not self.country_donors:
        self.country_donors = self.get_and_sort(self.undp_export + '/country_donors_updated.csv', 'id')

    for donor in output_obj.findall("./participating-org[@role='Funding']"):
        obj = Donor()
        country_obj = CountryDonor()
        ref = donor.get(obj.id.key)

        if not ref:
            continue

        for item in self.country_donors:
            if ref != item['id']:
                continue

            # Only add the donor if the ref has not been added already
            if ref not in self.donorindex.pks:
                obj.id.value = ref
                obj.name.value = donor.text or "Unknown"

                donor_type = item['donor_type_lvl1']
                if donor_type == 'PROG CTY' or donor_type == 'NON_PROG CTY':
                    obj.country.value = item['donor_type_lvl3'].replace(" ", "")
                elif donor_type == 'MULTI_AGY':
                    obj.country.value = item['donor_type_lvl1'].replace(" ", "")
                else:
                    obj.country.value = 'OTH'

                self.donorindex.add(obj.id.value, obj)

            # NOTE(review): the indentation of this branch was lost in the
            # source; it is treated as independent of the donor check
            # above.  Confirm against the original layout.
            if item['donor_type_lvl3'] not in self.countrydonorindex.pks:
                country_obj.id.value = item['donor_type_lvl3']
                country_obj.name.value = item['donor_type_lvl3_descr']

                self.countrydonorindex.add(item['donor_type_lvl3'], country_obj)


def _search_list_dict(_list, key, search):
    """Return the dicts in _list whose ``key`` equals ``search``.

    Returns False (not an empty list) when nothing matches.

    NOTE(review): this is declared without ``self`` and without
    @staticmethod, so calling it through an instance would bind the
    instance to ``_list``.  Confirm how (or whether) it is called.
    """
    result = [item for item in _list if item[key] == search]
    return result if result else False


def _generate_hdi(self):
    """Generate hdi.json and hdi.js from the cleaned HDI csv.

    For every csv row the per-year hdi/health/education/income values are
    collected (only for years where all four are present) and matched to a
    country of ``country-centroids.csv`` by name.  Countries are ranked in
    csv order; rows without a current-year HDI and countries whose iso3
    starts with "A-" get the rank "n.a.".

    Side effects: sets ``self.geo`` and writes both files to
    ``self.api_path``.
    """
    hdi = self.get_and_sort('%s/hdi-csv-clean.csv' % settings.HDI, 'hdi2013')
    self.geo = self.get_and_sort('%s/country-centroids.csv' % settings.PROCESS_FILES, 'iso3')

    # Years read from the csv; add the current year here when it changes
    years = [1980, 1985, 1990, 1995, 2000, 2005, 2006, 2007, 2008, 2011, 2012, 2013]

    # Set current year to the latest year of HDI data
    current_year = 2013
    current_key = 'hdi%d' % current_year

    rank = 0
    hdi_index = []
    hdi_dict = {}

    for val in hdi:
        hdi_total = []
        hdi_health = []
        hdi_ed = []
        hdi_inc = []
        change = []

        for y in years:
            if (val['hdi%d' % y] != '' and val['ed%d' % y] != ""
                    and val['health%d' % y] != "" and val['income%d' % y] != ""):
                hdi_total.append([y, round(float(val['hdi%d' % y]), 3)])
                hdi_health.append([y, round(float(val['health%d' % y]), 3)])
                hdi_ed.append([y, round(float(val['ed%d' % y]), 3)])
                hdi_inc.append([y, round(float(val['income%d' % y]), 3)])

                # The change is measured against the first year that has
                # complete data.  It is skipped when the current year is
                # blank, because float('') would abort the whole run.
                if y != current_year and not change and val[current_key] != "":
                    change.append(round(float(val[current_key]), 3)
                                  - round(float(val['hdi%d' % y]), 3))

        if not change:
            change.append("")

        for ctry in self.geo:
            if ctry['name'] != val['country']:
                continue

            g = {
                "id": ctry['iso3'],
                "name": val['country'],
                "hdi": hdi_total,
                "health": hdi_health,
                "income": hdi_inc,
                "education": hdi_ed,
                "change": change[0],
                "rank": "n.a."
            }

            if val[current_key] == "":
                # No current-year HDI: publish empty series, unranked
                g.update(hdi="", health="", income="", education="")
            elif not ctry['iso3'].startswith("A-"):
                # iso3 codes starting with "A-" stay unranked
                rank = rank + 1
                g["rank"] = rank

            hdi_index.append(g)

            uid = ctry['iso3']
            hdi_dict[uid] = copy.deepcopy(g)
            hdi_dict[uid].pop('id')
            hdi_dict[uid].pop('name')

    hdi_dict['total'] = rank

    def rank_key(entry):
        # Ranked entries first in ascending order, then the "n.a." ones in
        # their original order.  This is the order Python 2 gives when
        # comparing ints with strings, spelled out explicitly.
        unranked = not isinstance(entry['rank'], int)
        return (unranked, 0 if unranked else entry['rank'])

    hdi_index_sort = sorted(hdi_index, key=rank_key)
    hdi_writeout = json.dumps(hdi_index_sort, sort_keys=True, separators=(',', ':'))

    # Context managers close the files even if a write fails
    with open('%s/hdi.json' % self.api_path, 'wb') as hdi_out:
        hdi_out.write(hdi_writeout)

    jsvalue = "var HDI = "
    jsondump = json.dumps(hdi_dict, sort_keys=True, separators=(',', ':'))
    writeout = jsvalue + jsondump

    with open('%s/hdi.js' % self.api_path, 'wb') as f_out:
        f_out.write(writeout)

    self.log('HDI json generated')


def extract_years(self, filenames):
    """Extract years from filenames.

    Filenames must be in this format: atlas_projects_2011.xml
    The year is the four characters before the 4-character extension.

    Arguments:
    filenames -- an array of filenames

    Returns:
    A list with one year string per filename
    """
    return [f[-8:-4] for f in filenames]