def getactivities(mepid, terms=[8]):
    urltpl = 'http://www.europarl.europa.eu/meps/en/%s/see_more.html?type=%s&leg=%s&index=%s'
    #ctjson={'content-type': 'application/json'}
    actions={}
    for type in activitymap.keys():
        actions[type]={}
        for term in terms:
            term=str(term)
            actions[type][term]=[]
            idx=0
            while True:
                _url = urltpl % (mepid,type,term,idx)
                try:
                    res=fetch_raw(_url, ignore=[500]) #, headers=ctjson)
                except:
                    logger.warn("failed to fetch %s" % _url)
                    break
                if res is None:
                    break
                if '<h2>Error while collecting data</h2>' in res:
                    break
                ret=json.loads(res)
                actions[type][term].extend(ret['documentList'])
                idx=ret['nextIndex']
                if idx in [-1,0]:
                    break
            if not actions[type][term]:
                del actions[type][term]
        if not actions[type]:
            del actions[type]
    return actions
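# Illustrative helper (not part of the original scraper): a minimal sketch of how
# the nested {type: {term: [documents]}} structure returned by getactivities()
# can be walked. The function name and log format are assumptions made for this
# example only.
def log_activity_counts(mepid, terms=[8]):
    acts = getactivities(mepid, terms=terms)
    for type, perterm in acts.items():
        for term, docs in perterm.items():
            logger.info("%s: %d '%s' items in term %s" % (mepid, len(docs), type, term))
    return acts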
def scrape_docs(tree):
    res=[]
    docs=tree.xpath('//table[@id="doc_gateway"]')
    tabs=[x.xpath('preceding-sibling::h2')[0].xpath('text()')[0] for x in docs]
    for inst, table in izip(tabs, docs):
        if inst in instmap.keys():
            for doc in lst2obj(table, docFields):
                if inst != 'Other institutions':
                    doc[u'body']=instmap[inst]
                else:
                    try:
                        doc[u'body']=otherinst[doc['type'].split(':')[0]]
                    except KeyError:
                        doc[u'body']=''
                if doc['body'] in ['EP','CSL'] and doc['type']=='Joint text approved by Conciliation Committee co-chairs':
                    # skip it twice and hope it's listed in the all documents, so it becomes EP/CSL :)
                    continue
                if doc.get('text'):
                    try:
                        summary=fetch(doc['text']['url'])
                    except:
                        continue
                    doc[u'text']=[unicode(tostring(x)) for x in summary.xpath('//div[@id="summary"]')]
                res.append(doc)
        elif inst != 'All':
            logger.warn(u"[!] unrecognized tab in documents %s" % inst)
    return res
def save(data, stats):
    res=Mep.get_by_id(data['UserID'])
    if res is not None:
        if 'Gender' not in data and 'Gender' in res.data:
            data['Gender']=res['Gender']
        d=diff(dict([(k,v) for k,v in res.data.items() if not k in ['meta', 'changes', 'activities',]]),
               dict([(k,v) for k,v in data.items() if not k in ['meta', 'changes', 'activities',]]))
        data['changes']=res.data.get('changes',{})
    else:
        d=diff({}, dict([(k,v) for k,v in data.items() if not k in ['meta', 'changes', 'activities',]]))
        data['changes']={}
    if d:
        now=datetime.utcnow().replace(microsecond=0)
        if not res:
            logger.info('adding %s' % (data['Name']['full']))
            data['meta']['created']=now
            if stats: stats[0]+=1
            data['changes']={}
        else:
            logger.info('updating %s' % (data['Name']['full']))
            logger.warn(jdump(d))
            data['meta']['updated']=now
            if stats: stats[1]+=1
            data['id']=res.id
            data['changes']=res.data.get('changes',{})
        data['changes'][now.isoformat()]=d
        Mep.upsert(data)
    del res
    if stats:
        del data
        return stats
    else:
        return data
def getMEPDeclarations(id):
    try:
        dom = fetch("http://www.europarl.europa.eu/meps/en/%s/_declarations.html" % (id), ignore=[500])
    except Exception as e:
        logger.error("mepdeclaration %s" % e)
        return []
    dif_links = dom.xpath('//h3[@id="sectionDIF"]/following-sibling::div//ul[@class="link_collection_noborder"]//a[@class="link_pdf"]/@href')
    dat_links = dom.xpath('//h3[@id="sectionDAT"]/following-sibling::div//ul[@class="link_collection_noborder"]//a[@class="link_pdf"]/@href')
    if not dif_links:
        logger.warn('[!] no declaration data http://www.europarl.europa.eu/meps/en/%s/_declarations.html' % id)
    return dif_links, dat_links
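# Illustrative sketch, not in the original source: getMEPDeclarations() returns a
# (dif_links, dat_links) tuple of PDF hrefs on success but a bare [] when the
# fetch fails, so callers need to handle both shapes. The function name and the
# dict keys below are assumptions for the example.
def declarations_dict(mepid):
    ret = getMEPDeclarations(mepid)
    if not ret:
        return {}
    dif_links, dat_links = ret
    return {u'Financial Declarations': dif_links,
            u'Declarations of Participation': dat_links}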
def getMEPRef(name):
    if not name: return
    mep=Mep.get_by_name(''.join(name.split()).lower())
    if not mep and u'ß' in name:
        mep=Mep.get_by_name(''.join(name.replace(u'ß','ss').split()).lower())
    if not mep and unicodedata.normalize('NFKD', unicode(name)).encode('ascii','ignore')!=name:
        mep=Mep.get_by_name(''.join(unicodedata.normalize('NFKD', unicode(name)).encode('ascii','ignore').decode('utf8').split()).lower())
    if not mep and len([x for x in name if ord(x)>128]):
        logger.warn('mep name contains non-ascii chars and was not found %s' % name)
        # todo
        #mep=db.ep_meps2.find_one({'Name.aliases': re.compile(''.join([x if ord(x)<128 else '.' for x in name]),re.I)},retfields)
        #mep=Mep.get_by_name()
    if mep:
        return mep
    else:
        logger.warn('[!] lookup oops %s' % name.encode('utf8'))
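# Hypothetical convenience wrapper, not in the original source: getMEPRef() falls
# through and implicitly returns None when no record matches, so callers that
# must have a reference can use a sketch like this to fail loudly instead of
# silently dropping the MEP. The name requireMEPRef is an assumption.
def requireMEPRef(name):
    ref = getMEPRef(name)
    if ref is None:
        raise LookupError(u'no MEP record found for %s' % name)
    return ref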
def scrape_epagents(table):
    heading=''.join(table.xpath('.//td[@class="players_committee"]')[0].xpath(".//text()")).strip()
    responsible=None
    if heading in [ "Committee responsible", "Former committee responsible"]:
        responsible=True
    elif heading in ["Committee for opinion", "Former committee for opinion"]:
        responsible=False
    else:
        logger.warn(u"[!] unknown committee heading %s" % heading)

    # handle shadows
    shadowelems=table.xpath('//a[@id="shadowRapporteurHeader"]/../following-sibling::div/p//span[@class="players_rapporter_text"]/a')
    tips=[t.xpath('text()')[0]
          if len(t.xpath('text()'))>0
          else groupurlmap[t.xpath("a")[0].get('href')]
               if len(t.xpath("a"))>0
               else groupurlmap[t.xpath("img")[0].get('src')]
          for t in table.xpath('//a[@id="shadowRapporteurHeader"]/../following-sibling::div//span[@class="tiptip"]')]
    shadows={}
    for shadow, group in izip_longest(shadowelems, tips):
        committee=shadow.xpath('./ancestor::td/preceding-sibling::td//acronym/text()')[0]
        if not committee in shadows: shadows[committee]=[]
        if group=='NI': group=u'NI'
        mep={u'name': unicode(shadow.xpath('text()')[0]),
             u'group': unicode(group)}
        tmp=getMEPRef(shadow.xpath('text()')[0])
        if tmp:
            mep[u'mepref']=tmp
        #else:
        #    raise IndexError
        shadows[committee].append(mep)
    # delete the unnecessary shadow elements - so the following regular lst2obj gets what it expects
    for todel in table.xpath('//a[@id="shadowRapporteurHeader"]/..'):
        parent=todel.xpath('..')[0]
        parent.remove(todel.xpath('following-sibling::div')[0])
        parent.remove(todel)

    # handle each row of agents
    agents=[]
    for agent in lst2obj(table,epagents,1):
        agent[u'responsible']=responsible
        agent[u'body']=u'EP'
        if agent.get('rapporteur'):
            meps=[]
            for mep in agent['rapporteur']:
                if unws(mep['name']).startswith("The committee decided not to give an opinion"):
                    del agent['rapporteur'][agent['rapporteur'].index(mep)]
                    agent[u'opinion']=None
                    continue
                tmp=getMEPRef(mep['name'])
                if tmp:
                    meps.append({u'mepref': tmp,
                                 u'group': mep['group'],
                                 u'name': mep['name']})
                else:
                    meps.append({u'group': mep['group'],
                                 u'name': mep['name']})
            agent[u'rapporteur']=meps

        abbr=agent['committee'][:4]
        if abbr=='BUDE': abbr='BUDG'
        if not abbr in COMMITTEE_MAP.keys():
            logger.warn(u"[!] unknown committee abbrev %s" % abbr)
            agent[u'committee_full']=agent['committee']
            if agent['committee'][4]==' ' and abbr.isalpha():
                agent[u'committee']=abbr
        else:
            agent[u'committee_full']=agent['committee'][5:]
            agent[u'committee']=abbr

        if agent.get(u'committee') in shadows.keys():
            agent[u'shadows']=shadows[agent['committee']]

        if not agent in agents: agents.append(agent)
    return agents
def merge_events(events,committees,agents):
    bydate={}
    for event in events:
        if not event['date'] in bydate:
            bydate[event['date']]=[event]
        else:
            bydate[event['date']].append(event)
    #pprint.pprint(sorted([(k,[dict([(k1,v1) for k1,v1 in i.items() if k1!='text']) for i in v]) for k,v in bydate.items()]))
    res=[]
    # merge items to events.
    for date, items in bydate.items():
        actors={} # collects items/actor for a given date
        for item in items:
            if not item.get('body'):
                # find out body, or drop
                body=stage2inst.get(item.get('type'))
                if body:
                    item[u'body']=body
                elif item.get('type')=='Final act published in Official Journal':
                    # this really has no body at all
                    res.append(item)
                    continue
                else:
                    logger.warn('unknown body: %s' % item.get('type'))
                    item[u'body']='unknown'
            # new institution for this date
            if not item['body'] in actors:
                # new body for this date
                actors[item['body']]=item
                if 'doc' in actors[item['body']]:
                    docs=merge_new_docs(actors[item['body']]['doc'], item)
                    del actors[item['body']]['doc']
                    actors[item['body']][u'docs']=docs
                cmts=getCommittee(item,committees)
                if cmts:
                    actors[item['body']][u'committees']=sorted(cmts, key=itemgetter('committee'))
                if item['body']=='EC':
                    actors[u'EC'][u'commission']=sorted([{u'DG': x['dg'], u'Commissioner': x['commissioner']} if x.get('commissioner') else {u'DG': x['dg']}
                                                         for x in agents if x['body']=='EC'])
                continue
            # merge any docs
            if 'doc' in item:
                docs=merge_new_docs(item['doc'], item)
                for doc in docs:
                    skip=False
                    # update docs that are already in there, but with a different 'type'
                    for cdoc in actors[item['body']].get('docs',[]):
                        if cdoc.get('url')==doc.get('url') or cdoc.get('title')==doc.get('title'):
                            cdoc.update(doc)
                            skip=True
                            break
                    if skip: continue
                    try:
                        actors[item['body']][u'docs'].append(doc)
                    except KeyError:
                        actors[item['body']][u'docs']=[doc]
                del item['doc']
            # merge any fields not yet in the actor
            actors[item['body']].update([(k,v) for k,v in item.items() if k not in actors[item['body']]])
        res.extend([x for x in actors.values() if x])
    #pprint.pprint(sorted(res, key=itemgetter('date')))
    #pprint.pprint(sorted([dict([(k1,v1) for k1,v1 in v.items() if k1!='text']) for v in res], key=itemgetter('date')))
    return res
def parseMember(userid):
    url='http://www.europarl.europa.eu/meps/en/%s/_history.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url, ignore=[500])
    data = {
        u'active': False,
        u'Photo': unicode(urljoin(BASE_URL,"/mepphoto/%s.jpg" % userid)),
        u'meta': {u'url': url}
        }
    mepdiv=root.xpath('//div[@class="zone_info_mep_transparent_mep_details"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(unws(' '.join(mepdiv.xpath('.//li[@class="mep_name"]//text()'))))

    borntxt=mepdiv.xpath('.//span[@class="more_info"]/text()')
    if len(borntxt)>0:
        if unws(borntxt[-1]).startswith('Date of death:'):
            try:
                data[u'Death'] = datetime.strptime(unws(borntxt[-1]), u"Date of death: %d %B %Y")
            except ValueError:
                logger.warn('[!] failed to scrape death date %s' % url)
                logger.warn(traceback.format_exc())
            tmp = borntxt[-2].split(',', 1)
        else:
            tmp = borntxt[-1].split(',', 1)
        if len(tmp)==2:
            (d, p) = tmp
        else:
            d,p = tmp[0], None
        try:
            data[u'Birth'] = { u'date': datetime.strptime(unws(d), u"Date of birth: %d %B %Y")}
        except ValueError:
            logger.warn(traceback.format_exc())
        finally:
            if p:
                if 'Birth' in data:
                    data[u'Birth'][u'place'] = unws(p)
                else:
                    data[u'Birth'] = unws(p)
    else:
        logger.warn('[!] no birth data %s' % url)

    # scrape stuff from right column
    addif(data,u'RSS',[unicode(urljoin(BASE_URL,x.get('href')),'utf8')
                       for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_rss"]')])
    addif(data,u'Homepage',[x.get('href')
                            for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_website"]')])
    addif(data,u'Twitter',[x.get('href')
                           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_twitt"]')])
    addif(data,u'Facebook',[x.get('href')
                            for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_fb"]')])
    addif(data,u'Mail',[x.get('href')[7:].replace('[dot]','.').replace('[at]','@')[::-1]
                        for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_email"]')])

    # contact information
    for span in root.xpath('//div[@id="content_right"]//h3'):
        title=unws(''.join(span.xpath('.//text()')))
        if title == "Contacts":
            addif(data,u'Addresses',getAddress(span))

    # scrape main content
    for section in root.xpath('//div[@id="content_left"]/div[@class="boxcontent nobackground"]/h4'):
        key=unws(''.join(section.xpath('.//text()')))
        if key=="National parties":
            # constituencies
            key='Constituencies'
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, party = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if not key in data: data[key]=[]
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                cstart = party.rfind(' (')
                if party[cstart+2:-1] in SEIRTNUOC:
                    country = party[cstart+2:-1]
                    party = party[:cstart]
                else:
                    logger.warn('unknown country: %s' % party[cstart+2:-1])
                    country='unknown'
                #print etree.tostring(constlm, pretty_print=True)
                data[key].append({
                    u'party':   party,
                    u'country': country,
                    u'start':   datetime.strptime(unws(start), u"%d.%m.%Y"),
                    u'end':     datetime.strptime(unws(end), u"%d.%m.%Y"),
                    })
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President', 'Observer', 'Quaestor', 'Substitute observer']:
            # memberships in various committees, delegations and EP mgt
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, org = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                item={u'role': key,
                      u'abbr': COMMITTEE_MAP.get(org),
                      u'Organization': org,
                      u'start': datetime.strptime(unws(start), u"%d.%m.%Y"),
                      u'end': datetime.strptime(unws(end), u"%d.%m.%Y"),
                      }
                for start, field in orgmaps:
                    if item['abbr'] in COMMITTEE_MAP or item['Organization'].startswith(start):
                        if not field in data: data[field]=[]
                        if field=='Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id']=COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        elif key == u'Political groups':
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                interval, org = line.split(' : ',1)
                tmp = org.split(u' - ')
                if len(tmp)>1:
                    org = ' - '.join(tmp[:-1])
                    role = tmp[-1]
                elif org.endswith(' -'):
                    org=org[:-2]
                    role=''
                else:
                    logger.error('[!] political group line %s' % line)
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                if not u'Groups' in data: data[u'Groups']=[]
                data[u'Groups'].append(
                    {u'role':         role,
                     u'Organization': org,
                     u'country':      COUNTRIES.get(unws(constlm.get('class')).upper(), 'unknown country: %s' % unws(constlm.get('class'))),
                     u'groupid':      group_map[org],
                     u'start':        datetime.strptime(unws(start), u"%d.%m.%Y"),
                     u'end':          datetime.strptime(unws(end), u"%d.%m.%Y"),
                     })
        else:
            logger.error('[!] unknown field %s' % key)

    # sort all lists in descending order
    for fld in ['Constituencies', 'Groups', 'Committees', 'Delegations', 'Staff']:
        if not fld in data: continue
        data[fld]=sorted(data[fld],
                         key=lambda x: x.get('end',x['start']),
                         reverse=True)

    # get CV - page (is on separate http path :/)
    cvurl='http://www.europarl.europa.eu/meps/en/%s/_cv.html' % userid
    root = fetch(cvurl, ignore=[500])
    data[u'CV']={}
    for sec in root.xpath('//h3[@class="collapsible"]'):
        section=unws(''.join(sec.xpath('.//text()')))
        data[u'CV'][section]=[]
        for line in sec.xpath('./following-sibling::div[1]//li'):
            data[u'CV'][section].append(unws(''.join(line.xpath('.//text()'))))

    # get assistants, also on a separate page :/
    assurl='http://www.europarl.europa.eu/meps/en/%s/_assistants.html' % userid
    root = fetch(assurl, ignore=[500])
    for h3 in root.xpath('//h3[@id="section"]'):
        title=unws(''.join(h3.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'],
                  title.lower().split()[0],
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])
        elif title in ['Accredited assistants (grouping)', 'Local assistants (grouping)',
                       'Service providers', ' Trainees', 'Paying agents (grouping)', 'Paying agents']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'],
                  title.lower(),
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])
    return data
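# Hedged glue sketch, not in the original source: parseMember() returns the raw
# profile dict but does not set the 'UserID' that save() keys on, so a caller has
# to attach it (and any activity data) before persisting. The function name and
# the 'activities' storage key are assumptions for illustration; save() already
# excludes 'activities' from its diff.
def scrape_and_save(userid, stats=None):
    data = parseMember(userid)
    data[u'UserID'] = userid                      # save() calls Mep.get_by_id(data['UserID'])
    data[u'activities'] = getactivities(userid)   # assumed storage key
    return save(data, stats)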