def crawl(term, test=[], **kwargs):
    seen = set()
    url = "https://www.europarl.europa.eu/committees/en/documents/search?committeeMnemoCode=%s&textualSearchMode=TITLE&textualSearch=&documentTypeCode=AGEN&reporterPersId=&procedureYear=&procedureNum=&procedureCodeType=&peNumber=&aNumber=&aNumberYear=&documentDateFrom=&documentDateTo=&meetingDateFrom=&meetingDateTo=&performSearch=true&term=%s&page=%s&pageSize={}".format(itemsPerPage)
    jobs = []
    for com in (k for k in test or COMMITTEE_MAP.keys() if len(k) == 4):
        i = 0
        log(3, 'crawling %s, term: %s' % (com, term))
        try:
            root = fetch(url % (com, term, i))
        except requests.exceptions.HTTPError as e:
            #if e.response.status_code == 500:
            log(3, "failed to get list of draft agendas for %s in term %d, http error code: %s" % (com, term, e.response.status_code))
            continue
        prev = []
        while True:
            log(3, "crawling comagenda search page %s for %s term %s" % (i, com, term))
            tmp = []
            for a in root.xpath('//div[@class="erpl_document-header"]/h3/a'):
                u = a.get('href', '')
                if len(u) <= 13:
                    log(2, 'url is too short, skipping: "%s"' % u)
                    continue
                if u in seen:
                    log(3, "skipping url: %s" % repr(u))
                    continue
                seen.add(u)
                tmp.append(u)
                try:
                    payload = dict(kwargs)
                    payload['url'] = u
                    payload['committee'] = com
                    if test:
                        print(payload)
                    else:
                        add_job('comagenda', payload=payload)
                except:
                    print(u)
            if not tmp or prev == tmp or len(tmp) < itemsPerPage:
                break
            prev = tmp
            i += 1
            try:
                root = fetch(url % (com, term, i))
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 500:
                    log(3, "failed to page %s of draft agendas for %s in term %d" % (i, com, term))
                break
def get_all_dossiers():
    for year in xrange(datetime.date.today().year, 1972, -1):
        tree=fetch('http://www.europarl.europa.eu/oeil/widgets/resultwidget.do?lang=en&noHeader=false&q=objectReferenceN:N-%s/????\(*\)' % (year))
        count=int(tree.xpath('//span[@class="ep_title resultNum pdfHide"]/text()')[0].strip()[len('Results found: '):])
        tree=fetch('http://www.europarl.europa.eu/oeil/widgets/resultwidget.do?lang=en&limit=%s&noHeader=false&q=objectReferenceN:N-%s/????\(*\)' % (count,year))
        links=tree.xpath('//a[@class="reference rssEntry_id rssEntry_title rssEntry_updated"]')
        for link in links:
            yield (urljoin(BASE_URL, link.get('href')),
                   (link.xpath('text()') or [''])[0])
def scrape(target):
    url,title=target
    try:
        logger.info('scrape '+url)
        tree=fetch(url)
        agents,committees=scrape_actors(tree)
        forecasts=lst2obj((tree.xpath('//table[@id="forecast"]') or [None])[0],forecastFields)
        events=scrape_events(tree)
        procedure=scrape_basic(tree)
        if not procedure: return
        allevents=agents+scrape_docs(tree)+events+forecasts
        other=[x for x in allevents if not x.get('date')]
        allevents=sorted([x for x in allevents if x.get('date')],key=itemgetter('date'))
        allevents=merge_events(allevents,committees, agents)
        res={u'meta': {'source': url,
                       'timestamp': datetime.datetime.utcnow()},
             u'procedure': procedure,
             u'links': form2obj((tree.xpath('//table[@id="external_links"]') or [None])[0]),
             u'committees': committees,
             u'activities': sorted(allevents, key=itemgetter('date')),
             u'other': other,
             }
        tmp=url.split('id=')
        if len(tmp)>1:
            res['meta']['id']=int(tmp[1])
        # check for "final act"
        finalas=tree.xpath('//div[@id="final_act"]//a')
        final={}
        for link in finalas:
            if link.get('class')=='sumbutton':
                try: summary=fetch("http://www.europarl.europa.eu%s" % link.get('href'))
                except: continue
                final['text']=[unicode(tostring(x)) for x in summary.xpath('//div[@id="summary"]')]
            else:
                if not 'docs' in final: final['docs']=[]
                final['docs'].append({'title': link.xpath('text()')[0].strip(),
                                      'url': link.get('href')})
        if final and final.get('docs'):
            res[u'procedure'][u'final']=final.get('docs',[{}])[0]
            for item in res['activities']:
                if item.get('type')==u'Final act published in Official Journal':
                    if final.get('text'):
                        item[u'text']=final['text']
                    if len(final.get('docs'))>1:
                        if not 'docs' in item:
                            item[u'docs']=final['docs']
                        else:
                            item[u'docs'].extend(final['docs'])
                    break
        return res
    except:
        logger.error("%s\n%s" % (url,traceback.format_exc()))
        return
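# Illustrative usage sketch (an assumption, not part of the original module): the
# generator above yields (url, title) tuples and scrape() consumes exactly such a
# tuple, so a minimal driver could chain them like this, relying on the same
# module-level helpers (fetch, logger, ...) the functions above already use.
def run_dossier_scrape():
    for target in get_all_dossiers():
        dossier = scrape(target)  # dict with 'meta', 'procedure', 'activities', ... or None on error
        if dossier:
            logger.info("scraped %s with %d activities" % (dossier['meta']['source'], len(dossier['activities'])))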
def crawl(term, update=False, test=[], **kwargs):
    seen = set()
    url="https://www.europarl.europa.eu/committees/en/documents/search?committeeMnemoCode=%s&textualSearchMode=TITLE&textualSearch=&documentTypeCode=AMCO&reporterPersId=&procedureYear=&procedureNum=&procedureCodeType=&peNumber=&aNumber=&aNumberYear=&documentDateFrom=&documentDateTo=&meetingDateFrom=&meetingDateTo=&performSearch=true&term=%s&page=%s&pageSize={}".format(itemsPerPage)
    jobs = []
    for com in (k for k in test or COMMITTEE_MAP.keys() if len(k)==4):
        i=0
        log(3,'crawling %s, term: %s' % (com, term))
        try:
            root=fetch(url % (com, term, i))
        except requests.exceptions.HTTPError as e:
            #if e.response.status_code == 500:
            log(3, "failed to get list of amendments for %s in term %d, http error code: %s" % (com, term, e.response.status_code))
            continue
        prev=[]
        while True:
            log(3, "crawling amendments search page %s for %s term %s" % (i, com, term))
            tmp=[]
            for a in root.xpath('//a[@class="erpl_document-subtitle-pdf"]'):
                u=a.get('href','')
                if len(u)<=13:
                    log(2,'url is too short, skipping: "%s"' % u)
                    continue
                if u in seen or u in skipurls or (not u.endswith('EN') and not u.endswith('_EN.pdf')):
                    log(3,"skipping url: %s" % repr(u))
                    continue
                seen.add(u)
                tmp.append(u)
                rs = a.xpath('../../following-sibling::div/span[@class="erpl_document-subtitle-author"]')
                r = [y for y in [junws(x) for x in rs] if y]
                try:
                    payload = dict(kwargs)
                    payload['url'] = u
                    payload['meps'] = r
                    if test:
                        print(payload)
                    else:
                        add_job('amendment', payload=payload)
                except:
                    print(u, r)
            if not tmp or prev==tmp or len(tmp) < itemsPerPage:
                break
            prev=tmp
            if update: break
            i+=1
            try:
                root=fetch(url % (com, term, i))
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 500:
                    log(3, "failed to page %s of amendments for %s in term %d" % (i, com, term))
                break
def crawler(query='current'):
    if query=='unlisted':
        for mep in unlisted:
            yield mep
    elif query=='all':
        for letter in xrange(26):
            tmp=meplists[query]
            a=ord('A')
            root=fetch(tmp%chr(a+letter), ignore=[500])
            for meplm in root.xpath('//id/text()'):
                yield int(meplm)
    else:
        root=fetch(meplists[query], ignore=[500])
        for meplm in root.xpath('//id/text()'):
            yield int(meplm)
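# Illustrative usage sketch (an assumption, not part of the original module):
# crawler() is a generator of numeric MEP ids, so callers can simply iterate it,
# e.g. to collect the ids of all currently listed MEPs or of the full directory.
def collect_mep_ids(query='current'):
    return sorted(set(crawler(query)))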
def get_all_dossiers(**kwargs):
    for year in range(datetime.date.today().year, 1971, -1):
        tree = fetch('https://oeil.secure.europarl.europa.eu/oeil/widgets/resultwidget.do?lang=en&noHeader=false&q=objectReferenceN:N-%s/*\(*\)' % (year))
        tmp = tree.xpath('//span[@class="ep_name" and (starts-with(normalize-space(),"Results found :") or starts-with(normalize-space(),"Result found :"))]/text()')
        if not tmp:
            log(1, "no dossiers found for %d" % year)
            raise ValueError("failed to find number of dossiers for year %d" % year)
        tmp = unws(tmp[0])
        count = int(tmp[tmp.index(":") + 1:])
        log(4, "year %d, count %d" % (year, count))
        #tree=fetch('https://oeil.secure.europarl.europa.eu/oeil/popups/printresultlist.xml?q=objectReferenceN:N-%s/????\(*\)&lang=en&s1&all&limit=%s&lang=en'
        #           % (year, count), prune_xml=True)
        tree = fromstring(
            fetch_raw('https://oeil.secure.europarl.europa.eu/oeil/popups/printresultlist.xml?q=objectReferenceN:N-%s/*\(*\)&lang=en&s1&all&limit=%s&lang=en'
                      % (year, count)).encode("utf8"))
        items = tree.xpath('//item')
        i = 0
        for item in items:
            url = html.unescape(urljoin(BASE_URL, str(item.xpath('./link/text()')[0])))
            ref = unws(item.xpath('./reference/text()')[0])
            if '*' in ref:
                ref = ref[:ref.index('*')]
            log(4, 'adding dossier scraping job %s' % url)
            payload = dict(kwargs)
            payload['url'] = url
            add_job('dossier', payload=payload)
            i += 1
        if i != count:
            log(1, "total %d, expected %d" % (i, count))
def scrape_docs(tree):
    res=[]
    docs=tree.xpath('//table[@id="doc_gateway"]')
    tabs=[x.xpath('preceding-sibling::h2')[0].xpath('text()')[0] for x in docs]
    for inst, table in izip(tabs, docs):
        if inst in instmap.keys():
            for doc in lst2obj(table, docFields):
                if inst != 'Other institutions':
                    doc[u'body']=instmap[inst]
                else:
                    try:
                        doc[u'body']=otherinst[doc['type'].split(':')[0]]
                    except KeyError:
                        doc[u'body']=''
                if doc['body'] in ['EP','CSL'] and doc['type']=='Joint text approved by Conciliation Committee co-chairs':
                    # skip it twice and hope it's listed in the all documents, so it becomes EP/CSL :)
                    continue
                if doc.get('text'):
                    try: summary=fetch(doc['text']['url'])
                    except: continue
                    doc[u'text']=[unicode(tostring(x)) for x in summary.xpath('//div[@id="summary"]')]
                res.append(doc)
        elif inst != 'All':
            logger.warn(u"[!] unrecognized tab in documents %s" % inst)
    return res
def scrape(all=False, **kwargs):
    if all:
        sources = ['http://www.europarl.europa.eu/meps/en/directory/xml?letter=&leg=']
    else:
        sources = ['http://www.europarl.europa.eu/meps/en/incoming-outgoing/incoming/xml',
                   'http://www.europarl.europa.eu/meps/en/incoming-outgoing/outgoing/xml',
                   'http://www.europarl.europa.eu/meps/en/full-list/xml']
    payload={}
    if 'onfinished' in kwargs:
        payload['onfinished']=kwargs['onfinished']
    if all:
        actives = {e['UserID'] for e in db.meps_by_activity(True)}
        inactives = {e['UserID'] for e in db.meps_by_activity(False)}
        meps = actives | inactives
        for unlisted in [1018, 26833, 1040, 1002, 2046, 23286, 28384, 1866, 28386,
                         1275, 2187, 34004, 28309, 1490, 28169, 28289, 28841, 1566,
                         2174, 4281, 28147, 28302]:
            meps.discard(unlisted)
            payload['id']=unlisted
            add_job('mep', dict(payload))
    for src in sources:
        root = fetch(src, prune_xml=True)
        for id in root.xpath("//mep/id/text()"):
            if all:
                meps.discard(int(id))
            payload['id']=int(id)
            add_job('mep', dict(payload))
    if all:
        log(3,"mepids not in unlisted nor in directory {!r}".format(meps))
        for id in meps:
            payload['id']=id
            add_job('mep', dict(payload))
def scrape(url):
    log(4, "scraping %s" % url)
    root = fetch(url)
    res = {'responsible': [], 'opinions': []}
    for opinion in root.xpath('//p/span[contains(text(),"FINAL VOTE BY ROLL CALL IN COMMITTEE ASKED FOR OPINION")]'):
        procedure = opinion.xpath('../../p/span[contains(text(),"PROCEDURE – COMMITTEE ASKED FOR OPINION")]')
        if len(procedure) != 1:
            log(1, "found %s procedures for opinion in %s" % (len(procedure), url))
            raise ValueError
        proc_table = procedure[0].xpath('../following-sibling::p/table')
        proc = extract_proc(proc_table, url)
        date = datetime.strptime(proc['Date adopted'], "%d.%m.%Y")
        cmte = proc['Opinion by Date announced in plenary'].split()[0]
        res_op = {'proc': proc, 'date': date, 'committee': cmte, 'votes': {}}
        res['opinions'].append(res_op)
        for table in opinion.xpath('../following-sibling::p/table'):
            if table == proc_table[0]:
                continue
            vote = extract_table(table, url, date)
            res_op['votes'][vote['type']] = vote
            del vote['type']

    responsible = root.xpath('//tr[@class="doc_title"]//span[contains(text(),"FINAL VOTE BY ROLL CALL IN COMMITTEE RESPONSIBLE")]')
    if len(responsible) != 1:
        log(1, "number of responsible rc votes is not 1: %s" % url)
        raise ValueError
    responsible = responsible[0]

    proc = root.xpath('//tr[@class="doc_title"]//span[contains(text(),"PROCEDURE – COMMITTEE RESPONSIBLE")]')
    if len(proc) != 1:
        log(1, "could not find exactly one procedure for the responsible committee in %s" % url)
        raise ValueError
    proc = extract_proc(proc[0].xpath('../../following-sibling::tr/td/p/table'), url)
    cmte = proc['Committee responsible Date announced in plenary'].split()[0]
    date = datetime.strptime(proc['Date adopted'], "%d.%m.%Y")
    res_resp = {'proc': proc, 'date': date, 'committee': cmte, 'votes': {}}
    res['responsible'].append(res_resp)
    for table in responsible.xpath('../../following-sibling::tr/td/p/table'):
        vote = extract_table(table, url, date)
        res_resp['votes'][vote['type']] = vote
        del vote['type']
    return res
def scrape_events(tree):
    res=[]
    for item in lst2obj((tree.xpath('//table[@id="key_events"]') or [None])[0],eventFields):
        if item.get('text'):
            try: summary=fetch(item['text']['url'])
            except: continue
            item['text']=[unicode(tostring(x)) for x in summary.xpath('//div[@id="summary"]')]
        res.append(item)
    return res
def getMEPDeclarations(id):
    try:
        dom = fetch("http://www.europarl.europa.eu/meps/en/%s/_declarations.html" % (id), ignore=[500])
    except Exception as e:
        logger.error("mepdeclaration %s" % e)
        return []
    dif_links = dom.xpath('//h3[@id="sectionDIF"]/following-sibling::div//ul[@class="link_collection_noborder"]//a[@class="link_pdf"]/@href')
    dat_links = dom.xpath('//h3[@id="sectionDAT"]/following-sibling::div//ul[@class="link_collection_noborder"]//a[@class="link_pdf"]/@href')
    if not dif_links:
        logger.warn('[!] no declaration data http://www.europarl.europa.eu/meps/en/%s/_declarations.html' % id)
    return dif_links, dat_links
def crawl(year, term, **kwargs):
    listurl = 'http://www.europarl.europa.eu/plenary/en/minutes.html'
    PARAMS = '?clean=false&leg=%s&refSittingDateStart=01/01/%s&refSittingDateEnd=31/12/%s&miType=title&miText=Roll-call+votes&tabActif=tabResult'
    params = PARAMS % (term, year, year)
    root=fetch(listurl+params)
    prevdates=None
    dates=root.xpath('//span[@class="date"]/text()')
    i=0
    while dates and dates!=prevdates:
        for date in dates:
            if not date.strip(): continue
            #print(term, date.strip())
            date = datetime.strptime(date.strip(), "%d-%m-%Y").strftime("%Y-%m-%d")
            payload = dict(kwargs)
            payload['term'] = term
            payload['date'] = date
            add_job('pvote', payload=payload)
        i+=1
        root=fetch("%s%s&action=%s" % (listurl,params,i))
        prevdates=dates
        dates=root.xpath('//span[@class="date"]/text()')
def checkUrl(url):
    if not url: return False
    if url in seenurls:
        return seenurls[url]
    try:
        res=fetch(url)
    except Exception as e:
        #print >>sys.stderr, "[!] checkurl failed in %s\n%s" % (url, e)
        seenurls[url]=False
    else:
        seenurls[url]=(res.xpath('//h1/text()') or [''])[0]!="Not available in English."
    return seenurls[url]
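# Illustrative usage sketch (an assumption, not part of the original module):
# checkUrl() memoizes its verdict per url in the module-level `seenurls` dict and
# returns False when the page could not be fetched or is "Not available in
# English.", so it works as a cheap guard before a full fetch. The default url
# below is a made-up example.
def fetch_if_english(url='http://www.europarl.europa.eu/oeil/popups/summary.do?id=0000000&t=e&l=en'):
    if checkUrl(url):
        return fetch(url)
    return None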
def scrape(url, committee, **kwargs):
    comid = committee
    root = fetch(url)
    lines = [x for x in root.xpath('//td[@class="contents"]/div/*')
             if unws(' '.join(x.xpath('.//text()')))]
    lines = [x for x in lines
             if unws(' '.join(x.xpath('.//text()'))) not in ['<EPHeader>', '</EPHeader>']]
    if not len(lines):
        return
    if not unws(' '.join(lines[2].xpath('.//text()'))) in ['DRAFT AGENDA', '<TitreType> DRAFT AGENDA </TitreType>']:
        log(3, "not DRAFT AGENDA %s in %s" % (unws(' '.join(lines[2].xpath('.//text()'))), url))
    agenda = {
        u'committee': comid,
        u'committee_full': unws(' '.join(lines[0].xpath('.//text()'))),
        u'src': url,
    }
    i = 1
    if unws(' '.join(lines[3].xpath('.//text()'))) == "INTERPARLIAMENTARY COMMITTEE MEETING":
        log(2, "skipping interparl com meet")
        return
    if len(lines) >= 7 and unws(' '.join(lines[6].xpath('.//text()'))).startswith('Room'):
        agenda.update({
            u'docid': unws(' '.join(lines[1].xpath('.//text()'))),
            u'type': unws(' '.join(lines[3].xpath('.//text()'))),
            u'time': toTime(unws(' '.join(lines[4].xpath('.//text()')))),
            u'city': unws(' '.join(lines[5].xpath('.//text()'))),
            u'room': unws(' '.join(lines[6].xpath('.//text()')))[6:],
        })
        i = 7
    itemcnt = 0
    item = {}
    schedule = None
    res = []
    while i < len(lines):
        line = lines[i]
        i += 1
        txt = unws(' '.join(line.xpath('.//text()')))
        if txt in ['* * *', '***']:
            continue  # skip end of schedule block

        # 20 December 2011, 16.00 – 16.30
        tmp = toTime(txt)
        if tmp:
            schedule = tmp
            if i < len(lines) and unws(' '.join(lines[i].xpath('.//text()'))) == 'In camera':
                schedule[u'incamera'] = True
                i += 1
            continue

        if line.tag == 'div':
            item[u'actors'] = getactors(line)
            continue

        firsttoken = txt.split()[0]
        # 6. Alternative dispute resolution for consumer disputes and
        #    amending Regulation (EC) No 2006/2004 and Directive
        #    2009/22/EC (Directive on consumer ADR)
        if firsttoken[-1] == '.' and firsttoken[:-1].isdigit() and itemcnt + 1 == int(firsttoken[:-1]):
            if item:
                res.append(item)
            itemcnt += 1
            item = copy.deepcopy(agenda)
            item.update({
                u'title': ' '.join(txt.split()[1:]),
                u'seq_no': itemcnt,
            })
            if schedule:
                item.update(schedule)
            continue

        # trailing list of "details"
        # · Presentation by the Commission of the proposal & Impact Assessment
        # · Exchange of views
        if firsttoken == u"·":
            if not 'list' in item:
                item[u'list'] = []
            tmp = ' '.join(txt.split()[1:])
            if tmp.startswith('Deadline for tabling amendments:'):
                try:
                    item[u'tabling_deadline'] = datetime.strptime(tmp.split(':')[1].strip(), "%d %B %Y, %H.%M")
                except ValueError:
                    try:
                        item[u'tabling_deadline'] = datetime.strptime(tmp.split(':')[1].strip(), "%d.%m.%Y at %H.%M")
                    except:
                        log(2, '[$] unknown tabling deadline format %s' % unws(tmp))
            item[u'list'].append(tmp)
            continue

        # committee dossier
        # IMCO/7/08130
        if txt.startswith("%s/7/" % comid) and len(txt) == 12:
            item[u'comdossier'] = txt
            continue

        # ***I 2011/0373(COD) COM(2011)0793 – C7-0454/2011
        tmp = getdocs(txt)
        if tmp:
            item.update(tmp)
            continue

        # fall-through line
        log(4, "(falltrough) %s %s" % (line.tag, txt.encode('utf8')))
    if item:
        res.append(item)
    save(res)
    return res
def parse_history(id, root, mep):
    for term in root.xpath('//div[@id="sectionsNavPositionInitial"]//div[@class="erpl_side-navigation"]/div/ul/li//span[text()="History of parliamentary service"]/../following-sibling::div//ul/li//a/span[@class="t-x"]/text()'):
        if not term.endswith("parliamentary term"):
            log(2, 'history menu item does not end as expected with "parliamentary term": %s http://www.europarl.europa.eu/meps/en/%s/name/declarations' % (term, id))
            raise ValueError
            #continue
        term = int(term[0])
        if (id, term) in {(124870, 9), (129141, 9)}:
            continue  # jeppe kofod, and frans timmermanns never really got started.

        root = fetch("http://www.europarl.europa.eu/meps/en/%s/name/history/%s" % (id, term))
        body = root.xpath('//div[@id="status"]')[0]
        for title in body.xpath('.//h4'):
            key = unws(''.join(title.xpath('.//text()')))
            if key in [None, '']:
                log(2, "empty history section http://www.europarl.europa.eu/meps/en/%s/name/history/%s" % (id, term))
                raise ValueError
                #continue
            #mep[key] = []
            for item in title.xpath('./following-sibling::ul/li'):
                interval = unws(''.join(item.xpath('./strong/text()')))
                post = item.xpath('./strong/following-sibling::text()')[0][3:]
                if key in ["National parties", "Constituencies"]:
                    key = 'Constituencies'
                    # parse date interval
                    try:
                        start, end = parse_hist_date(interval)
                    except:
                        log(1, "illegal date interval: %s http://www.europarl.europa.eu/meps/en/%s/name/history/%s" % (interval, id, term))
                        raise ValueError
                        #continue
                    # parse party and country
                    cstart = post.rfind(' (')
                    if post[cstart + 2:-1] in SEIRTNUOC:
                        country = post[cstart + 2:-1]
                        party = post[:cstart]
                    else:
                        log(2, '%s unknown country: %s' % (id, post[cstart + 2:-1]))
                        raise ValueError
                        party = 'unknown'
                        country = 'unknown'
                    if not key in mep: mep[key] = []
                    mep[key].append({
                        u'party': party,
                        u'country': country,
                        u'start': start,
                        u'end': end,
                        'term': term,
                    })
                    if end == datetime.strptime("31.12.9999", u"%d.%m.%Y"):
                        mep['active'] = True
                elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President',
                             'Vice-President', 'Observer', 'Quaestor', 'Substitute observer']:
                    # memberships in various committees, delegations and EP mgt
                    try:
                        start, end = parse_hist_date(interval)
                    except:
                        log(2, "illegal date interval: %s http://www.europarl.europa.eu/meps/en/%s/name/history/%s" % (interval, id, term))
                        raise ValueError
                        #continue
                    item = {
                        u'role': key,
                        u'Organization': unws(post),
                        u'start': start,
                        u'end': end,
                        u'term': term,
                    }
                    for start, field in ORGMAPS:
                        if item['Organization'].startswith(start):
                            if field == 'Committees':
                                if item['Organization'] in COMMITTEE_MAP:
                                    item[u'abbr'] = COMMITTEE_MAP[item['Organization']]
                                else:
                                    log(5, "no abbr found for committee: %s" % item['Organization'])
                            if field == 'Delegations':
                                if item['Organization'] in DELEGATIONS:
                                    item[u'abbr'] = DELEGATIONS[item['Organization']]
                                else:
                                    log(5, "no abbr found for delegation: %s" % item['Organization'])
                            if not field in mep: mep[field] = []
                            mep[field].append(item)
                            break
                elif key == u'Political groups':
                    try:
                        start, end = parse_hist_date(interval)
                    except:
                        log(1, "illegal date interval: %s http://www.europarl.europa.eu/meps/en/%s/name/history/%s" % (interval, id, term))
                        raise ValueError
                        #continue
                    tmp = post.split(u' - ')
                    if len(tmp) > 1:
                        org = ' - '.join(tmp[:-1])
                        role = tmp[-1]
                    elif post.endswith(' -'):
                        org = post[:-2]
                        role = ''
                    elif post in ['Non-attached Members', 'Non-attached']:
                        org = post
                        role = 'Member'
                    else:
                        log(2, '[!] political group line "%s", http://www.europarl.europa.eu/meps/en/%s/name/history/%s' % (post, id, term))
                        raise ValueError
                        #continue
                    if not u'Groups' in mep: mep[u'Groups'] = []
                    if not org in GROUP_MAP:
                        log(5, "no groupid found for group: %s" % org)
                    mep[u'Groups'].append({
                        u'role': role,
                        u'Organization': org,
                        # u'country': country, # this value is missing from the latest EP website
                        u'groupid': GROUP_MAP.get(org, org),
                        u'start': start,
                        u'end': end,
                    })
                else:
                    log(2, '[!] unknown field "%s" http://www.europarl.europa.eu/meps/en/%s/name/history/%s' % (key, id, term))
                    raise ValueError

    # reorder historical lists in ascending order, so new entries are appended and don't mess up the diffs
    for k in ('Constituencies', 'Groups', 'Committees', 'Delegations', 'Staff'):
        if not k in mep: continue
        mep[k] = [e for e in sorted(mep[k], key=lambda x: (x['start'], x['end'],
                                                           x.get('Organization', x.get('party'))))]
def parseMember(userid):
    url='http://www.europarl.europa.eu/meps/en/%s/_history.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url, ignore=[500])
    data = {
        u'active': False,
        u'Photo': unicode(urljoin(BASE_URL,"/mepphoto/%s.jpg" % userid)),
        u'meta': {u'url': url}
        }
    mepdiv=root.xpath('//div[@class="zone_info_mep_transparent_mep_details"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(unws(' '.join(mepdiv.xpath('.//li[@class="mep_name"]//text()'))))

    borntxt=mepdiv.xpath('.//span[@class="more_info"]/text()')
    if len(borntxt)>0:
        if unws(borntxt[-1]).startswith('Date of death:'):
            try:
                data[u'Death'] = datetime.strptime(unws(borntxt[-1]), u"Date of death: %d %B %Y")
            except ValueError:
                logger.warn('[!] failed to scrape birth data %s' % url)
                logger.warn(traceback.format_exc())
            tmp = borntxt[-2].split(',', 1)
        else:
            tmp = borntxt[-1].split(',', 1)
        if len(tmp)==2:
            (d, p) = tmp
        else:
            d,p = tmp[0], None
        try:
            data[u'Birth'] = {u'date': datetime.strptime(unws(d), u"Date of birth: %d %B %Y")}
        except ValueError:
            logger.warn(traceback.format_exc())
        finally:
            if p:
                if 'Birth' in data:
                    data[u'Birth'][u'place'] = unws(p)
                else:
                    data[u'Birth'] = unws(p)
    else:
        logger.warn('[!] no birth data %s' % url)

    # scrape stuff from right column
    addif(data,u'RSS',[unicode(urljoin(BASE_URL,x.get('href')),'utf8')
                       for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_rss"]')])
    addif(data,u'Homepage',[x.get('href')
                            for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_website"]')])
    addif(data,u'Twitter',[x.get('href')
                           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_twitt"]')])
    addif(data,u'Facebook',[x.get('href')
                            for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_fb"]')])
    addif(data,u'Mail',[x.get('href')[7:].replace('[dot]','.').replace('[at]','@')[::-1]
                        for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_email"]')])

    # contact information
    for span in root.xpath('//div[@id="content_right"]//h3'):
        title=unws(''.join(span.xpath('.//text()')))
        if title == "Contacts":
            addif(data,u'Addresses',getAddress(span))

    # scrape main content
    for section in root.xpath('//div[@id="content_left"]/div[@class="boxcontent nobackground"]/h4'):
        key=unws(''.join(section.xpath('.//text()')))
        if key=="National parties":
            # constituencies
            key='Constituencies'
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, party = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if not key in data: data[key]=[]
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                cstart = party.rfind(' (')
                if party[cstart+2:-1] in SEIRTNUOC:
                    country = party[cstart+2:-1]
                    party = party[:cstart]
                else:
                    logger.warn('unknown country: %s' % party[cstart+2:-1])
                    country='unknown'
                #print etree.tostring(constlm, pretty_print=True)
                data[key].append({
                    u'party':   party,
                    u'country': country,
                    u'start':   datetime.strptime(unws(start), u"%d.%m.%Y"),
                    u'end':     datetime.strptime(unws(end), u"%d.%m.%Y"),
                    })
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President',
                     'Vice-President', 'Observer', 'Quaestor', 'Substitute observer']:
            # memberships in various committees, delegations and EP mgt
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, org = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                item={u'role': key,
                      u'abbr': COMMITTEE_MAP.get(org),
                      u'Organization': org,
                      u'start': datetime.strptime(unws(start), u"%d.%m.%Y"),
                      u'end': datetime.strptime(unws(end), u"%d.%m.%Y"),
                      }
                for start, field in orgmaps:
                    if item['abbr'] in COMMITTEE_MAP or item['Organization'].startswith(start):
                        if not field in data: data[field]=[]
                        if field=='Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id']=COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        elif key == u'Political groups':
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                interval, org = line.split(' : ',1)
                tmp = org.split(u' - ')
                if len(tmp)>1:
                    org = ' - '.join(tmp[:-1])
                    role = tmp[-1]
                elif org.endswith(' -'):
                    org=org[:-2]
                    role=''
                else:
                    logger.error('[!] political group line %s' % line)
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                if not u'Groups' in data: data[u'Groups']=[]
                data[u'Groups'].append(
                    {u'role':         role,
                     u'Organization': org,
                     u'country':      COUNTRIES.get(unws(constlm.get('class')).upper(), 'unknown country: %s' % unws(constlm.get('class'))),
                     u'groupid':      group_map[org],
                     u'start':        datetime.strptime(unws(start), u"%d.%m.%Y"),
                     u'end':          datetime.strptime(unws(end), u"%d.%m.%Y"),
                     })
        else:
            logger.error('[!] unknown field %s' % key)

    # sort all lists in descending order
    for fld in ['Constituencies', 'Groups', 'Committees', 'Delegations', 'Staff']:
        if not fld in data: continue
        data[fld]=sorted(data[fld], key=lambda x: x.get('end',x['start']), reverse=True)

    # get CV - page (is on separate http path :/)
    cvurl='http://www.europarl.europa.eu/meps/en/%s/_cv.html' % userid
    root = fetch(cvurl, ignore=[500])
    data[u'CV']={}
    for sec in root.xpath('//h3[@class="collapsible"]'):
        section=unws(''.join(sec.xpath('.//text()')))
        data[u'CV'][section]=[]
        for line in sec.xpath('./following-sibling::div[1]//li'):
            data[u'CV'][section].append(unws(''.join(line.xpath('.//text()'))))

    # get assistants also on a separate page :/
    assurl='http://www.europarl.europa.eu/meps/en/%s/_assistants.html' % userid
    root = fetch(assurl, ignore=[500])
    for h3 in root.xpath('//h3[@id="section"]'):
        title=unws(''.join(h3.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'],
                  title.lower().split()[0],
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])
        elif title in ['Accredited assistants (grouping)', 'Local assistants (grouping)',
                       'Service providers', ' Trainees', 'Paying agents (grouping)', 'Paying agents']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'],
                  title.lower(),
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])
    return data
def scrape(id, **kwargs):
    # we ignore the /meps/en/<id>/<name>/home path, since we can get all info also from other pages
    url = "http://www.europarl.europa.eu/meps/en/%s/name/cv" % id
    xml = fetch_raw(url)  # we have to patch up the returned html...
    xml = xml.replace("</br>", "<br/>")  # ...it contains some bad tags..
    root = fromstring(xml)  # ...which make the lxml soup parser drop some branches in the DOM
    sidebar_check(root, url)

    mep = {
        'UserID': id,
        'Name': mangleName(unws(' '.join(root.xpath('//span[@class="sln-member-name"]/text()'))), id),
        'Photo': "https://www.europarl.europa.eu/mepphoto/%s.jpg" % id,
        'meta': {'url': url},
        'Twitter': [unws(x.replace("http:// ", ""))
                    for x in root.xpath('//section[@id="presentationmep"]//a[@data-original-title="Twitter"]/@href')],
        'Homepage': [unws(x.replace("http:// ", ""))
                     for x in root.xpath('//section[@id="presentationmep"]//a[@data-original-title="Website"]/@href')],
        'Facebook': [unws(x.replace("http:// ", ""))
                     for x in root.xpath('//section[@id="presentationmep"]//a[@data-original-title="Facebook"]/@href')],
        'Instagram': [unws(x.replace("http:// ", ""))
                      for x in root.xpath('//section[@id="presentationmep"]//a[@data-original-title="Instagram"]/@href')],
        'Mail': [deobfus_mail(x)
                 for x in root.xpath('//section[@id="presentationmep"]//a[@data-original-title="E-mail"]/@href')],
        'Addresses': parse_addr(root),
        'active': False,
    }
    mep = addchangednames(mep)

    birthdate = root.xpath('//time[@id="birthDate"]/text()')
    if len(birthdate) > 0:
        mep['Birth'] = {'date': datetime.strptime(unws(birthdate[0]), u"%d-%m-%Y")}
        place = root.xpath('//time[@id="birthDate"]/following-sibling::text()')
        if len(place) > 0:
            tmp = unws(' '.join(place))
            if tmp.startswith(", "):
                tmp = tmp[2:]
            mep['Birth']['place'] = tmp

    death = root.xpath('//time[@id="deathDate"]/text()')
    if death:
        mep['Death'] = datetime.strptime(unws(death[0]), u"%d-%m-%Y")

    body = root.xpath('//span[@id="detailedcardmep"]/following-sibling::section')[0]
    if body.xpath('.//h1[text()="Curriculum vitae "]'):
        if not body.xpath('.//h3[@id="no_cv_available"]'):
            mep['CV'] = {
                'updated': datetime.strptime(
                    unws(body.xpath('.//p[@class="small"]/strong[contains(text(),"Updated: ")]/text()')[0]),
                    u"Updated: %d/%m/%Y")
            }
            mep['CV'].update({
                unws(''.join(title.xpath(".//text()"))): [
                    unws(''.join(item.xpath(".//text()"))).replace("-...", "- ...")
                    for item in title.xpath("following-sibling::ul/li")
                ]
                for title in body.xpath('.//h4')
                #if not unws(''.join(title.xpath(".//text()"))).startswith("Original version : ")
            })

    # assistants
    url = "http://www.europarl.europa.eu/meps/en/%s/name/assistants" % id
    root = fetch(url)
    body = root.xpath('//span[@id="detailedcardmep"]/following-sibling::section')[0]
    if unws(' '.join(body.xpath(".//h1/text()"))) == "Assistants":
        for h4 in body.xpath('.//h4'):
            title = unws(''.join(h4.xpath(".//text()")))
            assistants = [unws(''.join(item.xpath(".//text()"))) for item in h4.xpath("../div//span")]
            if title in ['Accredited assistants', 'Local assistants']:
                if not 'assistants' in mep:
                    mep['assistants'] = {}
                title = title.lower().split()[0]
                if assistants:
                    mep['assistants'][title] = assistants
            elif title in ['Accredited assistants (grouping)', 'Local assistants (grouping)',
                           'Service providers', 'Trainees', 'Paying agents (grouping)', 'Paying agents',
                           'Assistants to the Vice-Presidency/to the Quaestorate']:
                if not 'assistants' in mep:
                    mep['assistants'] = {}
                title = title.lower()
                if assistants:
                    mep['assistants'][title] = assistants
            else:
                log(2, 'unknown title for assistants "{}" {}'.format(title, url))
                raise ValueError

    # declarations
    root = fetch("http://www.europarl.europa.eu/meps/en/%s/name/declarations" % id)
    body = root.xpath('//span[@id="detailedcardmep"]/following-sibling::section')[0]
    if unws(' '.join(body.xpath(".//h1/text()"))) == "Declarations":
        for title in body.xpath('.//h4'):
            key = unws(''.join(title.xpath('.//text()')))
            if key == 'Declaration of financial interests':
                key = 'Financial Declarations'
                mep[key] = []
                for pdf in title.xpath('./following-sibling::ul/li/a'):
                    url = pdf.xpath('./@href')[0]
                    try:
                        mep[key].append(findecl.scrape(url))
                    except:
                        log(1, "failed to extract findecl from %s" % url)
            elif key == 'Declarations of participation by Members in events organised by third parties':
                key = 'Declarations of Participation'
                mep[key] = []
                # reversed order, otherwise newer ones get prepended and mess up the diff
                for pdf in title.xpath('./following-sibling::ul/li/a')[::-1]:
                    url = pdf.xpath('./@href')[0]
                    name = unws(''.join(pdf.xpath('.//text()')))
                    mep[key].append({'title': name, 'url': url})
            elif key in ['Declaration of good conduct',
                         'Voluntary confirmation on the use of the General Expenditure Allowance']:
                mep[key] = []
                # reversed order, otherwise newer ones get prepended and mess up the diff
                for pdf in title.xpath('./following-sibling::ul/li/a')[::-1]:
                    url = pdf.xpath('./@href')[0]
                    name = unws(''.join(pdf.xpath('.//text()')))
                    mep[key].append({'title': name, 'url': url})
            else:
                log(2, 'unknown type of declaration: "%s" http://www.europarl.europa.eu/meps/en/%s/name/declarations' % (key, id))
                key = None
                raise ValueError

    # history
    parse_history(id, root, mep)
    process(mep, id, db.mep, 'ep_meps', mep['Name']['full'],
            nopreserve=(['Addresses'], ['assistants']), onchanged=onchanged)

    if __name__ == '__main__':
        return mep
    del mep
def scrape(id, terms, mepname, **kwargs):
    activity_types = (
        ('plenary-speeches', 'CRE'),
        ('reports', "REPORT"),
        ('reports-shadow', "REPORT-SHADOW"),
        ('opinions', "COMPARL"),
        ('opinions-shadow', "COMPARL-SHADOW"),
        ('motions-instit', "MOTION"),
        ('oral-questions', "OQ"),
        # other activities
        ('written-explanations', 'WEXP'),
        ('major-interpellations', 'MINT'),
        ('written-questions', "WQ"),
        ('motions-indiv', "IMOTION"),
        ('written-declarations', "WDECL"),
    )
    activities = {}
    for type, TYPE in activity_types:
        for term in terms:
            page = 0
            cnt = 20
            url = "http://www.europarl.europa.eu/meps/en/%s/loadmore-activities/%s/%s/?page=%s&count=%s" % (id, type, term, page, cnt)
            try:
                root = fetch(url)
            except:
                log(1, "failed to fetch {}".format(url))
                raise ValueError
                #continue
            #print(url, file=sys.stderr)
            while len(root.xpath('//div[@class="erpl_document"]')) > 0:
                for node in root.xpath('//div[@class="erpl_document"]'):
                    if type == 'written-explanations':
                        item = {
                            'title': unws(''.join(node.xpath('./div/h3/span[@class="t-item"]//text()'))),
                            'date': datetime.strptime(node.xpath('./div[1]/div[1]/span[1]/text()')[0], u"%d-%m-%Y"),
                            'text': unws(''.join(node.xpath('./div[2]/div//text()')))}
                    elif type == 'written-declarations':
                        if len(node.xpath('./div[1]/div')) != 3:
                            log(2, "written decl item has not 3 divs but %d %s" % (len(node.xpath('./div[1]/div')), url))
                            continue
                        if len(node.xpath('./div[1]/div[1]/span')) != 3:
                            log(2, "written decl item has not 3 but %d spans in the 1st div at %s" % (len(node.xpath('./div[1]/div[1]/span')), url))
                            continue
                        item = {
                            'title': unws(''.join(node.xpath('./div/h3/span[@class="t-item"]//text()'))),
                            'date': datetime.strptime(node.xpath('./div[1]/div[1]/span[1]/text()')[0], u"%d-%m-%Y"),
                            'id': unws(''.join(node.xpath('./div[1]/div[1]/span[2]/text()')[0])),
                            'status': unws(''.join(node.xpath('./div[1]/div[1]/span[3]/text()')[0])),
                            'formats': [{'type': unws(fnode.xpath('./span/text()')[0]),
                                         'url': str(fnode.xpath('./@href')[0]),
                                         'size': unws(fnode.xpath('./span/span/text()')[0])}
                                        for fnode in node.xpath('./div[1]/div[2]/div[@class="d-inline"]/a')],
                            'authors': [{'name': name.strip(), "mepid": db.mepid_by_name(name.strip())}
                                        for name in node.xpath('./div[1]/div[3]/span/text()')],
                        }
                        for info in node.xpath('./div[2]/div'):
                            label = unws(''.join(info.xpath('./text()')))[:-2]
                            value = unws(''.join(info.xpath('./span/text()')))
                            if 'date' in label.lower():
                                value = datetime.strptime(value, u"%d-%m-%Y")
                            if label == 'Number of signatories':
                                number, date = value.split(' - ')
                                value = int(number)
                                item["No of sigs date"] = datetime.strptime(date, u"%d-%m-%Y")
                            item[label] = value
                    else:
                        #from lxml.etree import tostring
                        #print('\n'.join(tostring(e).decode() for e in node.xpath('./div/div[1]')))
                        # all other activities share the following scraper
                        ref = unws(''.join(node.xpath('./div[1]/div[1]/span[2]/text()')))
                        if ref.startswith('- '):
                            ref = ref[2:]
                        if ref.endswith(' -'):
                            ref = ref[:-2]

                        item = {
                            'date': datetime.strptime(node.xpath('./div[1]/div[1]/span[1]/text()')[0], u"%d-%m-%Y"),
                            'reference': ref,
                        }

                        if type not in ['written-questions', 'oral-questions']:
                            ref = unws(''.join(node.xpath('./div[1]/div[1]/span[3]/text()')))
                            if ref:
                                if not pere.match(ref):
                                    log(2, "pe, has not expected format: '%s'" % ref)
                                else:
                                    item['pe'] = ref

                        # opinions don't have title urls... why would they?
                        refurl = node.xpath('./div[1]/h3/a/@href')
                        if refurl:
                            item['url'] = str(refurl[0])

                        item['title'] = unws(''.join(node.xpath('./div/h3//span[@class="t-item"]//text()')))

                        abbr = node.xpath('./div[1]/div[1]/span/span[contains(concat(" ",normalize-space(@class)," ")," erpl_badge-committee ")]/text()')
                        if len(abbr):
                            item['committee'] = [a for a in [unws(c) for c in abbr] if a]

                        formats = []
                        for fnode in node.xpath('./div[1]/div[2]/div[@class="d-inline"]/a'):
                            elem = {'type': unws(fnode.xpath('./span/text()')[0]),
                                    'url': str(fnode.xpath('./@href')[0])}
                            tmp = fnode.xpath('./span/span/text()')
                            if len(tmp) > 0:
                                elem['size'] = unws(tmp[0])
                            formats.append(elem)
                        if formats:
                            item['formats'] = formats

                        authors = [{'name': name.strip(), "mepid": db.mepid_by_name(name.strip())}
                                   for name in node.xpath('./div[1]/div[3]/span/text()')]
                        if authors:
                            item['authors'] = authors

                        if type in ['opinions-shadow', 'opinions']:
                            for f in item['formats']:
                                if f['type'] == 'PDF':
                                    ref = pdf2ref(f['url'])
                                    if ref is not None:
                                        item['dossiers'] = [ref]
                                    break
                        else:
                            # try to deduce dossier from document reference
                            dossiers = db.get('dossiers_by_doc', item['reference']) or []
                            if len(dossiers) > 0:
                                item['dossiers'] = [d['procedure']['reference'] for d in dossiers]
                            elif not '+DOC+PDF+' in item['url']:
                                # try to figure out the associated dossier by making an (expensive) http request to the ep
                                log(4, "fetching primary activity page %s" % item['url'])
                                try:
                                    refroot = fetch(item['url'])
                                except:
                                    refroot = None
                                if refroot is not None:
                                    if '/doceo/' in item['url']:  # stupid new EP site removed the span with the procedure, bastards.
                                        fulla = refroot.xpath('//table[@class="buttondocwin"]//a/img[@src="/doceo/data/img/navi_moredetails.gif"]/..')
                                        if fulla:
                                            fullurl = fulla[0].get('href')
                                            if fullurl.endswith('.html'):
                                                if fullurl[-7:-5] != 'EN':
                                                    fullurl = fullurl[:-7] + 'EN.html'
                                            log(4, 'loading activity full text page %s' % fullurl)
                                            if fullurl.startswith('/doceo'):
                                                fullurl = 'https://www.europarl.europa.eu' + fullurl
                                            if fullurl != item['url']:
                                                refroot = fetch(fullurl)
                                        else:
                                            log(4, 'no fulla for %s' % item['url'])
                                    anchor = refroot.xpath('//span[@class="contents" and text()="Procedure : " and not(ancestor::div[@style="display:none"])]')
                                    if len(anchor) == 1:
                                        dossier = anchor[0].xpath("./following-sibling::a/text()")
                                        if len(dossier) == 1:
                                            item['dossiers'] = [unws(dossier[0])]
                                        elif len(dossier) > 1:
                                            log(2, "more than one dossier in ep info page: %d %s" % (len(dossier), item['url']))
                                    elif len(anchor) > 1:
                                        log(2, "more than one anchor in ep info page: %d %s" % (len(anchor), item['url']))

                    item['term'] = term
                    if TYPE not in activities:
                        activities[TYPE] = []
                    activities[TYPE].append(item)
                if len(root.xpath('//div[@class="erpl_document"]')) < cnt:
                    break
                page += 1
                url = "http://www.europarl.europa.eu/meps/en/%s/loadmore-activities/%s/%s/?page=%s&count=%s" % (id, type, term, page, cnt)
                try:
                    root = fetch(url)
                except:
                    log(1, "failed to fetch {}".format(url))
                    #raise ValueError
                    break
                #print(url, file=sys.stderr)
        if TYPE in activities:
            activities[TYPE] = sorted(activities[TYPE], key=lambda x: x['date'])
    activities['mep_id'] = id
    if len(activities.keys()) > 1:
        process(activities, id, db.activities, 'ep_mep_activities', mepname, nodiff=True)
        return activities
    return {}