def getComAms(leg=TERM, update=False):
    #urltpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html"
    urltpl="http://www.europarl.europa.eu/committees/en/%s/search-in-documents.html"
    # todo add to search RPCD, OPCD
    for doctype in ['AMCO', 'RPCD', 'OPCD']:
        postdata="clean=false&leg=%s&docType=%s&miType=text" % (leg, doctype)
        nexttpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?action=%s&tabActif=tabResult#sidesForm"
        for com in (k for k in COMMITTEE_MAP.keys()
                    if len(k)==4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP', 'SURE', 'CRIM', 'CRIS']):
            url=urltpl % (com)
            i=0
            logger.info('%s %s crawling %s' % (datetime.now().isoformat(), doctype, com))
            root=fetch(url, params=postdata)
            prev=[]
            while True:
                logger.info("%s %s" % (datetime.now().isoformat(), url))
                #logger.info(tostring(root))
                tmp={a.get('href'): ' '.join(a.xpath('../../../p[@class="rapporteurs"]//text()')) if doctype != 'AMCO' else None
                     for a in root.xpath('//a[@title="open this PDF in a new window"]')
                     if (len(a.get('href',''))>13)}
                if not tmp or prev==tmp:
                    break
                prev=tmp
                for u, v in sorted(tmp.items()):
                    if db.ep_ams.find_one({'src': u}): continue
                    yield u, v
                if update: break
                i+=1
                url=nexttpl % (com,i)
                root=fetch(url)
def getComAgendas():
    #urltpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html"
    urltpl="http://www.europarl.europa.eu/committees/en/%s/search-in-documents.html"
    postdata="docType=AGEN&leg=8&miType=text&tabActif=tabResult#sidesForm"
    #nexttpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?action=%s&tabActif=tabResult#sidesForm"
    nexttpl="http://www.europarl.europa.eu/committees/en/%s/search-in-documents.html?action=%s&tabActif=tabResult#sidesForm"
    for com in (k for k in COMMITTEE_MAP.keys()
                if len(k)==4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP', 'SURE', 'CRIM', 'CRIS']):
        url=urltpl % (com)
        i=0
        agendas=[]
        logger.info('scraping %s' % com)
        root=fetch(url, params=postdata)
        prev=[]
        while True:
            logger.info("%s %s" % (datetime.now().isoformat(), url))
            tmp=[(a.get('href'), unws(a.xpath('text()')[0]))
                 for a in root.xpath('//p[@class="title"]/a')
                 if len(a.get('href',''))>13]
            if not tmp or prev==tmp:
                break
            prev=tmp
            for u,title in tmp:
                if title.startswith('DRAFT AGENDA'):
                    yield (u,com)
            i+=1
            url=nexttpl % (com,i)
            root=fetch(url)
def getIncomming(term=7):
    # returns dict of new incoming meps. this is being checked when
    # crawling, to set more accurate groups and constituency info
    i=0
    page=fetch('http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?type=in')
    last=None
    res={}
    while True:
        meps=[((u'name', unws(x.xpath('text()')[0])),
               (u'meta', {u'url': urljoin(urljoin(BASE_URL,x.get('href')),'get.html')}),
               (u'Constituencies', {u'start': datetime.strptime(unws((x.xpath('../span[@class="meps_date_inout"]/text()') or [''])[0]), "%B %d, %Y"),
                                    u'country': unws((x.xpath('..//span[@class="ep_country"]/text()') or [''])[0])}),
               (u'Groups', {u'start': datetime.strptime(unws((x.xpath('../span[@class="meps_date_inout"]/text()') or [''])[0]), "%B %d, %Y"),
                            u'group': unws((x.xpath('..//span[@class="ep_group"]/text()') or [''])[0]),
                            u'groupid': group_map[unws((x.xpath('..//span[@class="ep_group"]/text()') or [''])[0])],
                            u'role': unws((x.xpath('..//span[@class="ep_group"]/span[@class="ep_title"]/text()') or [''])[0])}),
               )
              for x in page.xpath('//div[@class="ep_elementpeople1"]//a[@class="ep_title"]')]
        if meps==last:
            break
        last=meps
        for mep in meps:
            res[int(mep[1][1]['url'].split('/')[-2])]=dict(mep[1:])
        i+=1
        page=fetch('http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?action=%s&webCountry=&webTermId=%s&name=&politicalGroup=&bodyType=&bodyValue=&type=in&filter=' % (i, term))
    return res
def getComAms(leg=7, update=False):
    urltpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html"
    # todo add to search RPCD, OPCD
    for doctype in ['AMCO', 'RPCD', 'OPCD']:
        postdata="clean=false&leg=%s&docType=%s&miType=text" % (leg, doctype)
        nexttpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?action=%s&tabActif=tabResult#sidesForm"
        for com in (k for k in COMMITTEE_MAP.keys()
                    if len(k)==4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP']):
            url=urltpl % (com)
            i=0
            logger.info('%s %s crawling %s' % (datetime.now().isoformat(), doctype, com))
            root=fetch(url, params=postdata)
            prev=[]
            while True:
                logger.info("%s %s" % (datetime.now().isoformat(), url))
                #logger.info(tostring(root))
                tmp={a.get('href'): ' '.join(a.xpath('../../../p[@class="rapporteurs"]//text()')) if doctype != 'AMCO' else None
                     for a in root.xpath('//a[@title="open this PDF in a new window"]')
                     if (len(a.get('href',''))>13)}
                if not tmp or prev==tmp:
                    break
                prev=tmp
                for u, v in sorted(tmp.items()):
                    if db.ep_ams.find_one({'src': u}): continue
                    yield u, v
                if update: break
                i+=1
                url=nexttpl % (com,i)
                root=fetch(url)
def getInOut(term=current_term, dir="in", res={}):
    # returns dict of new incoming meps. this is being checked when
    # crawling, to set more accurate groups and constituency info
    # note: the mutable default res={} is shared across calls; pass a fresh
    # dict if accumulated results from earlier calls are not wanted
    i = 0
    page = fetch("http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?type=%s" % dir, ignore=[500])
    last = None
    while True:
        meps = []
        for x in page.xpath('//div[@class="ep_elementpeople1"]//a[@class="ep_title"]'):
            mepid = int(urljoin(BASE_URL, x.get("href")).split("/")[-2])
            const = {u"country": unws((x.xpath('..//span[@class="ep_country"]/text()') or [""])[0])}
            if dir == "out":
                const["start"], const["end"] = [
                    datetime.strptime(d, "%B %d, %Y")
                    for d in unws((x.xpath('../span[@class="meps_date_inout"]/text()') or [""])[0]).split(" - ")]
            else:
                const["start"] = datetime.strptime(
                    unws((x.xpath('../span[@class="meps_date_inout"]/text()') or [""])[0]), "%B %d, %Y")
            if not mepid in res:
                res[mepid] = [const]
            else:
                res[mepid].append(const)
            meps.append((mepid, const))
        if meps == last:
            break
        last = meps
        i += 1
        page = fetch(
            "http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?action=%s&webCountry=&webTermId=%s&name=&politicalGroup=&bodyType=&bodyValue=&type=%s&filter=" % (i, term, dir),
            ignore=[500])
    return res
def getOutgoing(term=current_term):
    # returns an iter over ex meps from the current term, these are
    # missing from the get_meps result
    global newbies
    i = 0
    page = fetch('http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?type=out', ignore=[500])
    last = None
    while True:
        meps = [((u'url', urljoin(BASE_URL, x.get('href'))),
                 (u'name', unws(x.xpath('text()')[0])),
                 ('dates', unws((x.xpath('../span[@class="meps_date_inout"]/text()') or [''])[0])),
                 ('country', unws((x.xpath('../span[@class="ep_country"]/text()') or [''])[0])),
                 ('group', unws((x.xpath('..//span[@class="ep_group"]/text()') or [''])[0])),
                 ('role', unws((x.xpath('..//span[@class="ep_group"]/span[@class="ep_title"]/text()') or [''])[0])),
                 )
                for x in page.xpath('//div[@class="ep_elementpeople1"]//a[@class="ep_title"]')]
        if meps == last:
            break
        last = meps
        for mep in meps:
            mep = dict(mep)
            tmp = mep['dates'].split(' - ')
            if tmp:
                mep[u'Constituencies'] = [{u'start': datetime.strptime(tmp[0], "%B %d, %Y"),
                                           u'end': datetime.strptime(tmp[1], "%B %d, %Y"),
                                           u'country': mep['country']}]
                mep[u'Groups'] = [{u'Organization': mep['group'],
                                   u'role': mep['role']}]
            del mep['dates']
            del mep['country']
            del mep['group']
            del mep['role']
            newbies[int(mep['url'].split('/')[-2])] = mep
            yield (urljoin(urljoin(BASE_URL, mep['url']), 'get.html'), mep)
        i += 1
        page = fetch('http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?action=%s&webCountry=&webTermId=%s&name=&politicalGroup=&bodyType=&bodyValue=&type=out&filter=' % (i, term),
                     ignore=[500])
def get_all_dossiers():
    for year in xrange(datetime.date.today().year, 1972, -1):
        tree=fetch('http://www.europarl.europa.eu/oeil/widgets/resultwidget.do?lang=en&noHeader=false&q=objectReferenceN:N-%s/????\(*\)' % (year))
        count=int(tree.xpath('//span[@class="ep_title resultNum pdfHide"]/text()')[0].strip()[len('Results found: '):])
        tree=fetch('http://www.europarl.europa.eu/oeil/widgets/resultwidget.do?lang=en&limit=%s&noHeader=false&q=objectReferenceN:N-%s/????\(*\)' % (count,year))
        links=tree.xpath('//a[@class="reference rssEntry_id rssEntry_title rssEntry_updated"]')
        for link in links:
            yield (urljoin(BASE_URL,link.get('href')), (link.xpath('text()') or [''])[0])
def get_meps(term='7'):
    i=0
    page=fetch("http://www.europarl.europa.eu/meps/en/performsearch.html?webCountry=&webTermId=%s&name=&politicalGroup=&bodyType=ALL&bodyValue=&type=&filter=&search=Show+result" % (term))
    last=None
    while True:
        meps=[(x.get('href'), unws(x.xpath('text()')[0]))
              for x in page.xpath('//div[@class="ep_elementpeople1"]//a[@class="ep_title"]')]
        if meps==last:
            break
        for url,name in meps:
            yield (urljoin(urljoin(BASE_URL,url),'get.html'), name)
        last=meps
        i+=1
        page=fetch("http://www.europarl.europa.eu/meps/en/performsearch.html?action=%s&webCountry=&webTermId=%s&name=&politicalGroup=&bodyType=ALL&bodyValue=&type=&filter=" % (i, term))
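# Minimal usage sketch (an assumption, not part of the original module): drives the
# get_meps() generator above and prints each (profile_url, name) pair.  It relies on
# fetch(), unws() and BASE_URL being available in this module, as the code above does.
if __name__ == '__main__':
    for _url, _name in get_meps(term='7'):
        print _name.encode('utf8'), _url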
def getDates(params):
    root = fetch(URL, params=params)
    #print tostring(root)
    prevdates = None
    dates = root.xpath('//span[@class="date"]/text()')
    i = 10
    while dates and dates != prevdates:
        for date in dates:
            yield datetime.strptime(date, "%d-%m-%Y").strftime("%Y%m%d")
        root = fetch(URL, params="%s&startValue=%s" % (params, i))
        prevdates = dates
        i += 10
        dates = root.xpath('//span[@class="date"]/text()')
def getDates(params):
    root=fetch(URL, params=params)
    #print tostring(root)
    prevdates=None
    dates=root.xpath('//span[@class="date"]/text()')
    i=10
    while dates and dates!=prevdates:
        for date in dates:
            yield datetime.strptime(date.strip(), "%d-%m-%Y").strftime("%Y%m%d")
        root=fetch(URL, params="%s&startValue=%s" % (params,i))
        prevdates=dates
        i+=10
        dates=root.xpath('//span[@class="date"]/text()')
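# The scrapers in this file all assume a fetch() helper that returns a parsed lxml
# tree; its real implementation is not shown here.  A minimal sketch of such a helper,
# assuming requests and lxml are available: it POSTs when params are given, tolerates
# the HTTP status codes listed in ignore, and parses the response body.
import requests
from lxml.html import fromstring

def fetch_sketch(url, params=None, ignore=()):
    if params is not None:
        resp = requests.post(url, data=params)
    else:
        resp = requests.get(url)
    if resp.status_code != 200 and resp.status_code not in ignore:
        resp.raise_for_status()
    return fromstring(resp.content)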
def getmeps(query='current'):
    if query=='unlisted':
        for mep in unlisted:
            yield mep
    elif query=='all':
        for letter in xrange(26):
            tmp=meplists[query]
            a=ord('A')
            root=fetch(tmp%chr(a+letter), ignore=[500])
            for meplm in root.xpath('//id/text()'):
                yield int(meplm)
    else:
        root=fetch(meplists[query], ignore=[500])
        for meplm in root.xpath('//id/text()'):
            yield int(meplm)
def get_all_dossiers():
    for year in xrange(datetime.date.today().year, 1972, -1):
        tree = fetch('http://www.europarl.europa.eu/oeil/widgets/resultwidget.do?lang=en&noHeader=false&q=objectReferenceN:N-%s/????\(*\)' % (year))
        count = int(tree.xpath('//span[@class="resultNumber"]/text()')[0].strip())
        tree = fetch('http://www.europarl.europa.eu/oeil/widgets/resultwidget.do?lang=en&limit=%s&noHeader=false&q=objectReferenceN:N-%s/????\(*\)' % (count, year))
        links = tree.xpath('//a[@class="reference rssEntry_id rssEntry_title rssEntry_updated"]')
        for link in links:
            yield (urljoin(BASE_URL, link.get('href')), (link.xpath('text()') or [''])[0])
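# Hedged sketch (an assumption about how the pieces fit together, not code from the
# original repo): the (url, title) pairs yielded by get_all_dossiers() look like the
# inputs the dossier scraper scrape(url) defined further down expects.
def crawl_dossiers_sketch(limit=10):
    out = []
    for i, (url, title) in enumerate(get_all_dossiers()):
        if i >= limit: break
        doc = scrape(url)
        if doc:
            out.append(doc)
    return out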
def checkUrl(url):
    if not url: return False
    try:
        res = fetch(url)
    except Exception, e:
        #print >>sys.stderr, "[!] checkurl failed in %s\n%s" % (url, e)
        return False
def scrape_docs(tree):
    res = []
    docs = tree.xpath('//table[@id="doc_gateway"]')
    tabs = [x.xpath("preceding-sibling::h2")[0].xpath("text()")[0] for x in docs]
    for inst, table in izip(tabs, docs):
        if inst in instmap.keys():
            for doc in lst2obj(table, docFields):
                if inst != "Other institutions":
                    doc[u"body"] = instmap[inst]
                else:
                    try:
                        doc[u"body"] = otherinst[doc["type"].split(":")[0]]
                    except KeyError:
                        doc[u"body"] = ""
                if (doc["body"] in ["EP", "CSL"]
                        and doc["type"] == "Joint text approved by Conciliation Committee co-chairs"):
                    # skip it twice and hope it's listed in the all documents, so it becomes EP/CSL :)
                    continue
                if doc.get("text"):
                    try:
                        summary = fetch(doc["text"]["url"])
                    except:
                        continue
                    doc[u"text"] = [tostring(x) for x in summary.xpath('//div[@id="summary"]')]
                res.append(doc)
        elif inst != "All documents":
            print "[!] unrecognized tab in documents", inst
    return res
def scrape_docs(tree):
    res=[]
    docs=tree.xpath('//table[@id="doc_gateway"]')
    tabs=[x.xpath('preceding-sibling::h2')[0].xpath('text()')[0] for x in docs]
    for inst, table in izip(tabs, docs):
        if inst in instmap.keys():
            for doc in lst2obj(table, docFields):
                if inst != 'Other institutions':
                    doc[u'body']=instmap[inst]
                else:
                    try:
                        doc[u'body']=otherinst[doc['type'].split(':')[0]]
                    except KeyError:
                        doc[u'body']=''
                if doc['body'] in ['EP','CSL'] and doc['type']=='Joint text approved by Conciliation Committee co-chairs':
                    # skip it twice and hope it's listed in the all documents, so it becomes EP/CSL :)
                    continue
                if doc.get('text'):
                    try:
                        summary=fetch(doc['text']['url'])
                    except:
                        continue
                    doc[u'text']=[unicode(tostring(x)) for x in summary.xpath('//div[@id="summary"]')]
                res.append(doc)
        elif inst != 'All':
            logger.warn(u"[!] unrecognized tab in documents %s" % inst)
    return res
def checkUrl(url):
    if not url: return False
    try:
        res=fetch(url)
    except Exception, e:
        #print >>sys.stderr, "[!] checkurl failed in %s\n%s" % (url, e)
        return False
def scrape_docs(tree):
    res = []
    docs = tree.xpath('//table[@id="doc_gateway"]')
    tabs = [x.xpath('preceding-sibling::h2')[0].xpath('text()')[0] for x in docs]
    for inst, table in izip(tabs, docs):
        if inst in instmap.keys():
            for doc in lst2obj(table, docFields):
                if inst != 'Other institutions':
                    doc[u'body'] = instmap[inst]
                else:
                    try:
                        doc[u'body'] = otherinst[doc['type'].split(':')[0]]
                    except KeyError:
                        doc[u'body'] = ''
                if doc['body'] in ['EP', 'CSL'] and doc['type'] == 'Joint text approved by Conciliation Committee co-chairs':
                    # skip it twice and hope it's listed in the all documents, so it becomes EP/CSL :)
                    continue
                if doc.get('text'):
                    try:
                        summary = fetch(doc['text']['url'])
                    except:
                        continue
                    doc[u'text'] = [tostring(x) for x in summary.xpath('//div[@id="summary"]')]
                res.append(doc)
        elif inst != 'All documents':
            print "[!] unrecognized tab in documents", inst
    return res
def getMEPGender(id):
    try:
        mepraw = fetch("http://www.europarl.europa.eu/meps/fr/%s/get.html" % (id), ignore=[500])
    except Exception, e:
        logger.error("mepgender %s" % e)
        return 'n/a'
def checkUrl(url):
    if not url: return False
    if url in seenurls:
        return seenurls[url]
    try:
        res=fetch(url)
    except Exception, e:
        #print >>sys.stderr, "[!] checkurl failed in %s\n%s" % (url, e)
        seenurls[url]=False
def checkUrl(url):
    if not url: return False
    if url in seenurls:
        return seenurls[url]
    try:
        res = fetch(url)
    except Exception, e:
        #print >>sys.stderr, "[!] checkurl failed in %s\n%s" % (url, e)
        seenurls[url] = False
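# The checkUrl() variants above end inside their except blocks; the success path is not
# shown in this file.  A possible completion (an assumption, not the original code) that
# caches the positive result as well could look like this:
def checkUrl_sketch(url, seenurls={}):
    if not url: return False
    if url in seenurls:
        return seenurls[url]
    try:
        fetch(url)               # assumed helper, as used throughout this module
        seenurls[url] = True
    except Exception:
        seenurls[url] = False
    return seenurls[url]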
def getMEPDeclarations(id):
    try:
        dom = fetch("http://www.europarl.europa.eu/meps/en/%s/_declarations.html" % (id), ignore=[500])
    except Exception, e:
        logger.error("mepdeclaration %s" % e)
        return []
def scrape_events(tree):
    res=[]
    for item in lst2obj((tree.xpath('//table[@id="key_events"]') or [None])[0],eventFields):
        if item.get('text'):
            try:
                summary=fetch(item['text']['url'])
            except:
                continue
            item['text']=[unicode(tostring(x)) for x in summary.xpath('//div[@id="summary"]')]
        res.append(item)
    return res
def crawl(year, term):
    listurl = 'http://www.europarl.europa.eu/plenary/en/minutes.html'
    PARAMS = 'clean=false&leg=%s&refSittingDateStart=01/01/%s&refSittingDateEnd=31/12/%s&miType=title&miText=Roll-call+votes&tabActif=tabResult'
    voteurl = 'http://www.europarl.europa.eu/RegData/seance_pleniere/proces_verbal/%s/votes_nominaux/xml/P%s_PV%s(RCV)_XC.xml'
    params = PARAMS % (term, year, year)
    root=fetch(listurl, params=params)
    prevdates=None
    dates=root.xpath('//span[@class="date"]/text()')
    i=10
    while dates and dates!=prevdates:
        for date in dates:
            if not date.strip(): continue
            date = datetime.strptime(date.strip(), "%d-%m-%Y")
            yield voteurl % (date.strftime("%Y/%m-%d"), term, date.strftime("(%Y)%m-%d"))
        root=fetch(listurl, params="%s&startValue=%s" % (params,i))
        prevdates=dates
        i+=10
        dates=root.xpath('//span[@class="date"]/text()')
def getOutgoing(term=current_term):
    # returns an iter over ex meps from the current term, these are
    # missing from the get_meps result
    i = 0
    page = fetch("http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?type=out", ignore=[500])
    last = None
    while True:
        meps = []
        for x in page.xpath('//div[@class="ep_elementpeople1"]//a[@class="ep_title"]'):
            url = urljoin(urljoin(BASE_URL, x.get("href")), "get.html")
            meps.append(url)
            yield (url, {})
        if meps == last:
            break
        last = meps
        i += 1
        page = fetch(
            "http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?action=%s&webCountry=&webTermId=%s&name=&politicalGroup=&bodyType=&bodyValue=&type=out&filter=" % (i, term),
            ignore=[500])
def scrape_events(tree):
    res = []
    for item in lst2obj((tree.xpath('//table[@id="key_events"]') or [None])[0], eventFields):
        if item.get('text'):
            try:
                summary = fetch(item['text']['url'])
            except:
                continue
            item['text'] = [tostring(x) for x in summary.xpath('//div[@id="summary"]')]
        res.append(item)
    return res
def getOutgoing(term=7):
    # returns an iter over ex meps from the current term, these are
    # missing from the get_meps result
    i=0
    page=fetch('http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?type=out')
    last=None
    while True:
        meps=[((u'url', urljoin(BASE_URL,x.get('href'))),
               (u'name', unws(x.xpath('text()')[0])),
               ('dates', unws((x.xpath('../span[@class="meps_date_inout"]/text()') or [''])[0])),
               ('country', unws((x.xpath('../span[@class="ep_country"]/text()') or [''])[0])),
               ('group', unws((x.xpath('..//span[@class="ep_group"]/text()') or [''])[0])),
               ('role', unws((x.xpath('..//span[@class="ep_group"]/span[@class="ep_title"]/text()') or [''])[0])),
               )
              for x in page.xpath('//div[@class="ep_elementpeople1"]//a[@class="ep_title"]')]
        if meps==last:
            break
        last=meps
        for mep in meps:
            mep=dict(mep)
            tmp=mep['dates'].split(' - ')
            if tmp:
                mep[u'Constituencies']={u'start': datetime.strptime(tmp[0], "%B %d, %Y"),
                                        u'end': datetime.strptime(tmp[1], "%B %d, %Y"),
                                        u'country': mep['country']}
                mep[u'Groups']={u'start': datetime.strptime(tmp[0], "%B %d, %Y"),
                                u'end': datetime.strptime(tmp[1], "%B %d, %Y"),
                                u'group': mep['group'],
                                u'role': mep['role']}
            del mep['dates']
            del mep['country']
            del mep['group']
            del mep['role']
            yield (urljoin(urljoin(BASE_URL,mep['url']),'get.html'), mep)
        i+=1
        page=fetch('http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?action=%s&webCountry=&webTermId=%s&name=&politicalGroup=&bodyType=&bodyValue=&type=out&filter=' % (i, term))
def getComAgendas():
    urltpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?&docType=AGEN&leg=7&miType=text"
    nexttpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?tabActif=tabLast&startValue=%s"
    for com in (k for k in COMMITTEE_MAP.keys()
                if len(k)==4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP']):
        url=urltpl % (com)
        i=0
        agendas=[]
        logger.info('scraping %s' % com)
        while True:
            logger.info("crawling %s" % (url))
            root=fetch(url)
            tmp=[(a.get('href'), unws(a.xpath('text()')[0]))
                 for a in root.xpath('//p[@class="title"]/a')
                 if len(a.get('href',''))>13]
            if not tmp:
                break
            for u,_ in tmp:
                yield (u,com)
            i+=10
            url=nexttpl % (com,i)
def getComAgendas():
    urltpl = "http://www.europarl.europa.eu/committees/en/%s/documents-search.html?&docType=AGEN&leg=7&miType=text"
    nexttpl = "http://www.europarl.europa.eu/committees/en/%s/documents-search.html?tabActif=tabLast&startValue=%s"
    for com in (k for k in COMMITTEE_MAP.keys()
                if len(k) == 4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP']):
        url = urltpl % (com)
        i = 0
        agendas = []
        logger.info('scraping %s' % com)
        while True:
            logger.info("crawling %s" % (url))
            root = fetch(url)
            tmp = [(a.get('href'), unws(a.xpath('text()')[0]))
                   for a in root.xpath('//p[@class="title"]/a')
                   if len(a.get('href', '')) > 13]
            if not tmp:
                break
            for u, _ in tmp:
                yield (u, com)
            i += 10
            url = nexttpl % (com, i)
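# Hedged driver sketch (an assumption about how these pieces fit together, not code
# from the original repo): feed the (url, committee) pairs produced by getComAgendas()
# into the agenda parser scrape(url, comid) defined further down and collect the items.
def crawl_agendas_sketch():
    items = []
    for url, com in getComAgendas():
        res = scrape(url, com)  # the DRAFT AGENDA parser below
        if res:
            items.extend(res)
    return items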
def parseMember(userid):
    url='http://www.europarl.europa.eu/meps/en/%s/get.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url)
    data = {u'active': True, 'meta': {u'url': url}} # return {'active': False}
    mepdiv=root.xpath('//div[@class="ep_elementpeople2"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[0]))
    data[u'Photo'] = unicode(urljoin(BASE_URL,mepdiv.xpath('.//span[@class="ep_img"]/img')[0].get('src')),'utf8')
    (d, p) = mepdiv.xpath('.//div[@class="ep_elementtext"]/p/text()')[0].split(',', 1)
    try:
        data[u'Birth'] = { u'date': datetime.strptime(unws(d), "Born on %d %B %Y"),
                           u'place': unws(p) }
    except ValueError:
        logger.warn('[!] failed to scrape birth data %s' % url)
        logger.warn(traceback.format_exc())
    const={u'country': unws(mepdiv.xpath('.//span[@class="ep_country"]/text()')[0])}
    data[u'Constituencies']=[const]
    try:
        # note: the original line had a stray trailing comma here, which stored a
        # one-element tuple instead of the party string
        const[u'party']=unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[1])
    except IndexError:
        data[u'active']=False
    else:
        group=unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[0])
        data[u'Groups'] = [{ u'role': unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[1]),
                             u'group': group,
                             u'groupid': group_map[group]}]
    cdiv=root.xpath('//div[@class="ep_elementcontact"]')
    if len(cdiv):
        addif(data,u'RSS',[unicode(urljoin(BASE_URL,x.get('href')),'utf8') for x in cdiv[0].xpath('.//li[@class="ep_rss"]//a')])
        addif(data,u'Homepage',[unicode(x.get('href'),'utf8') for x in cdiv[0].xpath('.//li[@class="ep_website"]//a')])
        addif(data,u'Mail',[decodemail(unws(x)) for x in cdiv[0].xpath('.//li[@class="ep_email"]//text()') if len(unws(x))])
    for span in root.xpath('//div[@id="contextzone"]//span[@class="ep_title"]'):
        title=unws(''.join(span.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            addif(data,title,[unws(x) for x in span.xpath('../../..//li/div/text()')])
    addif(data,u'Addresses',getAddress(root))
    for div in root.xpath('//div[@class="ep_content"]'):
        key=unws(u''.join(div.xpath('.//span[@class="ep_title"]/text()')))
        if not len(key):
            continue
        elif key.lower()=='curriculum vitae':
            data[u'CV'] = [unws(x) for x in div.xpath('.//div[@class="ep_elementtext"]//li/div/text()')]
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President']:
            for span in div.xpath('.//span[@class="commission_label"]'):
                item={u'role': key,
                      u'abbr': unws(''.join(span.xpath('text()'))),
                      u'Organization': unws(span.tail)}
                for start, field in orgmaps:
                    if item['Organization'].startswith(start):
                        if not field in data: data[field]=[]
                        if field=='Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id']=COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        else:
            logger.error('[!] unknown field %s' % key)
    return data
def parseMember(userid):
    url = "http://www.europarl.europa.eu/meps/en/%s/get.html" % userid
    logger.info("scraping %s" % url)
    root = fetch(url, ignore=[500])
    data = {u"active": False, "meta": {u"url": url}} # return {'active': False}
    mepdiv = root.xpath('//div[@class="ep_elementpeople2"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u"Name"] = mangleName(unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[0]))
    data[u"Photo"] = unicode(urljoin(BASE_URL, mepdiv.xpath('.//span[@class="ep_img"]/img')[0].get("src")), "utf8")
    borntxt = mepdiv.xpath('.//div[@class="ep_elementtext"]/p/text()')
    if len(borntxt) > 0:
        (d, p) = borntxt[0].split(",", 1)
        try:
            data[u"Birth"] = {u"date": datetime.strptime(unws(d), u"Born on %d %B %Y"),
                              u"place": unws(p)}
        except ValueError:
            logger.warn("[!] failed to scrape birth data %s" % url)
            logger.warn(traceback.format_exc())
    else:
        logger.warn("[!] no birth data %s" % url)
    const = {u"country": unws(mepdiv.xpath('.//span[@class="ep_country"]/text()')[0]),
             u"start": datetime(2009, 7, 14)}
    data[u"Constituencies"] = [const]
    try:
        data[u"party"] = unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[1])
    except IndexError:
        pass
    else:
        group = unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[0])
        try:
            role = unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[1])
        except IndexError:
            role = u"Member"
        data[u"Groups"] = [{u"role": role, u"Organization": group, u"groupid": group_map[group]}]
    cdiv = root.xpath('//div[@class="ep_elementcontact"]')
    if len(cdiv):
        addif(data, u"RSS",
              [unicode(urljoin(BASE_URL, x.get("href")), "utf8") for x in cdiv[0].xpath('.//li[@class="ep_rss"]//a')])
        addif(data, u"Homepage",
              [unicode(x.get("href"), "utf8") for x in cdiv[0].xpath('.//li[@class="ep_website"]//a')])
        addif(data, u"Mail",
              [decodemail(unws(x)) for x in cdiv[0].xpath('.//li[@class="ep_email"]//text()') if len(unws(x))])
    for span in root.xpath('//div[@id="contextzone"]//span[@class="ep_title"]'):
        title = unws("".join(span.xpath(".//text()")))
        if title in ["Accredited assistants", "Local assistants"]:
            if not "assistants" in data:
                data["assistants"] = {}
            addif(data["assistants"], title.lower().split()[0],
                  [unws(x) for x in span.xpath("../../..//li/div/text()")])
    addif(data, u"Addresses", getAddress(root))
    for div in root.xpath('//div[@class="ep_content"]'):
        key = unws(u"".join(div.xpath('.//span[@class="ep_title"]/text()')))
        if not len(key):
            continue
        elif key.lower() == "curriculum vitae":
            data[u"CV"] = [unws(x) for x in div.xpath('.//div[@class="ep_elementtext"]//li/div/text()')]
        elif key in ["Member", "Substitute", "Chair", "Vice-Chair", "Co-President", "President", "Vice-President"]:
            for span in div.xpath('.//span[@class="commission_label"]'):
                item = {u"role": key,
                        u"abbr": unws("".join(span.xpath(".//text()"))),
                        u"Organization": unws(span.tail)}
                for start, field in orgmaps:
                    if item["abbr"] in COMMITTEE_MAP or item["Organization"].startswith(start):
                        if not field in data:
                            data[field] = []
                        if field == "Committees" and item["Organization"] in COMMITTEE_MAP:
                            item[u"committee_id"] = COMMITTEE_MAP[item["Organization"]]
                        data[field].append(item)
                        break
        else:
            logger.error("[!] unknown field %s" % key)
    return data
def scrape(url):
    try:
        logger.info('scrape '+url)
        tree=fetch(url)
        agents,committees=scrape_actors(tree)
        forecasts=lst2obj((tree.xpath('//table[@id="forecast"]') or [None])[0],forecastFields)
        events=scrape_events(tree)
        procedure=scrape_basic(tree)
        if not procedure: return
        ipext=[]
        for ipexd in IPEXMAP.get(procedure['reference'], {}).get('Dates',[]):
            skip=False
            for event in forecasts+events:
                if event['type'] in ipexevents.get(ipexd['type'],{}).get('oeil',[]) and event['date']==ipexd['date']:
                    skip=True
                    break
            if skip: continue
            ipext.append(ipexd)
        allevents=agents+scrape_docs(tree)+events+forecasts+ipext
        other=[x for x in allevents if not x.get('date')]
        allevents=sorted([x for x in allevents if x.get('date')],key=itemgetter('date'))
        allevents=merge_events(allevents,committees, agents)
        res={u'meta': {'source': url,
                       'timestamp': datetime.datetime.utcnow() },
             u'procedure': procedure,
             u'links': form2obj((tree.xpath('//table[@id="external_links"]') or [None])[0]),
             u'committees': committees,
             u'activities': sorted(allevents, key=itemgetter('date')),
             u'other': other,
             }
        tmp=url.split('id=')
        if len(tmp)>1:
            res['meta']['id']=int(tmp[1])
        # check for "final act"
        finalas=tree.xpath('//div[@id="final_act"]//a')
        final={}
        for link in finalas:
            if link.get('class')=='sumbutton':
                try:
                    summary=fetch("http://www.europarl.europa.eu%s" % link.get('href'))
                except:
                    continue
                final['text']=[unicode(tostring(x)) for x in summary.xpath('//div[@id="summary"]')]
            else:
                if not 'docs' in final: final['docs']=[]
                final['docs'].append({'title': link.xpath('text()')[0].strip(),
                                      'url': link.get('href')})
        if final and final.get('docs'):
            res[u'procedure'][u'final']=final.get('docs',[{}])[0]
            for item in res['activities']:
                if item.get('type')==u'Final act published in Official Journal':
                    if final.get('text'):
                        item[u'text']=final['text']
                    if len(final.get('docs'))>1:
                        if not 'docs' in item:
                            item[u'docs']=final['docs']
                        else:
                            item[u'docs'].extend(final['docs'])
                    break
        return res
    except:
        logger.error("%s\n%s" % (url,traceback.format_exc()))
        return
def getMEPDeclarations(id):
    try:
        dom = fetch("http://www.europarl.europa.eu/meps/en/%s/_declarations.html" % (id), ignore=[500])
    except Exception, e:
        logger.error("mepdeclaration %s" % e)
        return []
def getMEPGender(id):
    try:
        mepraw=fetch("http://www.europarl.europa.eu/meps/fr/%s/_home.html" % (id), ignore=[500])
    except Exception, e:
        logger.error("mepgender %s" % e)
        return 'n/a'
def parseMember(userid):
    url='http://www.europarl.europa.eu/meps/en/%s/_history.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url, ignore=[500])
    data = {
        u'active': False,
        u'Photo': unicode(urljoin(BASE_URL,"/mepphoto/%s.jpg" % userid)),
        u'meta': {u'url': url}
        }
    mepdiv=root.xpath('//div[@class="zone_info_mep_transparent_mep_details"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(unws(' '.join(mepdiv.xpath('.//li[@class="mep_name"]//text()'))))
    borntxt=mepdiv.xpath('.//span[@class="more_info"]/text()')
    if len(borntxt)>0:
        if unws(borntxt[-1]).startswith('Date of death:'):
            try:
                data[u'Death'] = datetime.strptime(unws(borntxt[-1]), u"Date of death: %d %B %Y")
            except ValueError:
                logger.warn('[!] failed to scrape birth data %s' % url)
                logger.warn(traceback.format_exc())
            tmp = borntxt[-2].split(',', 1)
        else:
            tmp = borntxt[-1].split(',', 1)
        if len(tmp)==2:
            (d, p) = tmp
        else:
            d,p = tmp[0], None
        try:
            data[u'Birth'] = { u'date': datetime.strptime(unws(d), u"Date of birth: %d %B %Y")}
        except ValueError:
            logger.warn(traceback.format_exc())
        finally:
            if p:
                if 'Birth' in data:
                    data[u'Birth'][u'place'] = unws(p)
                else:
                    data[u'Birth'] = unws(p)
    else:
        logger.warn('[!] no birth data %s' % url)

    # scrape stuff from right column
    addif(data,u'RSS',[unicode(urljoin(BASE_URL,x.get('href')),'utf8')
                       for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_rss"]')])
    addif(data,u'Homepage',[x.get('href')
                            for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_website"]')])
    addif(data,u'Twitter',[x.get('href')
                           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_twitt"]')])
    addif(data,u'Facebook',[x.get('href')
                            for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_fb"]')])
    addif(data,u'Mail',[x.get('href')[7:].replace('[dot]','.').replace('[at]','@')[::-1]
                        for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_email"]')])
    # contact information
    for span in root.xpath('//div[@id="content_right"]//h3'):
        title=unws(''.join(span.xpath('.//text()')))
        if title == "Contacts":
            addif(data,u'Addresses',getAddress(span))
    # scrape main content
    for section in root.xpath('//div[@id="content_left"]/div[@class="boxcontent nobackground"]/h4'):
        key=unws(''.join(section.xpath('.//text()')))
        if key=="National parties":
            # constituencies
            key='Constituencies'
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, party = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if not key in data: data[key]=[]
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                cstart = party.rfind(' (')
                if party[cstart+2:-1] in SEIRTNUOC:
                    country = party[cstart+2:-1]
                    party = party[:cstart]
                else:
                    logger.warn('unknown country: %s' % party[cstart+2:-1])
                    country='unknown'
                #print etree.tostring(constlm, pretty_print=True)
                data[key].append({
                    u'party': party,
                    u'country': country,
                    u'start': datetime.strptime(unws(start), u"%d.%m.%Y"),
                    u'end': datetime.strptime(unws(end), u"%d.%m.%Y"),
                    })
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President', 'Observer', 'Quaestor']:
            # memberships in various committees, delegations and EP mgt
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, org = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                item={u'role': key,
                      u'abbr': COMMITTEE_MAP.get(org),
                      u'Organization': org,
                      u'start': datetime.strptime(unws(start), u"%d.%m.%Y"),
                      u'end': datetime.strptime(unws(end), u"%d.%m.%Y"),
                      }
                for start, field in orgmaps:
                    if item['abbr'] in COMMITTEE_MAP or item['Organization'].startswith(start):
                        if not field in data: data[field]=[]
                        if field=='Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id']=COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        elif key == u'Political groups':
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                interval, org = line.split(' : ',1)
                tmp = org.split(u' - ')
                if len(tmp)>1:
                    org = ' - '.join(tmp[:-1])
                    role = tmp[-1]
                elif org.endswith(' -'):
                    org=org[:-2]
                    role=''
                else:
                    logger.error('[!] political group line %s' % line)
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                if not u'Groups' in data: data[u'Groups']=[]
                data[u'Groups'].append(
                    {u'role': role,
                     u'Organization': org,
                     u'country': COUNTRIES.get(unws(constlm.get('class')).upper(), 'unknown country: %s' % unws(constlm.get('class'))),
                     u'groupid': group_map[org],
                     u'start': datetime.strptime(unws(start), u"%d.%m.%Y"),
                     u'end': datetime.strptime(unws(end), u"%d.%m.%Y"),
                     })
        else:
            logger.error('[!] unknown field %s' % key)

    # sort all lists in descending order
    for fld in ['Constituencies', 'Groups', 'Committees', 'Delegations', 'Staff']:
        if not fld in data: continue
        data[fld]=sorted(data[fld],
                         key=lambda x: x.get('end',x['start']),
                         reverse=True)

    # get CV - page (is on separate http path :/)
    cvurl='http://www.europarl.europa.eu/meps/en/%s/_cv.html' % userid
    root = fetch(cvurl, ignore=[500])
    data[u'CV']=[unws(x) for x in root.xpath('//p[@class="details_cv"]/text()')]

    # get assistants also on a separate page :/
    assurl='http://www.europarl.europa.eu/meps/en/%s/_assistants.html' % userid
    root = fetch(assurl, ignore=[500])
    for h3 in root.xpath('//h3[@id="section"]'):
        title=unws(''.join(h3.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'], title.lower().split()[0],
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])
        elif title in ['Accredited assistants (grouping)', 'Local assistants (grouping)',
                       'Service providers', ' Trainees', 'Paying agents (grouping)', 'Paying agents']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'], title.lower(),
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])
    return data
def scrape(url, comid):
    root=fetch(url)
    lines=[x for x in root.xpath('//td[@class="contents"]/div/*') if unws(' '.join(x.xpath('.//text()')))]
    if not len(lines): return
    if not unws(' '.join(lines[2].xpath('.//text()')))=='DRAFT AGENDA':
        logger.error("NOT DRAFT AGENDA %s" % unws(' '.join(lines[2].xpath('.//text()'))))
    agenda={u'committee': comid,
            u'committee_full': unws(' '.join(lines[0].xpath('.//text()'))),
            u'src': url,
            }
    i=1
    if unws(' '.join(lines[3].xpath('.//text()')))=="INTERPARLIAMENTARY COMMITTEE MEETING":
        logger.warn("skipping interparl com meet")
        return
    if unws(' '.join(lines[6].xpath('.//text()'))).startswith('Room'):
        agenda.update({u'docid': unws(' '.join(lines[1].xpath('.//text()'))),
                       u'type': unws(' '.join(lines[3].xpath('.//text()'))),
                       u'time': toTime(unws(' '.join(lines[4].xpath('.//text()')))),
                       u'city': unws(' '.join(lines[5].xpath('.//text()'))),
                       u'room': unws(' '.join(lines[6].xpath('.//text()')))[6:],
                       })
        i=7
    itemcnt=0
    item={}
    schedule=None
    res=[]
    while i < len(lines):
        line=lines[i]
        i+=1
        txt=unws(' '.join(line.xpath('.//text()')))
        if txt in ['* * *', '***']:
            continue # skip end of schedule block
        # 20 December 2011, 16.00 – 16.30
        tmp=toTime(txt)
        if tmp:
            schedule=tmp
            if i<len(lines) and unws(' '.join(lines[i].xpath('.//text()'))) == 'In camera':
                schedule[u'incamera']=True
                i+=1
            continue
        if line.tag=='div':
            item[u'actors']=getactors(line)
            continue
        firsttoken=txt.split()[0]
        # 6. Alternative dispute resolution for consumer disputes and
        #    amending Regulation (EC) No 2006/2004 and Directive
        #    2009/22/EC (Directive on consumer ADR)
        if firsttoken[-1]=='.' and firsttoken[:-1].isdigit() and itemcnt+1==int(firsttoken[:-1]):
            if item: res.append(item)
            itemcnt+=1
            item=copy.deepcopy(agenda)
            item.update({u'title': ' '.join(txt.split()[1:]),
                         u'seq_no': itemcnt,})
            if schedule:
                item.update(schedule)
            continue
        # trailing list of "details"
        # · Presentation by the Commission of the proposal & Impact Assessment
        # · Exchange of views
        if firsttoken==u"·":
            if not 'list' in item: item[u'list']=[]
            tmp=' '.join(txt.split()[1:])
            if tmp.startswith('Deadline for tabling amendments:'):
                try:
                    item[u'tabling_deadline']=datetime.strptime(tmp.split(':')[1].strip(),"%d %B %Y, %H.%M")
                except ValueError:
                    try:
                        item[u'tabling_deadline']=datetime.strptime(tmp.split(':')[1].strip(),"%d.%m.%Y at %H.%M")
                    except:
                        logger.warn('[$] unknown tabling deadline format %s' % unws(tmp))
            item[u'list'].append(tmp)
            continue
        # committee dossier
        # IMCO/7/08130
        if txt.startswith("%s/7/" % comid) and len(txt)==12:
            item[u'comdossier']=txt
            continue
        # ***I 2011/0373(COD) COM(2011)0793 – C7-0454/2011
        tmp=getdocs(txt)
        if tmp:
            item.update(tmp)
            continue
        # fall-through line
        logger.debug("(falltrough) %s %s" % (line.tag, txt.encode('utf8')))
    if item: res.append(item)
    return res
def scrape(url):
    try:
        logger.info('scrape ' + url)
        tree = fetch(url)
        agents, committees = scrape_actors(tree)
        forecasts = lst2obj((tree.xpath('//table[@id="forecast"]') or [None])[0], forecastFields)
        events = scrape_events(tree)
        procedure = scrape_basic(tree)
        ipext = []
        for ipexd in (IPEXMAP[procedure['reference']] or {}).get('Dates', []):
            skip = False
            for event in forecasts + events:
                if event['type'] == ipexevents.get(ipexd['type'], {}).get('oeil', 'asdf') and event['date'] == ipexd['date']:
                    skip = True
                    break
            if skip:
                continue
            ipext.append(ipexd)
        allevents = agents + scrape_docs(tree) + events + forecasts + ipext
        other = [x for x in allevents if not x.get('date')]
        allevents = sorted([x for x in allevents if x.get('date')], key=itemgetter('date'))
        allevents = merge_events(allevents, committees)
        res = {
            u'meta': {'source': url,
                      'id': int(url.split('id=')[1]),
                      'timestamp': datetime.datetime.utcnow()},
            u'procedure': procedure,
            u'links': form2obj((tree.xpath('//table[@id="external_links"]') or [None])[0]),
            u'committees': committees,
            u'activities': sorted(allevents, key=itemgetter('date')),
            u'other': other,
        }
        # check for "final act"
        finalas = tree.xpath('//div[@id="final_act"]//a')
        final = {}
        for link in finalas:
            if link.get('class') == 'sumbutton':
                try:
                    summary = fetch("http://www.europarl.europa.eu%s" % link.get('href'))
                except:
                    continue
                final['text'] = [tostring(x) for x in summary.xpath('//div[@id="summary"]')]
            else:
                if not 'docs' in final:
                    final['docs'] = []
                final['docs'].append({'title': link.xpath('text()')[0].strip(),
                                      'url': link.get('href')})
        if final and final.get('docs'):
            res[u'procedure'][u'final'] = final.get('docs', [{}])[0]
            for item in res['activities']:
                if item.get('type') == u'Final act published in Official Journal':
                    if final.get('text'):
                        item[u'text'] = final['text']
                    if len(final.get('docs')) > 1:
                        if not 'docs' in item:
                            item[u'docs'] = final['docs']
                        else:
                            item[u'docs'].extend(final['docs'])
                    break
        return res
    except:
        logger.error("%s\n%s" % (url, traceback.format_exc()))
        return
def scrape(url):
    try:
        logger.info("scrape " + url)
        tree = fetch(url)
        agents, committees = scrape_actors(tree)
        forecasts = lst2obj((tree.xpath('//table[@id="forecast"]') or [None])[0], forecastFields)
        events = scrape_events(tree)
        procedure = scrape_basic(tree)
        ipext = []
        for ipexd in (IPEXMAP[procedure["reference"]] or {}).get("Dates", []):
            skip = False
            for event in forecasts + events:
                if (event["type"] == ipexevents.get(ipexd["type"], {}).get("oeil", "asdf")
                        and event["date"] == ipexd["date"]):
                    skip = True
                    break
            if skip:
                continue
            ipext.append(ipexd)
        allevents = agents + scrape_docs(tree) + events + forecasts + ipext
        other = [x for x in allevents if not x.get("date")]
        allevents = sorted([x for x in allevents if x.get("date")], key=itemgetter("date"))
        allevents = merge_events(allevents, committees)
        res = {
            u"meta": {"source": url,
                      "id": int(url.split("id=")[1]),
                      "timestamp": datetime.datetime.utcnow()},
            u"procedure": procedure,
            u"links": form2obj((tree.xpath('//table[@id="external_links"]') or [None])[0]),
            u"committees": committees,
            u"activities": sorted(allevents, key=itemgetter("date")),
            u"other": other,
        }
        # check for "final act"
        finalas = tree.xpath('//div[@id="final_act"]//a')
        final = {}
        for link in finalas:
            if link.get("class") == "sumbutton":
                try:
                    summary = fetch("http://www.europarl.europa.eu%s" % link.get("href"))
                except:
                    continue
                final["text"] = [tostring(x) for x in summary.xpath('//div[@id="summary"]')]
            else:
                if not "docs" in final:
                    final["docs"] = []
                final["docs"].append({"title": link.xpath("text()")[0].strip(),
                                      "url": link.get("href")})
        if final and final.get("docs"):
            res[u"procedure"][u"final"] = final.get("docs", [{}])[0]
            for item in res["activities"]:
                if item.get("type") == u"Final act published in Official Journal":
                    if final.get("text"):
                        item[u"text"] = final["text"]
                    if len(final.get("docs")) > 1:
                        if not "docs" in item:
                            item[u"docs"] = final["docs"]
                        else:
                            item[u"docs"].extend(final["docs"])
                    break
        return res
    except:
        logger.error("%s\n%s" % (url, traceback.format_exc()))
        return