def scrape_docs(tree):
    res=[]
    docs=tree.xpath('//table[@id="doc_gateway"]')
    tabs=[x.xpath('preceding-sibling::h2')[0].xpath('text()')[0] for x in docs]
    for inst, table in izip(tabs, docs):
        if inst in instmap.keys():
            for doc in lst2obj(table, docFields):
                if inst != 'Other institutions':
                    doc[u'body']=instmap[inst]
                else:
                    try:
                        doc[u'body']=otherinst[doc['type'].split(':')[0]]
                    except KeyError:
                        doc[u'body']=''
                if doc['body'] in ['EP','CSL'] and doc['type']=='Joint text approved by Conciliation Committee co-chairs':
                    # skip it twice and hope it's listed in the all documents, so it becomes EP/CSL :)
                    continue
                if doc.get('text'):
                    try: summary=fetch(doc['text']['url'])
                    except: continue
                    doc[u'text']=[unicode(tostring(x)) for x in summary.xpath('//div[@id="summary"]')]
                res.append(doc)
        elif inst != 'All':
            logger.warn(u"[!] unrecognized tab in documents %s" % inst)
    return res
def save(data, stats):
    if not data: return stats
    res=db.eurlex.find_one({ 'id.celexid' : data['id']['celexid'] }) or {}
    d=diff(dict([(k,v) for k,v in res.items() if not k in ['_id', 'meta', 'changes']]),
           dict([(k,v) for k,v in data.items() if not k in ['_id', 'meta', 'changes']]))
    if d:
        now=unicode(datetime.utcnow().replace(microsecond=0).isoformat())
        if not res:
            logger.info(('adding %s' % (data['id']['celexid'])).encode('utf8'))
            data['meta']['created']=now
            if stats: stats[0]+=1
        else:
            logger.info(('updating %s' % (data['id']['celexid'])).encode('utf8'))
            logger.warn(d)
            data['meta']['updated']=now
            if stats: stats[1]+=1
            data['_id']=res['_id']
        data['changes']=res.get('changes',{})
        data['changes'][now]=d
        db.eurlex.save(data)
    if stats: return stats
    else: return data
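
# The diff() helper that save() relies on is imported elsewhere in this module; only its
# contract matters here: return something falsy when the stored and freshly scraped
# documents agree, and a loggable/storable value otherwise. The stand-in below (and its
# name diff_sketch) is a hypothetical illustration of that contract, not the project's
# actual implementation.
def diff_sketch(old, new):
    """List (key, old_value, new_value) triples for every key whose value changed."""
    keys = set(old.keys()) | set(new.keys())
    return [(k, old.get(k), new.get(k)) for k in sorted(keys) if old.get(k) != new.get(k)]
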
def scrape(celexid, path):
    logger.info("scraping %s%s:NOT" % (EURLEXURL,celexid))
    path.reverse()
    (code,lang)=celexid.split(":")[1:3]
    st=6
    if len(code)>6:
        if code[6].isalpha(): st=7
        eurlex={'id': {u'celexid': celexid,
                       u'sector': code[0],
                       u'year': code[1:5],
                       u'doctype': code[5:st],
                       u'refno': code[st:],
                       u'lang': lang,
                       u'chapter': path,
                       }}
    else:
        eurlex={'id': {u'celexid': celexid,
                       u'sector': code[0],
                       u'year': code[1:5],
                       u'doctype': code[5:6],
                       u'lang': lang,
                       u'chapter': path,
                       }}
    try:
        eurlex['id'][u'typeDesc']= CELEXCODES[code[0]]['Document Types'][code[5:st]] if code[5:st] != 'C' else CELEXCODES[code[0]]['Sector']
    except:
        eurlex['id'][u'typeDesc']= u"Unknown"
        logger.warn("[!] unknown typedesc %s" % celexid)
    eurlex['meta']={u'src': "%s%s:NOT" % (EURLEXURL,celexid)}
    root = fetch("%s%s:NOT" % (EURLEXURL,celexid))
    if len(root.xpath('//h1[text()="No documents matching criteria."]'))>0:
        logger.warn('[!] nothing to scrape here: %s', "%s%s:NOT" % (EURLEXURL,celexid))
        return
    eurlex[u'title'] = root.xpath('//h2[text()="Title and reference"]/following-sibling::p/text()')[0]
    # dates
    dates=root.xpath('//h2[text()="Dates"]/following-sibling::ul/text()')
    for y in dates:
        if not unws(y): continue
        title, rest=unws(y).split(": ",1)
        item={u'type': title}
        date=rest[:10]
        tail=rest[10:]
        if tail.startswith('; '):
            tail=tail[2:]
        if date=='99/99/9999': item[u'date']= datetime(9999,12,31)
        elif date=='00/00/0000': item[u'date']= datetime(0001,01,01)
        elif date=='//': continue
        else:
            try: item[u'date']= datetime.strptime(date, u"%d/%m/%Y")
            except ValueError:
                try: item[u'date']= datetime.strptime(date, u"%m/%d/%Y")
                except: pass
        if len(tail): item['note']=tail
        try:
            eurlex['dates'].append(item)
        except:
            eurlex['dates']=[item]
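
# Hedged illustration of the CELEX-id slicing performed by scrape() above: sector is the
# first character, year the next four, then a one- or two-letter doctype, then the
# reference number. The helper name split_celex and the sample id below are assumptions
# chosen only to make the slices visible, not code or data from this project.
def split_celex(celexid):
    code, lang = celexid.split(":")[1:3]
    st = 7 if len(code) > 6 and code[6].isalpha() else 6
    return {'sector': code[0], 'year': code[1:5],
            'doctype': code[5:st], 'refno': code[st:], 'lang': lang}

# split_celex("CELEX:32012R1215:EN")
# -> {'sector': '3', 'year': '2012', 'doctype': 'R', 'refno': '1215', 'lang': 'EN'}
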
def scrape_actors(tree):
    insts=tree.xpath('//td[@class="players_institution" or @class="players_institution inst_separator"]')
    agents=[]
    meps=[]
    for inst in insts:
        inst_name=''.join([x.strip() for x in inst.xpath('.//text()')])
        for table in inst.xpath('following-sibling::td/table'):
            if inst_name == 'European Parliament':
                meps.extend([x for x in scrape_epagents(table) if x not in meps])
            # Handle council
            elif inst_name == 'Council of the European Union':
                for agent in lst2obj(table, cslagents, 1):
                    agent[u'body']=u'CSL'
                    agent[u'type']=u'Council Meeting'
                    agents.append(agent)
            # and commission
            elif inst_name == 'European Commission':
                for p in table.xpath('.//p[@class="players_head"]'):
                    p.getparent().remove(p)
                for agent in lst2obj(table, ecagents, 0):
                    if len(agent['dg'])==len(agent['commissioner']):
                        for dg,cmnr in izip(agent['dg'], agent['commissioner']):
                            agent[u'body']=u'EC'
                            agents.append({u'body': u'EC', u'dg': dg, u'commissioner': cmnr})
                    else:
                        logger.warn("commission data wrong: %s" % (agent))
            else:
                logger.warn("[!] wrong institution name %s" % inst_name)
    return (agents, sorted(meps,key=itemgetter('committee')))
def save(data, stats):
    res=db.ep_meps2.find_one({ 'UserID' : data['UserID'] }) or {}
    if 'Gender' not in data and 'Gender' in res: data['Gender']=res['Gender']
    d=diff(dict([(k,v) for k,v in res.items() if not k in ['_id', 'meta', 'changes', 'activities',]]),
           dict([(k,v) for k,v in data.items() if not k in ['_id', 'meta', 'changes', 'activities',]]))
    if d:
        now=datetime.utcnow().replace(microsecond=0)
        if not res:
            logger.info(('adding %s' % (data['Name']['full'])).encode('utf8'))
            data['meta']['created']=now
            if stats: stats[0]+=1
        else:
            logger.info(('updating %s' % (data['Name']['full'])).encode('utf8'))
            logger.warn(jdump(d))
            data['meta']['updated']=now
            if stats: stats[1]+=1
            data['_id']=res['_id']
        data['changes']=res.get('changes',{})
        data['changes'][now.isoformat()]=d
        db.ep_meps2.save(data)
    del res
    if stats:
        del data
        return stats
    else: return data
def getMEPRef(name, retfields=['_id']):
    if not name: return
    mep=db.ep_meps2.find_one({'Name.aliases': ''.join(name.split()).lower()},retfields)
    if not mep and u'ß' in name:
        mep=db.ep_meps2.find_one({'Name.aliases': ''.join(name.replace(u'ß','ss').split()).lower()},retfields)
    if not mep and unicodedata.normalize('NFKD', unicode(name)).encode('ascii','ignore')!=name:
        mep=db.ep_meps2.find_one({'Name.aliases': ''.join(unicodedata.normalize('NFKD', unicode(name)).encode('ascii','ignore').split()).lower()},retfields)
    if not mep and len([x for x in name if ord(x)>128]):
        mep=db.ep_meps2.find_one({'Name.aliases': re.compile(''.join([x if ord(x)<128 else '.' for x in name]),re.I)},retfields)
    if mep:
        return mep['_id']
    else:
        logger.warn('[!] lookup oops %s' % name.encode('utf8'))
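
# getMEPRef() above tries progressively fuzzier lookup keys against the Name.aliases index.
# The helper below is a hypothetical sketch of the first three keys it builds (whitespace
# stripped and lower-cased, then ß folded to ss, then accents dropped); the final
# regex-based fallback is omitted. The function name alias_keys is an assumption made for
# illustration only.
import unicodedata

def alias_keys(name):
    exact = ''.join(name.split()).lower()
    no_sharp_s = ''.join(name.replace(u'ß', 'ss').split()).lower()
    ascii_only = ''.join(unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('ascii').split()).lower()
    return [exact, no_sharp_s, ascii_only]
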
def getactors(node):
    res={}
    ax=[None,[]]
    for row in node.xpath('.//tr'):
        cells=row.xpath('./td/p')
        if not cells: continue
        # get role Rapporteur|Responsible|Rapporteur for the opinion|Opinions
        role=cells[0].xpath('text()')
        if role and unws(role[0]):
            if ax[0] and ax[1]:
                res[ax[0]]=sorted(ax[1])
            tmp=unws(role[0])[:-1]
            if tmp=="Rapporteur for the opinion":
                tmp="Rapporteur"
            ax=[tmp,[]]
        tmp=unws((cells[1].xpath('text()') or [None])[0])
        if ax[0] in ["Rapporteur", "Rapporteur for the opinion"] and tmp:
            name=' '.join(tmp.split()[:-1])
            item={u'group': tmp.split()[-1][1:-1],
                  u'name': name,
                  u'mepref': getMEPRef(name) }
            if len(cells)>2:
                item[u'docs']=getdoclist(cells[2])
            ax[1].append(item)
            continue
        if ax[0] in ["Opinions", "Responsible"] and tmp:
            tmp1=tmp.split(u' –',1)
            if len(tmp1)==2:
                (comid, rest)=tmp1
            elif len(tmp1)==1:
                skip=False
                for com in tmp.split(' ,'):
                    if com in COMMITTEE_MAP and len(com)==4:
                        ax[1].append({u'comid': com})
                        skip=True
                if skip: continue
                else:
                    logger.warn("[!] unknown committee: %s" % tmp)
                    raise
            item={u'comid': comid}
            if rest==' Decision: no opinion':
                item[u'response']=u'Decision: no opinion'
            if not rest and len(comid)>4:
                for com in comid.split(', '):
                    ax[1].append({u'comid': com})
                continue
            if len(cells)>2:
                tmp=unws((cells[2].xpath('text()') or [None])[0])
                if tmp:
                    name=' '.join(tmp.split()[:-1])
                    item.update({u'group': tmp.split()[-1][1:-1],
                                 u'name': name,
                                 u'mepref': getMEPRef(name)})
            if len(cells)>3:
                item[u'docs']=getdoclist(cells[3])
            ax[1].append(item)
    if ax[0] and ax[1]:
        res[ax[0]]=sorted(ax[1])
    return res
def crawl(saver=jdump, null=False):
    for celexid, data in sources("%s/index.htm" % crawlroot, []):
        if (null and db.eurlex.find_one({'id.celexid': celexid},['_id'])==None) or not null:
            try:
                tmp = saver(scrape(celexid, data),[0,0])
            except:
                logger.warn("[!] failed to scrape %s" % celexid)
                continue
            yield tmp
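
# Hedged usage sketch: crawl() yields whatever its saver callable returns for each document,
# so a dry run can pass a no-op saver instead of the persisting save() defined earlier. The
# function name dry_run and this wiring are assumptions to show the expected call shape,
# not code from this project.
def dry_run():
    fetched = 0
    for _doc in crawl(saver=lambda data, stats: data):  # skip persistence, just return the scraped dict
        fetched += 1
    logger.info("dry run scraped %s eurlex documents" % fetched)
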
def seqcrawler(saver=jdump):
    stats=[0,0]
    for u, com in getComAgendas():
        try:
            saver(scrape(u,com), stats)
        except:
            # ignore failed scrapes
            logger.warn("[!] failed to scrape: %s" % u)
            logger.warn(traceback.format_exc())
    logger.info("[o] added/updated: %s/%s" % (stats[0],stats[1]))
def crawler(saver=jdump, update=False):
    stats=[0,0]
    for pdf, rapporteur in getComAms(update=update):
        logger.info(datetime.now().isoformat()+" "+pdf)
        ctr=[0,0]
        try:
            saver(scrape(pdf, rapporteur), ctr)
        except:
            # ignore failed scrapes
            logger.warn("[!] %s failed to scrape: %s" % (datetime.now().isoformat(), pdf))
            #logger.warn(traceback.format_exc())
            raise
        logger.info("%s [i] added/updated: %s/%s" % (datetime.now().isoformat(), ctr[0],ctr[1]))
        stats[0]+=ctr[0]
        stats[1]+=ctr[1]
    logger.info("%s [o] total added/updated: %s/%s" % (datetime.now().isoformat(),stats[0],stats[1]))
def save(data, stats):
    res=db.ep_meps2.find_one({ 'UserID' : data['UserID'] }) or {}
    d=diff(dict([(k,v) for k,v in res.items() if not k in ['_id', 'meta', 'changes']]),
           dict([(k,v) for k,v in data.items() if not k in ['_id', 'meta', 'changes',]]))
    if d:
        now=unicode(datetime.utcnow().replace(microsecond=0).isoformat())
        if not res:
            logger.info(('adding %s' % (data['Name']['full'])).encode('utf8'))
            data['meta']['created']=now
            stats[0]+=1
        else:
            logger.info(('updating %s' % (data['Name']['full'])).encode('utf8'))
            logger.warn(d)
            data['meta']['updated']=now
            stats[1]+=1
            data['_id']=res['_id']
        data['changes']=res.get('changes',{})
        data['changes'][now]=d
        db.ep_meps2.save(data)
    return stats
def parseMember(userid):
    url = "http://www.europarl.europa.eu/meps/en/%s/get.html" % userid
    logger.info("scraping %s" % url)
    root = fetch(url, ignore=[500])
    data = {u"active": False, "meta": {u"url": url}}  # return {'active': False}
    mepdiv = root.xpath('//div[@class="ep_elementpeople2"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u"Name"] = mangleName(unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[0]))
    data[u"Photo"] = unicode(urljoin(BASE_URL, mepdiv.xpath('.//span[@class="ep_img"]/img')[0].get("src")), "utf8")
    borntxt = mepdiv.xpath('.//div[@class="ep_elementtext"]/p/text()')
    if len(borntxt) > 0:
        (d, p) = borntxt[0].split(",", 1)
        try:
            data[u"Birth"] = {u"date": datetime.strptime(unws(d), u"Born on %d %B %Y"), u"place": unws(p)}
        except ValueError:
            logger.warn("[!] failed to scrape birth data %s" % url)
            logger.warn(traceback.format_exc())
    else:
        logger.warn("[!] no birth data %s" % url)
    const = {u"country": unws(mepdiv.xpath('.//span[@class="ep_country"]/text()')[0]), u"start": datetime(2009, 7, 14)}
    data[u"Constituencies"] = [const]
    try:
        data[u"party"] = unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[1])
    except IndexError:
        pass
    else:
        group = unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[0])
        try:
            role = unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[1])
        except IndexError:
            role = u"Member"
        data[u"Groups"] = [{u"role": role, u"Organization": group, u"groupid": group_map[group]}]
    cdiv = root.xpath('//div[@class="ep_elementcontact"]')
    if len(cdiv):
        addif(data, u"RSS", [unicode(urljoin(BASE_URL, x.get("href")), "utf8") for x in cdiv[0].xpath('.//li[@class="ep_rss"]//a')])
        addif(data, u"Homepage", [unicode(x.get("href"), "utf8") for x in cdiv[0].xpath('.//li[@class="ep_website"]//a')])
        addif(data, u"Mail", [decodemail(unws(x)) for x in cdiv[0].xpath('.//li[@class="ep_email"]//text()') if len(unws(x))])
    for span in root.xpath('//div[@id="contextzone"]//span[@class="ep_title"]'):
        title = unws("".join(span.xpath(".//text()")))
        if title in ["Accredited assistants", "Local assistants"]:
            if not "assistants" in data:
                data["assistants"] = {}
            addif(data["assistants"], title.lower().split()[0], [unws(x) for x in span.xpath("../../..//li/div/text()")])
    addif(data, u"Addresses", getAddress(root))
    for div in root.xpath('//div[@class="ep_content"]'):
        key = unws(u"".join(div.xpath('.//span[@class="ep_title"]/text()')))
        if not len(key):
            continue
        elif key.lower() == "curriculum vitae":
            data[u"CV"] = [unws(x) for x in div.xpath('.//div[@class="ep_elementtext"]//li/div/text()')]
        elif key in ["Member", "Substitute", "Chair", "Vice-Chair", "Co-President", "President", "Vice-President"]:
            for span in div.xpath('.//span[@class="commission_label"]'):
                item = {u"role": key, u"abbr": unws("".join(span.xpath(".//text()"))), u"Organization": unws(span.tail)}
                for start, field in orgmaps:
                    if item["abbr"] in COMMITTEE_MAP or item["Organization"].startswith(start):
                        if not field in data:
                            data[field] = []
                        if field == "Committees" and item["Organization"] in COMMITTEE_MAP:
                            item[u"committee_id"] = COMMITTEE_MAP[item["Organization"]]
                        data[field].append(item)
                        break
        else:
            logger.error("[!] unknown field %s" % key)
    return data
def getMEPGender(id):
    try:
        mepraw=fetch("http://www.europarl.europa.eu/meps/fr/%s/_home.html" % (id), ignore=[500])
    except Exception, e:
        logger.error("mepgender %s" % e)
        return 'n/a'
    borntxt=mepraw.xpath('//div[@class="zone_info_mep_transparent_mep_details"]//span[@class="more_info"]/text()')
    if len(borntxt)>0:
        if unws(borntxt[-1]).startswith(u'décédé'):
            hint=borntxt[-2].replace(u"\u00A0",' ').split()[0]
        else:
            hint=borntxt[-1].replace(u"\u00A0",' ').split()[0]
        if hint==u"Née": return "F"
        elif hint==u"Né": return "M"
    logger.warn('[!] no birth/gender data http://www.europarl.europa.eu/meps/fr/%s/get.html' % id)
    return 'n/a'

def getMEPDeclarations(id):
    try:
        dom = fetch("http://www.europarl.europa.eu/meps/en/%s/_declarations.html" % (id), ignore=[500])
    except Exception, e:
        logger.error("mepdeclaration %s" % e)
        return []
    dif_links = dom.xpath('//h3[@id="sectionDIF"]/following-sibling::div//ul[@class="link_collection_noborder"]//a[@class="link_pdf"]/@href')
    dat_links = dom.xpath('//h3[@id="sectionDAT"]/following-sibling::div//ul[@class="link_collection_noborder"]//a[@class="link_pdf"]/@href')
    if not dif_links:
        logger.warn('[!] no declaration data http://www.europarl.europa.eu/meps/en/%s/_declarations.html' % id)
    return dif_links, dat_links

activitymap={"CRE" : "Speeches",
def parseMember(userid):
    url='http://www.europarl.europa.eu/meps/en/%s/_history.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url, ignore=[500])
    data = {
        u'active': False,
        u'Photo': unicode(urljoin(BASE_URL,"/mepphoto/%s.jpg" % userid)),
        u'meta': {u'url': url}
        }
    mepdiv=root.xpath('//div[@class="zone_info_mep_transparent_mep_details"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(unws(' '.join(mepdiv.xpath('.//li[@class="mep_name"]//text()'))))

    borntxt=mepdiv.xpath('.//span[@class="more_info"]/text()')
    if len(borntxt)>0:
        if unws(borntxt[-1]).startswith('Date of death:'):
            try:
                data[u'Death'] = datetime.strptime(unws(borntxt[-1]), u"Date of death: %d %B %Y")
            except ValueError:
                logger.warn('[!] failed to scrape birth data %s' % url)
                logger.warn(traceback.format_exc())
            tmp = borntxt[-2].split(',', 1)
        else:
            tmp = borntxt[-1].split(',', 1)
        if len(tmp)==2:
            (d, p) = tmp
        else:
            d,p = tmp[0], None
        try:
            data[u'Birth'] = { u'date': datetime.strptime(unws(d), u"Date of birth: %d %B %Y")}
        except ValueError:
            logger.warn(traceback.format_exc())
        finally:
            if p:
                if 'Birth' in data:
                    data[u'Birth'][u'place'] = unws(p)
                else:
                    data[u'Birth'] = unws(p)
    else:
        logger.warn('[!] no birth data %s' % url)

    # scrape stuff from right column
    addif(data,u'RSS',[unicode(urljoin(BASE_URL,x.get('href')),'utf8') for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_rss"]')])
    addif(data,u'Homepage',[x.get('href') for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_website"]')])
    addif(data,u'Twitter',[x.get('href') for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_twitt"]')])
    addif(data,u'Facebook',[x.get('href') for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_fb"]')])
    addif(data,u'Mail',[x.get('href')[7:].replace('[dot]','.').replace('[at]','@')[::-1] for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_email"]')])
    # contact information
    for span in root.xpath('//div[@id="content_right"]//h3'):
        title=unws(''.join(span.xpath('.//text()')))
        if title == "Contacts":
            addif(data,u'Addresses',getAddress(span))

    # scrape main content
    for section in root.xpath('//div[@id="content_left"]/div[@class="boxcontent nobackground"]/h4'):
        key=unws(''.join(section.xpath('.//text()')))
        if key=="National parties":
            # constituencies
            key='Constituencies'
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, party = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if not key in data: data[key]=[]
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                cstart = party.rfind(' (')
                if party[cstart+2:-1] in SEIRTNUOC:
                    country = party[cstart+2:-1]
                    party = party[:cstart]
                else:
                    logger.warn('unknown country: %s' % party[cstart+2:-1])
                    country='unknown'
                #print etree.tostring(constlm, pretty_print=True)
                data[key].append({
                    u'party': party,
                    u'country': country,
                    u'start': datetime.strptime(unws(start), u"%d.%m.%Y"),
                    u'end': datetime.strptime(unws(end), u"%d.%m.%Y"),
                    })
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President', 'Observer', 'Quaestor']:
            # memberships in various committees, delegations and EP mgt
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, org = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                item={u'role': key,
                      u'abbr': COMMITTEE_MAP.get(org),
                      u'Organization': org,
                      u'start': datetime.strptime(unws(start), u"%d.%m.%Y"),
                      u'end': datetime.strptime(unws(end), u"%d.%m.%Y"),
                      }
                for start, field in orgmaps:
                    if item['abbr'] in COMMITTEE_MAP or item['Organization'].startswith(start):
                        if not field in data: data[field]=[]
                        if field=='Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id']=COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        elif key == u'Political groups':
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                interval, org = line.split(' : ',1)
                tmp = org.split(u' - ')
                if len(tmp)>1:
                    org = ' - '.join(tmp[:-1])
                    role = tmp[-1]
                elif org.endswith(' -'):
                    org=org[:-2]
                    role=''
                else:
                    logger.error('[!] political group line %s' % line)
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                if not u'Groups' in data: data[u'Groups']=[]
                data[u'Groups'].append(
                    {u'role': role,
                     u'Organization': org,
                     u'country': COUNTRIES.get(unws(constlm.get('class')).upper(), 'unknown country: %s' % unws(constlm.get('class'))),
                     u'groupid': group_map[org],
                     u'start': datetime.strptime(unws(start), u"%d.%m.%Y"),
                     u'end': datetime.strptime(unws(end), u"%d.%m.%Y"),
                     })
        else:
            logger.error('[!] unknown field %s' % key)

    # sort all lists in descending order
    for fld in ['Constituencies', 'Groups', 'Committees', 'Delegations', 'Staff']:
        if not fld in data: continue
        data[fld]=sorted(data[fld], key=lambda x: x.get('end',x['start']), reverse=True)

    # get CV - page (is on separate http path :/)
    cvurl='http://www.europarl.europa.eu/meps/en/%s/_cv.html' % userid
    root = fetch(cvurl, ignore=[500])
    data[u'CV']=[unws(x) for x in root.xpath('//p[@class="details_cv"]/text()')]

    # get assistants also on a separate page :/
    assurl='http://www.europarl.europa.eu/meps/en/%s/_assistants.html' % userid
    root = fetch(assurl, ignore=[500])
    for h3 in root.xpath('//h3[@id="section"]'):
        title=unws(''.join(h3.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'], title.lower().split()[0], [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])
        elif title in ['Accredited assistants (grouping)', 'Local assistants (grouping)', 'Service providers', ' Trainees', 'Paying agents (grouping)', 'Paying agents']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'], title.lower(), [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])
    return data
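
# addif() is imported from the scraper's utility module and is only called, never defined,
# in this section. The parseMember() calls above merely assume the behaviour sketched here
# (store the value under key only when it is non-empty); the stand-in name addif_sketch is
# a hypothetical illustration, not the real helper.
def addif_sketch(data, key, value):
    if value:
        data[key] = value
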
def scrape(url, rapporteur=None):
    if (url in ['http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-483.680%2b02%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-454.387%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-456.679%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-494.504%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.705%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.767%2b02%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-454.385%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-465.012%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-496.504%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.724%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.721%2b02%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.723%2b03%2bDOC%2bPDF%2bV0%2f%2fEN']
        or not url.endswith('EN')):
        logger.info("skipping unparsable url")
        return []
    prolog=True
    res=[]
    block=None
    reference=None
    date=None
    committee=[]
    text=getraw(url).split('\n')
    for line in text:
        if prolog:
            if amstart.match(line):
                if reference==None:
                    logger.warn("%s [!] couldn't find ref: %s" %
                                (datetime.now().isoformat(),
                                 unws([x for x in text[:20] if unws(x)][2])))
                    # marking as scraped though
                    db.ep_ams.save({'src': url, 'error': "couldn't find reference in source pdf"})
                    return []
                if date==None or committee==[]:
                    return []
                    #raise ValueError
                block=[line]
                prolog=False
                continue

            line=unws(line)

            if not line: continue

            if line in COMMITTEE_MAP:
                committee.append(COMMITTEE_MAP[line])
                continue

            if (committee and not reference and re.match(refre, line)):
                reference=line
                if url == 'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-506.166%2b03%2bDOC%2bPDF%2bV0%2f%2fEN':
                    logger.info("adjusting reference to eudatap")
                    reference="2012/0011(COD)"
                continue

            if (reference and not date):
                try:
                    date = parse(unws(line), dayfirst=True)
                except ValueError:
                    pass
                except TypeError:
                    pass
            continue

        if amstart.match(line):
            # parse block
            res.append(parse_block(block, url, reference, date, committee, rapporteur))
            block=[line]
            continue
        block.append(line)
    if block and filter(None,block):
        res.append(parse_block(block, url, reference, date, committee, rapporteur))
    return res
def getMEPGender(id):
    try:
        mepraw = fetch("http://www.europarl.europa.eu/meps/fr/%s/get.html" % (id), ignore=[500])
    except Exception, e:
        logger.error("mepgender %s" % e)
        return "n/a"
    borntxt = mepraw.xpath('//div[@class="ep_elementpeople2"]//div[@class="ep_elementtext"]/p/text()')
    if len(borntxt) > 0:
        hint = borntxt[0].replace(u"\u00A0", " ").split()[0]
        if hint == u"Née":
            return "F"
        elif hint == u"Né":
            return "M"
    logger.warn("[!] no birth/gender data http://www.europarl.europa.eu/meps/fr/%s/get.html" % id)
    return "n/a"
def parse_block(block, url, reference, date, committee, rapporteur):
    am={u'src': url,
        u'reference': reference,
        u'date': date,
        u'committee': committee}

    #logger.info(block)
    # get title
    try:
        am[u'seq']=int(unws(block[0]).split()[1])
    except ValueError:
        am[u'seq']=unws(block[0]).split()[1]
    except IndexError:
        logger.warn("%s wrong seq %s" % (datetime.now().isoformat(), block[0]))
        am[u'seq']=unws(block[0])
    del block[0]

    strip(block)

    # find and strip justification
    i=len(block)-1
    while i>2 and not (unws(block[i])=="Justification" and block[i].startswith(' ' * 6)):
        i-=1
    if i>2:
        if i<len(block)-1 and (not unws(block[i+1]) or not block[i+1].startswith(' ') ):
            am['justification']='\n'.join(block[i+2:])
            del block[i:]
            strip(block)
        else:
            logger.warn("%s wrong justification\n%s" % (datetime.now().isoformat(), '\n'.join(block[i:])))

    # get original language
    if 4<len(unws(block[-1]))<=6 and unws(block[-1]).startswith('Or.'):
        am['orig_lang']=unws(block[-1])[4:]
        del block[-1]
        strip(block)

    # find split column new/old heading
    i=len(block)-1
    while (i>2 and
           not ((block[i].endswith(" Amendment") or
                 block[i].endswith(" PARTICULARS") or
                 block[i].endswith(" Remedy") or
                 block[i].endswith(" Amended text") or
                 block[i].endswith(" Amendement") or
                 block[i].endswith(" Amendments by Parliament") or
                 block[i].endswith(" Proposal for rejection") or
                 block[i].endswith(" Proposal for a rejection") or
                 block[i].endswith(" Does not affect English version") or
                 block[i].endswith(" (Does not affect English version)") or
                 block[i].endswith(" Amendment by Parliament")) and
                len(block[i])>33) and
           not (unws(block[i])=='Text proposed by the Commission' or
                unws(block[i]) in types)):
        i-=1
    if i>2:
        #if block[i].endswith(" Proposal for rejection"):
        #    pass # location will be possibly '-'
        seq=False
        if unws(block[i]) in ["Amendment", "Amendment by Parliament"]:
            # sequential format, search for preceeding original text
            j=i
            while (j>2 and not (unws(block[j]) in types or unws(block[j])=='Text proposed by the Commission')):
                j-=1
            if j>2: i=j
            seq=True; key='old'
        elif unws(block[i])=='Text proposed by the Commission' or block[i].strip() in types:
            seq=True; key='old'
        # throw headers
        del block[i]
        while i<len(block) and not unws(block[i]): del block[i]   # skip blank lines
        mid=max([len(x) for x in block])/2
        while i<len(block):
            if seq:
                if unws(block[i]) in ["Amendment", "Amendment by Parliament", "Text Amended"]:
                    key='new'
                    del block[i]
                    continue
                try: am[key].append(block[i])
                except KeyError: am[key]=[block[i]]
                del block[i]
                continue
            # only new, old is empty
            if block[i].startswith(' '):
                try: am['new'].append(unws(block[i]))
                except KeyError: am['new']=[unws(block[i])]
                del block[i]
                continue
            newstart = block[i].rstrip().rfind(' ')
            # only old, new is empty
            if newstart < 6:
                try: am['old'].append(unws(block[i]))
                except KeyError: am['old']=[unws(block[i])]
                del block[i]
                continue
            #mid=len(block[i])/2
            #mid=40
            lsep=block[i].rfind(' ', 0, mid)
            # todo calculate both, and use the one closer to the center
            rsep=block[i].find(' ', mid)
            sep=None
            if abs(lsep-mid)<abs(rsep-mid):
                if abs(lsep-mid)<15:
                    sep=lsep
            else:
                if abs(rsep-mid)<15:
                    sep=rsep
            if sep:
                try: am['old'].append(unws(block[i][:sep]))
                except KeyError: am['old']=[unws(block[i][:sep])]
                try: am['new'].append(unws(block[i][sep:]))
                except KeyError: am['new']=[unws(block[i][sep:])]
            else:
                # no sane split found
                #logger.warn("no split: %s %s\n%s" % (datetime.now().isoformat(),
                #                                     (sep, mid, len(block[i]), newstart, block[i]),
                #                                     block[i][mid-1:mid+2]))
                # fallback to naive splitting
                try: am['old'].append(unws(block[i][:newstart]))
                except KeyError: am['old']=[unws(block[i][:newstart])]
                try: am['new'].append(unws(block[i][newstart:]))
                except KeyError: am['new']=[unws(block[i][newstart:])]
            del block[i]
        strip(block)
    else:
        logger.warn("%s no table\n%s" % (datetime.now().isoformat(), '\n'.join(block[i:])))
        am['content']=block[i:]
        return am

    i=0
    # find end of authors
    while (i<len(block) and
           unws(block[i]) and
           not unws(block[i]).lower().startswith('compromise') and
           not istype(block[i]) and
           not unws(block[i]).split()[0] in locstarts):
        i+=1
    if i<len(block):
        if i>0:
            names=' '.join(block[:i])
            am['authors']=names
            #logger.info("names \n%s" % names)
            # convert to pt mep _ids
            for text in filter(None,splitNames(names)):
                mep=getMep(text,None,False)
                if mep:
                    try: am['meps'].append(mep['UserID'])
                    except KeyError: am['meps']=[mep['UserID']]
                else:
                    logger.info("fix %s" % text)
            del block[:i]
            strip(block)
        elif rapporteur:
            am['authors']=rapporteur
            for text in filter(None,splitNames(rapporteur)):
                mep=getMep(text,None,False)
                if mep:
                    try: am['meps'].append(mep['UserID'])
                    except KeyError: am['meps']=[mep['UserID']]
                else:
                    logger.info("fix %s" % text)
        else:
            logger.info("%s no authors in Amendment %s" % (datetime.now().isoformat(), am['seq']))
    else:
        logger.warn("%s no boundaries in Amendment %s\n%s" % (datetime.now().isoformat(), am['seq'], '\n'.join(block)))
        am['rest']=block
        return am

    # handle compromise info
    i=0
    while (i<len(block) and
           unws(block[i]) and
           not istype(block[i]) and
           not unws(block[i]).split()[0] in locstarts):
        i+=1
    if i<len(block) and i>0:
        am['compromise']=block[:i]
        del block[:i]
        strip(block)

    i=0
    while (i<len(block) and unws(block[i])):
        if unws(block[i]).split()[0] in locstarts:
            try: am['location'].append((' '.join(block[:i]),unws(block[i])))
            except KeyError: am['location']=[(' '.join(block[:i]),unws(block[i]))]
            del block[:i+1]
            i=0
        else:
            i+=1
    if len(block)>0 and ((len(block)==1 or not unws(block[1])) and unws(block[0])!='1' and 'location' in am):
        am['location'][-1]=(am['location'][-1][0],"%s %s" % (am['location'][-1][1],block[0]))
        del block[0]
        strip(block)

    if block:
        if not ((len(block)==3 and unws(block[0])=='1' and not unws(block[1]) and block[2].startswith(" ")) or
                (len(block)==2 and unws(block[0])=='1' and block[1].startswith(" "))):
            # ignore obvious footnotes
            logger.info("rest in Amendment %s\n%s" % (am['seq'],'\n'.join(block)))
    return am
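
# parse_block() above splits a two-column amendment line at the space nearest to the column
# centre (accepting it only within 15 characters), otherwise falling back to a naive split
# at the last space run. The helper below is a self-contained, hypothetical sketch of that
# heuristic (name split_columns and the simplifications are assumptions): `mid` is half the
# width of the widest line of the block, as computed in parse_block().
def split_columns(line, mid):
    lsep = line.rfind(' ', 0, mid)
    rsep = line.find(' ', mid)
    sep = lsep if abs(lsep - mid) < abs(rsep - mid) else rsep
    if abs(sep - mid) < 15:
        return line[:sep].rstrip(), line[sep:].strip()
    return None  # no sane split; parse_block then falls back to the naive split
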
def merge_events(events, committees, agents):
    bydate={}
    for event in events:
        if not event['date'] in bydate:
            bydate[event['date']]=[event]
        else:
            bydate[event['date']].append(event)
    #pprint.pprint(sorted([(k,[dict([(k1,v1) for k1,v1 in i.items() if k1!='text']) for i in v]) for k,v in bydate.items()]))
    res=[]
    # merge items to events.
    for date, items in bydate.items():
        actors={} # collects items/actor for a given date
        for item in items:
            if not item.get('body'):
                # find out body, or drop
                body=stage2inst.get(item.get('type'))
                if body:
                    item[u'body']=body
                elif item.get('type')=='Final act published in Official Journal':
                    # this really has no body or all
                    res.append(item)
                    continue
                else:
                    logger.warn('unknown body: %s' % item.get('type'))
                    item[u'body']='unknown'
            # new institution for this date
            if not item['body'] in actors:
                # new body for this date
                actors[item['body']]=item
                if 'doc' in actors[item['body']]:
                    docs=merge_new_docs(actors[item['body']]['doc'], item)
                    del actors[item['body']]['doc']
                    actors[item['body']][u'docs']=docs
                cmts=getCommittee(item,committees)
                if cmts:
                    actors[item['body']][u'committees']=sorted(cmts, key=itemgetter('committee'))
                if item['body']=='EC':
                    actors[u'EC'][u'commission']=sorted([{u'DG': x['dg'], u'Commissioner': x['commissioner']} if x.get('commissioner') else {u'DG': x['dg']}
                                                         for x in agents if x['body']=='EC'])
                continue
            # merge any docs
            if 'doc' in item:
                docs=merge_new_docs(item['doc'], item)
                for doc in docs:
                    skip=False
                    # update docs, that are already in there, but with a different 'type'
                    for cdoc in actors[item['body']].get('docs',[]):
                        if cdoc.get('url')==doc.get('url') or cdoc.get('title')==doc.get('title'):
                            cdoc.update(doc)
                            skip=True
                            break
                    if skip: continue
                    try: actors[item['body']][u'docs'].append(doc)
                    except KeyError: actors[item['body']][u'docs']=[doc]
                del item['doc']
            # merge any fields not yet in the actor
            actors[item['body']].update([(k,v) for k,v in item.items() if k not in actors[item['body']]])
        res.extend([x for x in actors.values() if x])
    #pprint.pprint(sorted(res, key=itemgetter('date')))
    #pprint.pprint(sorted([dict([(k1,v1) for k1,v1 in v.items() if k1!='text']) for v in res], key=itemgetter('date')))
    return res
def getactors(node):
    res={}
    ax=[None,[]]
    for row in node.xpath('.//tr'):
        cells=row.xpath('./td/p')
        if not cells: continue

        # get role Rapporteur|Responsible|Rapporteur for the opinion|Opinions
        role=cells[0].xpath('text()')
        if role and unws(role[0]):
            if ax[0] and ax[1]:
                res[ax[0]]=sorted(ax[1])
            tmp=unws(role[0])[:-1]
            if tmp=="Rapporteur for the opinion":
                tmp="Rapporteur"
            ax=[tmp,[]]

        tmp=unws((cells[1].xpath('text()') or [''])[0])
        if ax[0] in ["Rapporteur", "Rapporteur for the opinion"] and tmp:
            name=' '.join(tmp.split()[:-1])
            item={u'group': tmp.split()[-1][1:-1],
                  u'name': name,
                  u'mepref': getMEPRef(name) }
            if len(cells)>2:
                item[u'docs']=getdoclist(cells[2])
            ax[1].append(item)
            continue
        if ax[0] in ["Opinions", "Responsible"] and tmp:
            tmp1=tmp.split(u' –',1)
            if len(tmp1)==2:
                (comid, rest)=tmp1
            elif len(tmp1)==1:
                if len(tmp1[0])==4 and tmp1[0].isupper():
                    (comid, rest)=(tmp1[0],'')
                elif len(tmp1[0])>4 and tmp1[0][4] in ['-', u'–', u':', u'*'] and tmp1[0][:4].isupper():
                    (comid, rest)=(tmp1[0][:4],tmp1[0][5:])
                else:
                    skip=False
                    for com in tmp.split(', '):
                        if com in COMMITTEE_MAP and len(com)==4:
                            ax[1].append({u'comid': com})
                            skip=True
                    if skip:
                        continue
                    else:
                        logger.warn("[!] unknown committee: %s" % tmp)
                        raise
            if not comid:
                logger.warn("[!] unknown committee: %s" % tmp)
            item={u'comid': comid}
            if rest==' Decision: no opinion':
                item[u'response']=u'Decision: no opinion'
            if not rest and len(comid)>4:
                for com in comid.split(', '):
                    ax[1].append({u'comid': com})
                continue
            if len(cells)>2:
                tmp=unws((cells[2].xpath('text()') or [None])[0])
                if tmp:
                    name=' '.join(tmp.split()[:-1])
                    item.update({u'group': tmp.split()[-1][1:-1],
                                 u'name': name,
                                 u'mepref': getMEPRef(name)})
            if len(cells)>3:
                item[u'docs']=getdoclist(cells[3])
            ax[1].append(item)
    if ax[0] and ax[1]:
        res[ax[0]]=sorted(ax[1])
    return res
def scrape(url, comid):
    root=fetch(url)
    lines=[x for x in root.xpath('//td[@class="contents"]/div/*') if unws(' '.join(x.xpath('.//text()')))]
    if not len(lines): return
    if not unws(' '.join(lines[2].xpath('.//text()')))=='DRAFT AGENDA':
        logger.error("NOT DRAFT AGENDA %s" % unws(' '.join(lines[2].xpath('.//text()'))))
    agenda={u'committee': comid,
            u'committee_full': unws(' '.join(lines[0].xpath('.//text()'))),
            u'src': url,
            }
    i=1
    if unws(' '.join(lines[3].xpath('.//text()')))=="INTERPARLIAMENTARY COMMITTEE MEETING":
        logger.warn("skipping interparl com meet")
        return
    if unws(' '.join(lines[6].xpath('.//text()'))).startswith('Room'):
        agenda.update({u'docid': unws(' '.join(lines[1].xpath('.//text()'))),
                       u'type': unws(' '.join(lines[3].xpath('.//text()'))),
                       u'time': toTime(unws(' '.join(lines[4].xpath('.//text()')))),
                       u'city': unws(' '.join(lines[5].xpath('.//text()'))),
                       u'room': unws(' '.join(lines[6].xpath('.//text()')))[6:],
                       })
        i=7
    itemcnt=0
    item={}
    schedule=None
    res=[]
    while i < len(lines):
        line=lines[i]
        i+=1
        txt=unws(' '.join(line.xpath('.//text()')))
        if txt in ['* * *', '***']:
            continue # skip end of schedule block

        # 20 December 2011, 16.00 – 16.30
        tmp=toTime(txt)
        if tmp:
            schedule=tmp
            if i<len(lines) and unws(' '.join(lines[i].xpath('.//text()'))) == 'In camera':
                schedule[u'incamera']=True
                i+=1
            continue

        if line.tag=='div':
            item[u'actors']=getactors(line)
            continue
        firsttoken=txt.split()[0]
        # 6. Alternative dispute resolution for consumer disputes and
        #    amending Regulation (EC) No 2006/2004 and Directive
        #    2009/22/EC (Directive on consumer ADR)
        if firsttoken[-1]=='.' and firsttoken[:-1].isdigit() and itemcnt+1==int(firsttoken[:-1]):
            if item: res.append(item)
            itemcnt+=1
            item=copy.deepcopy(agenda)
            item.update({u'title': ' '.join(txt.split()[1:]),
                         u'seq_no': itemcnt,})
            if schedule:
                item.update(schedule)
            continue
        # trailing list of "details"
        # · Presentation by the Commission of the proposal & Impact Assessment
        # · Exchange of views
        if firsttoken==u"·":
            if not 'list' in item: item[u'list']=[]
            tmp=' '.join(txt.split()[1:])
            if tmp.startswith('Deadline for tabling amendments:'):
                try:
                    item[u'tabling_deadline']=datetime.strptime(tmp.split(':')[1].strip(),"%d %B %Y, %H.%M")
                except ValueError:
                    try:
                        item[u'tabling_deadline']=datetime.strptime(tmp.split(':')[1].strip(),"%d.%m.%Y at %H.%M")
                    except:
                        logger.warn('[$] unknown tabling deadline format %s' % unws(tmp))
            item[u'list'].append(tmp)
            continue
        # committee dossier
        # IMCO/7/08130
        if txt.startswith("%s/7/" % comid) and len(txt)==12:
            item[u'comdossier']=txt
            continue
        # ***I 2011/0373(COD) COM(2011)0793 – C7-0454/2011
        tmp=getdocs(txt)
        if tmp:
            item.update(tmp)
            continue
        # fall-through line
        logger.debug("(falltrough) %s %s" % (line.tag, txt.encode('utf8')))
    if item: res.append(item)
    return res
def parseMember(userid):
    url='http://www.europarl.europa.eu/meps/en/%s/get.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url)
    data = {u'active': True, 'meta': {u'url': url}} # return {'active': False}
    mepdiv=root.xpath('//div[@class="ep_elementpeople2"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[0]))
    data[u'Photo'] = unicode(urljoin(BASE_URL,mepdiv.xpath('.//span[@class="ep_img"]/img')[0].get('src')),'utf8')
    (d, p) = mepdiv.xpath('.//div[@class="ep_elementtext"]/p/text()')[0].split(',', 1)
    try:
        data[u'Birth'] = { u'date': datetime.strptime(unws(d), "Born on %d %B %Y"),
                           u'place': unws(p) }
    except ValueError:
        logger.warn('[!] failed to scrape birth data %s' % url)
        logger.warn(traceback.format_exc())
    const={u'country': unws(mepdiv.xpath('.//span[@class="ep_country"]/text()')[0])}
    data[u'Constituencies']=[const]
    try:
        const[u'party']=unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[1])
    except IndexError:
        data[u'active']=False
    else:
        group=unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[0])
        data[u'Groups'] = [{ u'role': unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[1]),
                             u'group': group,
                             u'groupid': group_map[group]}]
    cdiv=root.xpath('//div[@class="ep_elementcontact"]')
    if len(cdiv):
        addif(data,u'RSS',[unicode(urljoin(BASE_URL,x.get('href')),'utf8') for x in cdiv[0].xpath('.//li[@class="ep_rss"]//a')])
        addif(data,u'Homepage',[unicode(x.get('href'),'utf8') for x in cdiv[0].xpath('.//li[@class="ep_website"]//a')])
        addif(data,u'Mail',[decodemail(unws(x)) for x in cdiv[0].xpath('.//li[@class="ep_email"]//text()') if len(unws(x))])
    for span in root.xpath('//div[@id="contextzone"]//span[@class="ep_title"]'):
        title=unws(''.join(span.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            addif(data,title,[unws(x) for x in span.xpath('../../..//li/div/text()')])
    addif(data,u'Addresses',getAddress(root))
    for div in root.xpath('//div[@class="ep_content"]'):
        key=unws(u''.join(div.xpath('.//span[@class="ep_title"]/text()')))
        if not len(key):
            continue
        elif key.lower()=='curriculum vitae':
            data[u'CV'] = [unws(x) for x in div.xpath('.//div[@class="ep_elementtext"]//li/div/text()')]
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President']:
            for span in div.xpath('.//span[@class="commission_label"]'):
                item={u'role': key,
                      u'abbr': unws(''.join(span.xpath('text()'))),
                      u'Organization': unws(span.tail)}
                for start, field in orgmaps:
                    if item['Organization'].startswith(start):
                        if not field in data: data[field]=[]
                        if field=='Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id']=COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        else:
            logger.error('[!] unknown field %s' % key)
    return data
def scrape(celexid, path):
    logger.info("scraping %s%s:NOT" % (EURLEXURL, celexid))
    path.reverse()
    (code, lang) = celexid.split(":")[1:3]
    st = 6
    if len(code) > 6:
        if code[6].isalpha(): st = 7
        eurlex = {'id': {u'celexid': celexid,
                         u'sector': code[0],
                         u'year': code[1:5],
                         u'doctype': code[5:st],
                         u'refno': code[st:],
                         u'lang': lang,
                         }}
    else:
        eurlex = {'id': {u'celexid': celexid,
                         u'sector': code[0],
                         u'year': code[1:5],
                         u'doctype': code[5:6],
                         u'lang': lang,
                         }}
    try:
        eurlex['id'][u'typeDesc'] = CELEXCODES[code[0]]['Document Types'][code[5:st]] if code[5:st] != 'C' else CELEXCODES[code[0]]['Sector']
    except:
        eurlex['id'][u'typeDesc'] = u"Unknown"
        logger.warn("[!] unknown typedesc %s" % celexid)
    eurlex['meta'] = {u'src': "%s%s:NOT" % (EURLEXURL, celexid)}
    root = fetch("%s%s:NOT" % (EURLEXURL, celexid))
    if len(root.xpath('//h1[text()="No documents matching criteria."]')) > 0:
        logger.warn('[!] nothing to scrape here: %s', "%s%s:NOT" % (EURLEXURL, celexid))
        return
    eurlex[u'title'] = unws(root.xpath('//h2[text()="Title and reference"]/following-sibling::p/text()')[0])
    # dates
    dates = root.xpath('//h2[text()="Dates"]/following-sibling::ul/text()')
    for y in dates:
        if not unws(y): continue
        title, rest = unws(y).split(": ", 1)
        item = {}
        date = rest[:10]
        tail = rest[10:]
        if tail.startswith('; '):
            tail = tail[2:]
        if date == '99/99/9999': item[u'date'] = datetime(9999, 12, 31)
        elif date == '00/00/0000': item[u'date'] = datetime(0001, 01, 01)
        elif date == '//': continue
        else:
            try: item[u'date'] = datetime.strptime(date, u"%d/%m/%Y")
            except ValueError:
                try: item[u'date'] = datetime.strptime(date, u"%m/%d/%Y")
                except: pass
        if len(tail):
            item['note'] = tail
        try:
            eurlex['dates'][title] = item
        except:
            eurlex['dates'] = {title: item}
def scrape_epagents(table):
    heading=''.join(table.xpath('.//td[@class="players_committee"]')[0].xpath(".//text()")).strip()
    responsible=None
    if heading in [ "Committee responsible", "Former committee responsible"]:
        responsible=True
    elif heading in ["Committee for opinion", "Former committee for opinion"]:
        responsible=False
    else:
        logger.warn(u"[!] unknown committee heading %s" % heading)

    # handle shadows
    shadowelems=table.xpath('//a[@id="shadowRapporteurHeader"]/../following-sibling::div/p//span[@class="players_rapporter_text"]/a')
    tips=[t.xpath('text()')[0]
          if len(t.xpath('text()'))>0
          else groupurlmap[t.xpath("a")[0].get('href')]
          for t in table.xpath('//a[@id="shadowRapporteurHeader"]/../following-sibling::div//span[@class="tiptip"]')]
    shadows={}
    for shadow, group in izip_longest(shadowelems, tips):
        committee=shadow.xpath('./ancestor::td/preceding-sibling::td//acronym/text()')[0]
        if not committee in shadows: shadows[committee]=[]
        if group=='NI': group=u'NI'
        mep={u'name': unicode(shadow.xpath('text()')[0]),
             u'group': unicode(group)}
        tmp=getMEPRef(shadow.xpath('text()')[0])
        if tmp:
            mep[u'mepref']=tmp
        #else:
        #    raise IndexError
        shadows[committee].append(mep)
    # delete the uneccessary shadow elements - so the following regular lst2obj get's what it expects
    for todel in table.xpath('//a[@id="shadowRapporteurHeader"]/..'):
        parent=todel.xpath('..')[0]
        parent.remove(todel.xpath('following-sibling::div')[0])
        parent.remove(todel)

    # handle each row of agents
    agents=[]
    for agent in lst2obj(table,epagents,1):
        agent[u'responsible']=responsible
        agent[u'body']=u'EP'
        if agent.get('rapporteur'):
            meps=[]
            for mep in agent['rapporteur']:
                if unws(mep['name']).startswith("The committee decided not to give an opinion"):
                    del agent['rapporteur'][agent['rapporteur'].index(mep)]
                    agent[u'opinion']=None
                    continue
                tmp=getMEPRef(mep['name'])
                if tmp:
                    meps.append({u'mepref': tmp,
                                 u'group': mep['group'],
                                 u'name': mep['name']})
                else:
                    meps.append({u'group': mep['group'],
                                 u'name': mep['name']})
            agent[u'rapporteur']=meps

        abbr=agent['committee'][:4]
        if abbr=='BUDE': abbr='BUDG'
        if not abbr in COMMITTEE_MAP.keys():
            logger.warn(u"[!] uknown committee abbrev %s" % abbr)
            agent[u'committee_full']=agent['committee']
            if agent['committee'][4]==' ' and abbr.isalpha():
                agent[u'committee']=abbr
        else:
            agent[u'committee_full']=agent['committee'][5:]
            agent[u'committee']=abbr

        if agent.get(u'committee') in shadows.keys():
            agent[u'shadows']=shadows[agent['committee']]

        if not agent in agents: agents.append(agent)
    return agents