def scrape_epagents(table):
    """Build the list of European Parliament agents described by ``table``.

    ``table`` must support ``.xpath()`` and element removal (presumably an
    lxml element -- confirm against the caller).  The text of its
    ``players_committee`` cell decides whether the committees listed are
    responsible (``True``), giving an opinion (``False``) or unknown
    (``None``, after logging a warning).

    Shadow rapporteurs are collected first, grouped by committee acronym,
    and their markup is then removed so that ``lst2obj`` only sees the
    regular rows.  Each agent produced by ``lst2obj(table, epagents, 1)``
    is annotated with ``responsible``, ``body``, a normalised
    ``rapporteur`` list, ``committee``/``committee_full`` and, when the
    committee has any, ``shadows``.

    MEPs for which ``getMEPRef`` returns nothing are kept without a
    ``mepref`` key.

    Returns the list of agent dicts with duplicates dropped.

    NOTE(review): the XPath expressions starting with ``//`` are absolute,
    so they are evaluated against the whole document rather than only
    below ``table`` -- confirm this is intended.
    """
    heading = ''.join(
        table.xpath('.//td[@class="players_committee"]')[0].xpath(".//text()")
    ).strip()
    responsible = None
    if heading in ["Committee responsible", "Former committee responsible"]:
        responsible = True
    elif heading in ["Committee for opinion", "Former committee for opinion"]:
        responsible = False
    else:
        logger.warn(u"[!] unknown committee heading %s" % heading)

    # handle shadows
    shadowelems = table.xpath(
        '//a[@id="shadowRapporteurHeader"]/../following-sibling::div/p//span[@class="players_rapporter_text"]/a'
    )
    tips = []
    for tip in table.xpath(
        '//a[@id="shadowRapporteurHeader"]/../following-sibling::div//span[@class="tiptip"]'
    ):
        texts = tip.xpath('text()')
        # a tooltip without text of its own carries a link whose href is
        # looked up in groupurlmap instead
        tips.append(texts[0] if texts else groupurlmap[tip.xpath("a")[0].get('href')])

    shadows = {}
    for shadow, group in izip_longest(shadowelems, tips):
        if shadow is None:
            # izip_longest pads the shorter sequence with None: there are
            # more tooltips than shadow links and nothing to attach them to
            logger.warn(u"[!] more group tooltips than shadow rapporteurs, ignoring the surplus")
            break
        committee = shadow.xpath('./ancestor::td/preceding-sibling::td//acronym/text()')[0]
        name = shadow.xpath('text()')[0]
        mep = {
            u'name': unicode(name),
            # a missing tooltip must not be stored as the string u'None'
            u'group': unicode(group) if group is not None else None,
        }
        tmp = getMEPRef(name)
        if tmp:
            mep[u'mepref'] = tmp
        shadows.setdefault(committee, []).append(mep)

    # delete the unnecessary shadow elements - so the following regular
    # lst2obj gets what it expects
    for todel in table.xpath('//a[@id="shadowRapporteurHeader"]/..'):
        parent = todel.xpath('..')[0]
        parent.remove(todel.xpath('following-sibling::div')[0])
        parent.remove(todel)

    # handle each row of agents
    agents = []
    for agent in lst2obj(table, epagents, 1):
        agent[u'responsible'] = responsible
        agent[u'body'] = u'EP'
        if agent.get('rapporteur'):
            meps = []
            for mep in agent['rapporteur']:
                if unws(mep['name']).startswith("The committee decided not to give an opinion"):
                    # The list is replaced by `meps` below, so the entry is
                    # simply not copied; deleting it from the list being
                    # iterated would make the loop skip the next rapporteur.
                    agent[u'opinion'] = None
                    continue
                entry = {u'group': mep['group'], u'name': mep['name']}
                tmp = getMEPRef(mep['name'])
                if tmp:
                    entry[u'mepref'] = tmp
                meps.append(entry)
            agent[u'rapporteur'] = meps

        abbr = agent['committee'][:4]
        if abbr == 'BUDE':
            abbr = 'BUDG'
        if abbr not in COMMITTEE_MAP:
            logger.warn(u"[!] uknown committee abbrev %s" % abbr)
            agent[u'committee_full'] = agent['committee']
            # slice instead of index so a committee string shorter than
            # five characters does not raise IndexError
            if agent['committee'][4:5] == ' ' and abbr.isalpha():
                agent[u'committee'] = abbr
        else:
            agent[u'committee_full'] = agent['committee'][5:]
            agent[u'committee'] = abbr

        if agent.get(u'committee') in shadows:
            agent[u'shadows'] = shadows[agent['committee']]

        if agent not in agents:
            agents.append(agent)
    return agents
def scrape_epagents(table): heading = "".join(table.xpath('.//td[@class="players_committee"]')[0].xpath(".//text()")).strip() responsible = None if heading in ["Committee responsible", "Former committee responsible"]: responsible = True elif heading in ["Committee for opinion", "Former committee for opinion"]: responsible = False else: print "[!] unknown committee heading", heading # remove tooltips [tip.xpath("..")[0].remove(tip) for tip in table.xpath('.//span[@class="tiptip"]')] # handle shadows shadowelems = table.xpath( '//a[@id="shadowRapporteurHeader"]/../following-sibling::div/p//span[@class="players_rapporter_text"]/a' ) shadows = {} for shadow in shadowelems: committee = shadow.xpath("./ancestor::td/preceding-sibling::td//acronym/text()")[0] if not committee in shadows: shadows[committee] = [] mep = {u"name": shadow.xpath("text()")[0]} tmp = getMEPRef(shadow.xpath("text()")[0]) if tmp: mep[u"mepref"] = tmp else: raise IndexError shadows[committee].append(mep) # delete the uneccessary shadow elements - so the following regular lst2obj get's what it expects for todel in table.xpath('//a[@id="shadowRapporteurHeader"]/..'): parent = todel.xpath("..")[0] parent.remove(todel.xpath("following-sibling::div")[0]) parent.remove(todel) # handle each row of agents agents = [] for agent in lst2obj(table, epagents, 1): agent[u"responsible"] = responsible agent[u"body"] = "EP" if agent.get("rapporteur", [""])[0].strip().startswith("The committee decided not to give an opinion"): del agent["rapporteur"] agent[u"opinion"] = None elif agent.get("rapporteur"): meps = [] for mep in agent["rapporteur"]: tmp = getMEPRef(mep) if tmp: meps.append({u"mepref": tmp, u"name": mep}) else: raise IndexError agent[u"rapporteur"] = meps abbr = agent["committee"][:4] if not abbr in COMMITTEE_MAP.keys(): print "[!] 
uknown committee abbrev", abbr agent[u"committee_full"] = agent["committee"] del agent["committee"] else: agent[u"committee_full"] = agent["committee"][4:] agent[u"committee"] = abbr if agent.get(u"committee") in shadows.keys(): agent[u"shadows"] = shadows[agent["committee"]] if not agent in agents: agents.append(agent) return agents
def scrape_epagents(table): heading = ''.join( table.xpath('.//td[@class="players_committee"]')[0].xpath( ".//text()")).strip() responsible = None if heading in ["Committee responsible", "Former committee responsible"]: responsible = True elif heading in ["Committee for opinion", "Former committee for opinion"]: responsible = False else: print "[!] unknown committee heading", heading # remove tooltips [ tip.xpath('..')[0].remove(tip) for tip in table.xpath('.//span[@class="tiptip"]') ] # handle shadows shadowelems = table.xpath( '//a[@id="shadowRapporteurHeader"]/../following-sibling::div/p//span[@class="players_rapporter_text"]/a' ) shadows = {} for shadow in shadowelems: committee = shadow.xpath( './ancestor::td/preceding-sibling::td//acronym/text()')[0] if not committee in shadows: shadows[committee] = [] mep = {u'name': shadow.xpath('text()')[0]} tmp = getMEPRef(shadow.xpath('text()')[0]) if tmp: mep[u'mepref'] = tmp else: raise IndexError shadows[committee].append(mep) # delete the uneccessary shadow elements - so the following regular lst2obj get's what it expects for todel in table.xpath('//a[@id="shadowRapporteurHeader"]/..'): parent = todel.xpath('..')[0] parent.remove(todel.xpath('following-sibling::div')[0]) parent.remove(todel) # handle each row of agents agents = [] for agent in lst2obj(table, epagents, 1): agent[u'responsible'] = responsible agent[u'body'] = 'EP' if agent.get('rapporteur', [''])[0].strip().startswith( "The committee decided not to give an opinion"): del agent['rapporteur'] agent[u'opinion'] = None elif agent.get('rapporteur'): meps = [] for mep in agent['rapporteur']: tmp = getMEPRef(mep) if tmp: meps.append({u'mepref': tmp, u'name': mep}) else: raise IndexError agent[u'rapporteur'] = meps abbr = agent['committee'][:4] if not abbr in COMMITTEE_MAP.keys(): print "[!] 
uknown committee abbrev", abbr agent[u'committee_full'] = agent['committee'] del agent['committee'] else: agent[u'committee_full'] = agent['committee'][4:] agent[u'committee'] = abbr if agent.get(u'committee') in shadows.keys(): agent[u'shadows'] = shadows[agent['committee']] if not agent in agents: agents.append(agent) return agents
def scrape_epagents(table):
    """Build the list of European Parliament agents described by ``table``.

    ``table`` must support ``.xpath()`` and element removal (presumably an
    lxml element -- confirm against the caller).  The text of its
    ``players_committee`` cell decides whether the committees listed are
    responsible (``True``), giving an opinion (``False``) or unknown
    (``None``, after logging a warning).

    Shadow rapporteurs are paired positionally with the political-group
    tooltips, grouped by committee acronym, and their markup is removed so
    that ``lst2obj`` only sees the regular rows.  Each agent produced by
    ``lst2obj(table, epagents, 1)`` is annotated with ``responsible``,
    ``body``, resolved ``rapporteur`` entries,
    ``committee``/``committee_full`` and, when available, ``shadows``.

    Returns the list of agent dicts with duplicates dropped.

    Raises IndexError when ``getMEPRef`` cannot resolve a shadow
    rapporteur or a rapporteur.

    NOTE(review): the XPath expressions starting with ``//`` are absolute,
    so they are evaluated against the whole document rather than only
    below ``table`` -- confirm this is intended.
    """
    heading = ''.join(
        table.xpath('.//td[@class="players_committee"]')[0].xpath(
            ".//text()")).strip()
    responsible = None
    if heading in ["Committee responsible", "Former committee responsible"]:
        responsible = True
    elif heading in ["Committee for opinion", "Former committee for opinion"]:
        responsible = False
    else:
        logger.warn(u"[!] unknown committee heading %s" % heading)

    # handle shadows
    shadowelems = table.xpath(
        '//a[@id="shadowRapporteurHeader"]/../following-sibling::div/p//span[@class="players_rapporter_text"]/a'
    )
    tips = []
    for tip in table.xpath(
            '//a[@id="shadowRapporteurHeader"]/../following-sibling::div//span[@class="tiptip"]'
    ):
        texts = tip.xpath('text()')
        # a tooltip without text of its own carries a link whose href is
        # looked up in groupurlmap instead
        tips.append(
            texts[0] if texts else groupurlmap[tip.xpath("a")[0].get('href')])

    shadows = {}
    for shadow, group in izip_longest(shadowelems, tips):
        if shadow is None:
            # izip_longest pads the shorter sequence with None: there are
            # more tooltips than shadow links and nothing to attach them to
            logger.warn(
                u"[!] more group tooltips than shadow rapporteurs, ignoring the surplus"
            )
            break
        committee = shadow.xpath(
            './ancestor::td/preceding-sibling::td//acronym/text()')[0]
        if group == 'NI':
            group = u'NI'
        name = shadow.xpath('text()')[0]
        # group stays None when a shadow has no matching tooltip
        mep = {u'name': unicode(name), u'group': group}
        tmp = getMEPRef(name)
        if not tmp:
            raise IndexError(
                'no MEP reference found for shadow rapporteur %r' % (name,))
        mep[u'mepref'] = tmp
        shadows.setdefault(committee, []).append(mep)

    # delete the unnecessary shadow elements - so the following regular
    # lst2obj gets what it expects
    for todel in table.xpath('//a[@id="shadowRapporteurHeader"]/..'):
        parent = todel.xpath('..')[0]
        parent.remove(todel.xpath('following-sibling::div')[0])
        parent.remove(todel)

    # handle each row of agents
    agents = []
    for agent in lst2obj(table, epagents, 1):
        agent[u'responsible'] = responsible
        agent[u'body'] = u'EP'
        if agent.get('rapporteur'):
            meps = []
            for mep in agent['rapporteur']:
                if unws(mep['name']).startswith(
                        "The committee decided not to give an opinion"):
                    # The list is replaced by `meps` below, so the entry is
                    # simply not copied; deleting it from the list being
                    # iterated would make the loop skip the next rapporteur.
                    agent[u'opinion'] = None
                    continue
                tmp = getMEPRef(mep['name'])
                if not tmp:
                    raise IndexError(
                        'no MEP reference found for rapporteur %r' %
                        (mep['name'],))
                meps.append({
                    u'mepref': tmp,
                    u'group': mep['group'],
                    u'name': mep['name']
                })
            agent[u'rapporteur'] = meps

        abbr = agent['committee'][:4]
        if abbr == 'BUDE':
            abbr = 'BUDG'
        if abbr not in COMMITTEE_MAP:
            logger.warn(u"[!] uknown committee abbrev %s" % abbr)
            agent[u'committee_full'] = agent['committee']
            # slice instead of index so a committee string shorter than
            # five characters does not raise IndexError
            if agent['committee'][4:5] == ' ' and abbr.isalpha():
                agent[u'committee'] = abbr
        else:
            agent[u'committee_full'] = agent['committee'][5:]
            agent[u'committee'] = abbr

        if agent.get(u'committee') in shadows:
            agent[u'shadows'] = shadows[agent['committee']]

        if agent not in agents:
            agents.append(agent)
    return agents