def getComAms(leg=TERM, update=False):
    """Yield (url, rapporteur_text) for committee amendment/report/opinion docs.

    Crawls the EP committee document search for every regular 4-letter
    committee in COMMITTEE_MAP (special/temporary bodies skipped), for the
    doc types AMCO, RPCD and OPCD, paging until an empty or repeated page.

    :param leg: legislature number posted to the search form (default TERM)
    :param update: if True, stop after the first result page per committee
    :yields: (href, rapporteur_text) — rapporteur_text is None for AMCO docs
    """
    urltpl = "http://www.europarl.europa.eu/committees/en/%s/search-in-documents.html"
    # todo add to searchRPCD, OPCD
    for doctype in ['AMCO', 'RPCD', 'OPCD']:
        postdata = "clean=false&leg=%s&docType=%s&miType=text" % (leg, doctype)
        # FIX: pagination must hit the same search-in-documents.html endpoint
        # as the initial query above; the old documents-search.html path is
        # stale here (the sibling getComAgendas migrated both templates).
        nexttpl = "http://www.europarl.europa.eu/committees/en/%s/search-in-documents.html?action=%s&tabActif=tabResult#sidesForm"
        for com in (k for k in COMMITTEE_MAP.keys()
                    if len(k) == 4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP', 'SURE', 'CRIM', 'CRIS']):
            url = urltpl % (com)
            i = 0
            logger.info('%s %s crawling %s' % (datetime.now().isoformat(), doctype, com))
            root = fetch(url, params=postdata)
            prev = []
            while True:
                logger.info("%s %s" % (datetime.now().isoformat(), url))
                # map PDF href -> rapporteur names (None for amendments);
                # hrefs of <=13 chars are navigation artifacts, not documents
                tmp = {a.get('href'): (' '.join(a.xpath('../../../p[@class="rapporteurs"]//text()'))
                                       if doctype != 'AMCO' else None)
                       for a in root.xpath('//a[@title="open this PDF in a new window"]')
                       if len(a.get('href', '')) > 13}
                # stop on an empty page or when the site repeats the last page
                if not tmp or prev == tmp:
                    break
                prev = tmp
                for u, v in sorted(tmp.items()):
                    if db.ep_ams.find_one({'src': u}):
                        continue  # already scraped
                    yield u, v
                if update:
                    break  # incremental run: first page per committee only
                i += 1
                url = nexttpl % (com, i)
                root = fetch(url)
def getComAgendas():
    """Yield (url, committee_abbrev) for DRAFT AGENDA documents (legislature 8).

    Crawls the EP committee document search for every regular 4-letter
    committee in COMMITTEE_MAP (special/temporary bodies skipped), paging
    until an empty or repeated result page, and yields only entries whose
    title starts with 'DRAFT AGENDA'.
    """
    urltpl = "http://www.europarl.europa.eu/committees/en/%s/search-in-documents.html"
    postdata = "docType=AGEN&leg=8&miType=text&tabActif=tabResult#sidesForm"
    nexttpl = "http://www.europarl.europa.eu/committees/en/%s/search-in-documents.html?action=%s&tabActif=tabResult#sidesForm"
    for com in (k for k in COMMITTEE_MAP.keys()
                if len(k) == 4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP', 'SURE', 'CRIM', 'CRIS']):
        url = urltpl % (com)
        i = 0
        logger.info('scraping %s' % com)
        root = fetch(url, params=postdata)
        prev = []
        while True:
            logger.info("%s %s" % (datetime.now().isoformat(), url))
            # (href, title) for each result link; hrefs of <=13 chars are
            # navigation artifacts, not documents
            tmp = [(a.get('href'), unws(a.xpath('text()')[0]))
                   for a in root.xpath('//p[@class="title"]/a')
                   if len(a.get('href', '')) > 13]
            # stop on an empty page or when the site repeats the last page
            if not tmp or prev == tmp:
                break
            prev = tmp
            for u, title in tmp:
                if title.startswith('DRAFT AGENDA'):
                    yield (u, com)
            i += 1
            url = nexttpl % (com, i)
            root = fetch(url)
def getComAms(leg=7, update=False):
    """Yield (url, rapporteur_text) for committee amendment/report/opinion docs.

    Legacy (legislature-7) crawler over the documents-search endpoint.
    rapporteur_text is None for AMCO documents; URLs already present in
    db.ep_ams are skipped.
    """
    urltpl = "http://www.europarl.europa.eu/committees/en/%s/documents-search.html"
    skipped = ['CODE', 'RETT', 'CLIM', 'TDIP']
    # todo add to searchRPCD, OPCD
    for doctype in ['AMCO', 'RPCD', 'OPCD']:
        postdata = "clean=false&leg=%s&docType=%s&miType=text" % (leg, doctype)
        nexttpl = "http://www.europarl.europa.eu/committees/en/%s/documents-search.html?action=%s&tabActif=tabResult#sidesForm"
        committees = [k for k in COMMITTEE_MAP.keys() if len(k) == 4 and k not in skipped]
        for com in committees:
            url = urltpl % (com)
            page = 0
            logger.info('%s %s crawling %s' % (datetime.now().isoformat(), doctype, com))
            root = fetch(url, params=postdata)
            seen = []
            while True:
                logger.info("%s %s" % (datetime.now().isoformat(), url))
                #logger.info(tostring(root))
                # collect PDF href -> rapporteur names for this result page
                links = {}
                for anchor in root.xpath('//a[@title="open this PDF in a new window"]'):
                    if len(anchor.get('href', '')) <= 13:
                        continue  # navigation artifact, not a document
                    if doctype == 'AMCO':
                        names = None
                    else:
                        names = ' '.join(anchor.xpath('../../../p[@class="rapporteurs"]//text()'))
                    links[anchor.get('href')] = names
                # empty page or a repeat of the previous page ends this committee
                if not links or seen == links:
                    break
                seen = links
                for u, v in sorted(links.items()):
                    if db.ep_ams.find_one({'src': u}):
                        continue  # already scraped
                    yield u, v
                if update:
                    break  # incremental run: first page only
                page += 1
                url = nexttpl % (com, page)
                root = fetch(url)
def getComAgendas():
    """Yield (url, committee_abbrev) for committee agenda documents (legislature 7).

    Crawls the EP committee document search for every regular 4-letter
    committee in COMMITTEE_MAP (special/temporary bodies skipped), paging
    via startValue offsets of 10 until an empty or repeated page.
    """
    urltpl = "http://www.europarl.europa.eu/committees/en/%s/documents-search.html?&docType=AGEN&leg=7&miType=text"
    nexttpl = "http://www.europarl.europa.eu/committees/en/%s/documents-search.html?tabActif=tabLast&startValue=%s"
    for com in (k for k in COMMITTEE_MAP.keys()
                if len(k) == 4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP']):
        url = urltpl % (com)
        i = 0
        logger.info('scraping %s' % com)
        prev = []
        while True:
            logger.info("crawling %s" % (url))
            root = fetch(url)
            # (href, title) for each result link; hrefs of <=13 chars are
            # navigation artifacts, not documents
            tmp = [(a.get('href'), unws(a.xpath('text()')[0]))
                   for a in root.xpath('//p[@class="title"]/a')
                   if len(a.get('href', '')) > 13]
            # FIX: also stop when the site keeps serving the same (last) page
            # for an out-of-range startValue — otherwise this loops forever;
            # same guard the newer getComAgendas variant uses.
            if not tmp or prev == tmp:
                break
            prev = tmp
            for u, _ in tmp:
                yield (u, com)
            i += 10
            url = nexttpl % (com, i)
def getComAgendas():
    """Yield (url, committee_abbrev) for committee agenda documents.

    Crawls the EP committee document search (legislature 7, docType AGEN)
    for every regular 4-letter committee in COMMITTEE_MAP, paging through
    results 10 at a time until an empty result page.
    """
    urltpl = "http://www.europarl.europa.eu/committees/en/%s/documents-search.html?&docType=AGEN&leg=7&miType=text"
    # pagination endpoint; startValue is the offset of the first result shown
    nexttpl = "http://www.europarl.europa.eu/committees/en/%s/documents-search.html?tabActif=tabLast&startValue=%s"
    # only real 4-letter committees; skip special/temporary bodies
    for com in (k for k in COMMITTEE_MAP.keys() if len(k) == 4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP']):
        url = urltpl % (com)
        i = 0
        agendas = []  # NOTE(review): never read afterwards — candidate for removal
        logger.info('scraping %s' % com)
        while True:
            logger.info("crawling %s" % (url))
            root = fetch(url)
            # (href, title) for each result link; hrefs of <=13 chars are
            # navigation artifacts, not documents
            tmp = [(a.get('href'), unws(a.xpath('text()')[0])) for a in root.xpath('//p[@class="title"]/a') if len(a.get('href', '')) > 13]
            if not tmp:
                break  # empty result page: no more documents for this committee
            for u, _ in tmp:
                yield (u, com)
            i += 10
            url = nexttpl % (com, i)
def scrape_epagents(table):
    """Extract committee 'agent' records (rapporteurs, shadows) from an lxml table.

    Parses one committee-players table of a dossier page: classifies the
    heading as responsible/opinion, harvests shadow rapporteurs (removing
    their DOM nodes so the subsequent lst2obj pass sees the layout it
    expects), then normalizes each agent row produced by lst2obj.
    Returns a list of agent dicts.
    """
    heading=''.join(table.xpath('.//td[@class="players_committee"]')[0].xpath(".//text()")).strip()
    responsible=None
    if heading in [ "Committee responsible", "Former committee responsible"]:
        responsible=True
    elif heading in ["Committee for opinion", "Former committee for opinion"]:
        responsible=False
    else:
        # unknown heading: responsible stays None and is still stored below
        logger.warn(u"[!] unknown committee heading %s" % heading)

    # handle shadows
    # shadow rapporteur <a> elements, and for each one a group "tip" —
    # plain text if present, otherwise a group resolved from the tooltip's
    # link href or image src via groupurlmap
    shadowelems=table.xpath('//a[@id="shadowRapporteurHeader"]/../following-sibling::div/p//span[@class="players_rapporter_text"]/a')
    tips=[t.xpath('text()')[0]
          if len(t.xpath('text()'))>0
          else groupurlmap[t.xpath("a")[0].get('href')]
               if len(t.xpath("a"))>0
               else groupurlmap[t.xpath("img")[0].get('src')]
          for t in table.xpath('//a[@id="shadowRapporteurHeader"]/../following-sibling::div//span[@class="tiptip"]')]
    shadows={}
    # NOTE(review): izip_longest pads the shorter sequence with None; if tips
    # outnumber shadowelems the shadow.xpath call below would fail — assumes
    # the page always pairs them 1:1 (or tips run short). TODO confirm.
    for shadow, group in izip_longest(shadowelems, tips):
        # committee acronym is in a preceding cell of the same table row
        committee=shadow.xpath('./ancestor::td/preceding-sibling::td//acronym/text()')[0]
        if not committee in shadows: shadows[committee]=[]
        if group=='NI': group=u'NI'
        mep={u'name': unicode(shadow.xpath('text()')[0]), u'group': unicode(group)}
        tmp=getMEPRef(shadow.xpath('text()')[0])
        if tmp:
            mep[u'mepref']=tmp
        #else:
        #    raise IndexError
        shadows[committee].append(mep)
    # delete the uneccessary shadow elements - so the following regular lst2obj get's what it expects
    for todel in table.xpath('//a[@id="shadowRapporteurHeader"]/..'):
        parent=todel.xpath('..')[0]
        parent.remove(todel.xpath('following-sibling::div')[0])
        parent.remove(todel)

    # handle each row of agents
    agents=[]
    for agent in lst2obj(table,epagents,1):
        agent[u'responsible']=responsible
        agent[u'body']=u'EP'
        if agent.get('rapporteur'):
            meps=[]
            for mep in agent['rapporteur']:
                # "decided not to give an opinion" is a placeholder row, not a
                # person: drop it and mark the opinion as absent.
                # NOTE(review): deletes from the list being iterated — relies
                # on there being at most one such placeholder row per agent
                if unws(mep['name']).startswith("The committee decided not to give an opinion"):
                    del agent['rapporteur'][agent['rapporteur'].index(mep)]
                    agent[u'opinion']=None
                    continue
                tmp=getMEPRef(mep['name'])
                if tmp:
                    meps.append({u'mepref': tmp, u'group': mep['group'], u'name': mep['name']})
                else:
                    meps.append({u'group': mep['group'], u'name': mep['name']})
            agent[u'rapporteur']=meps
        # normalize the committee field: first 4 chars are the abbreviation
        abbr=agent['committee'][:4]
        if abbr=='BUDE': abbr='BUDG'  # site typo/alias for the budget committee
        if not abbr in COMMITTEE_MAP.keys():
            logger.warn(u"[!] uknown committee abbrev %s" % abbr)
        agent[u'committee_full']=agent['committee']
        # NOTE(review): raises IndexError if 'committee' is shorter than
        # 5 chars — assumes the scraped string is always "ABBR Full name"
        if agent['committee'][4]==' ' and abbr.isalpha():
            agent[u'committee']=abbr
        else:
            # abbreviation prefix not cleanly separated: keep only the
            # remainder as the full name
            agent[u'committee_full']=agent['committee'][5:]
            agent[u'committee']=abbr
        if agent.get(u'committee') in shadows.keys():
            agent[u'shadows']=shadows[agent['committee']]
        if not agent in agents:
            agents.append(agent)
    return agents