def extract_table(table, url, date=None):
    """Parse one vote-result table (for / against / abstention) from a vote page.

    table -- lxml element of the <table>; the first row is the header
             (total count, vote type symbol), the remaining rows hold
             group names and comma-separated MEP names.
    url   -- source URL, used only in log messages.
    date  -- optional date forwarded to db.getMep() for name resolution.

    Returns {'total': int, 'type': '+'|'-'|'0', 'meps': [mepid-or-name, ...]}.
    Raises ValueError on a malformed table or an unexpected vote type.
    """
    trs = table.xpath('.//tr')
    header = trs[0]
    tds = header.xpath('.//td')
    if len(tds) < 2:
        log(1, "vote table has less than two columns in the header: %s %s" % (url, tostring(trs[0])))
        raise ValueError
    # renamed from `type` to avoid shadowing the builtin
    vote_type = junws(tds[1])
    if vote_type not in {"+", "-", "0"}:
        log(1, "vote header type is unexpected value %s in %s" % (repr(vote_type), url))
        raise ValueError
    res = {'total': int(junws(tds[0])), 'type': vote_type, 'meps': []}
    for tr in trs[1:]:
        tds = tr.xpath('.//td')
        if len(tds) < 2:
            log(1, "vote table has less than two columns in the body: %s %s" % (url, tostring(tr)))
            raise ValueError
        #grp = junws(tds[0]).split()
        # second column: one or more <p> elements, each holding a
        # comma-separated list of MEP names
        for names in tds[1].xpath(".//p"):
            names = junws(names)
            if not names:
                continue
            for m in names.split(','):
                m = unws(m)
                if not m:
                    continue
                mepid = db.getMep(m, date=date)
                if not mepid:
                    log(2, "could not resolve MEP name: %s" % m)
                # fall back to the raw name when the MEP could not be resolved
                res['meps'].append(mepid or m)
    return res
def extract_proc(table, url):
    """Extract title/value pairs from the procedure table of a vote page.

    table -- list of lxml table elements; only the first is used.
    url   -- source URL, used only in log messages.

    Returns a dict mapping row titles to row values; rows with an empty
    title or value are skipped.
    Raises ValueError when no table element was found.
    """
    res = {}
    if len(table) < 1:
        # fix: url was passed as a separate argument instead of being
        # %-formatted into the message, unlike every other log() call here
        log(1, "could not find procedure table in %s" % url)
        raise ValueError
    for tr in table[0].xpath('.//tr'):
        tds = tr.xpath('.//td')
        if len(tds) < 2:
            # robustness: skip rows without the expected two cells
            # (previously an IndexError on malformed/header rows)
            continue
        title = junws(tds[0])
        val = junws(tds[1])
        if not title or not val:
            continue
        res[title] = val
    return res
def crawl(term, update=False, test=None, **kwargs):
    """Crawl the EP committee document search for amendment (AMCO) PDFs.

    term   -- parliamentary term number to search in.
    update -- when True, stop after the first result page per committee
              (incremental-update mode).
    test   -- optional iterable of committee abbreviations to restrict the
              crawl to; in test mode payloads are printed instead of queued.
              (default fixed from a mutable [] to None -- same falsy behavior)
    kwargs -- extra fields merged into each job payload.

    Found URLs are queued as 'amendment' jobs via add_job(); already-seen,
    too-short, skip-listed and non-English URLs are filtered out.
    """
    seen = set()
    url = "https://www.europarl.europa.eu/committees/en/documents/search?committeeMnemoCode=%s&textualSearchMode=TITLE&textualSearch=&documentTypeCode=AMCO&reporterPersId=&procedureYear=&procedureNum=&procedureCodeType=&peNumber=&aNumber=&aNumberYear=&documentDateFrom=&documentDateTo=&meetingDateFrom=&meetingDateTo=&performSearch=true&term=%s&page=%s&pageSize={}".format(itemsPerPage)
    # only 4-letter keys are committee abbreviations
    for com in (k for k in test or COMMITTEE_MAP.keys() if len(k) == 4):
        i = 0
        log(3, 'crawling %s, term: %s' % (com, term))
        try:
            root = fetch(url % (com, term, i))
        except requests.exceptions.HTTPError as e:
            #if e.response.status_code == 500:
            log(3, "failed to get list of amendments for %s in term %d, http error code: %s" % (com, term, e.response.status_code))
            continue
        prev = []
        while True:
            log(3, "crawling amendments search page %s for %s term %s" % (i, com, term))
            tmp = []
            for a in root.xpath('//a[@class="erpl_document-subtitle-pdf"]'):
                u = a.get('href', '')
                if len(u) <= 13:
                    log(2, 'url is too short, skipping: "%s"' % u)
                    continue
                if u in seen or u in skipurls or (not u.endswith('EN') and not u.endswith('_EN.pdf')):
                    log(3, "skipping url: %s" % repr(u))
                    continue
                seen.add(u)
                tmp.append(u)
                # author names live in a sibling div next to the pdf link
                rs = a.xpath('../../following-sibling::div/span[@class="erpl_document-subtitle-author"]')
                r = [y for y in [junws(x) for x in rs] if y]
                try:
                    payload = dict(kwargs)
                    payload['url'] = u
                    payload['meps'] = r
                    if test:
                        print(payload)
                    else:
                        add_job('amendment', payload=payload)
                except Exception:
                    # fix: was a bare except; keep the best-effort fallback
                    print(u, r)
            # stop on an empty or repeated page, or a short (i.e. last) page
            if not tmp or prev == tmp or len(tmp) < itemsPerPage:
                break
            prev = tmp
            if update:
                break
            i += 1
            try:
                root = fetch(url % (com, term, i))
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 500:
                    log(3, "failed to page %s of draft agendas for %s in term %d" % (i, com, term))
                break
def scrape(url, **kwargs):
    """Scrape roll-call vote results from a plenary-votes XML document.

    url -- location of a PV.RollCallVoteResults XML file.

    Each vote is persisted via process() and collected into the returned
    list of vote dicts; returns None when the XML could not be fetched.
    Raises ValueError when a vote carries neither an Identifier nor a
    Number attribute.
    """
    log(3, "scraping %s" % (url))
    root = getXML(url)
    if root is None:
        # fix: url was passed as a separate argument instead of being
        # %-formatted into the message, unlike the other log() calls here
        log(1, "could not get votes for %s" % url)
        return # angrily o/
    log(3, "processing plenary votes xml from %s" % url)
    # root is:
    #PV.RollCallVoteResults EP.Number="PE 533.923" EP.Reference="P7_PV(2014)04-17" Sitting.Date="2014-04-17" Sitting.Identifier="1598443"
    votes = []
    for vote in root.xpath('//RollCallVote.Result'):
        # hrmpf, some EP seriously used the braindead Y-d-m format sometimes in vote timestamps :/
        time = vote.get('Date')
        if len(time.split()) == 2:
            ts = datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
        else:
            ts = datetime.strptime(time, "%Y-%m-%d")
        tmp = vote.get('Identifier')
        if tmp:
            voteid = int(tmp)
        else:
            # no Identifier attribute: synthesize a *string* id from the
            # timestamp and the Number attribute
            tmp = vote.get('Number')
            if not tmp:
                log(1, "blimey, could not deduce an id for the vote in %s" % url)
                raise ValueError("no id for vote in %s" % url)
            voteid = "%s-%s" % (ts, tmp)
        title = vote.xpath("RollCallVote.Description.Text")
        if len(title) != 1:
            # fix: voteid may be a synthesized string (see above), so the
            # original %d placeholder would raise TypeError; use %s
            log(2, "holy ambiguity Batman! This vote doesn't have one title, but %d: %s %s" % (len(title), voteid, url))
            title = "!unknown!"
        else:
            title = junws(title[0])
        v = {u"ts": ts, u"url": url, u"voteid": voteid, u"title": title, 'votes': {}}
        v.update(votemeta(v['title'], v['ts']))
        if 'epref' not in v:
            ref = vote.xpath("RollCallVote.Description.Text/a/text()")
            if ref:
                v['epref'] = unws(ref[0])
        # renamed loop var from `type` to avoid shadowing the builtin
        for rtag, stype in [('Result.For', '+'), ('Result.Against', '-'), ('Result.Abstention', '0')]:
            result = vote.xpath(rtag)
            if not result:
                continue
            if len(result) > 1:
                # fix: v['voteid'] may be a string, so %d would raise; use %s
                log(2, "[pff] more than one %s entry in vote (id:%s) in %s" % (stype, v['voteid'], url))
            result = result[0]
            v['votes'][stype] = {'total': int(result.get('Number')), 'groups': {}}
            for group in result.xpath('Result.PoliticalGroup.List'):
                g = str(group.get('Identifier'))
                if not g in v['votes'][stype]['groups']:
                    v['votes'][stype]['groups'][g] = []
                for tag in ['Member.Name', 'PoliticalGroup.Member.Name']:
                    for mep in group.xpath(tag):
                        m = {}
                        name = junws(mep)
                        mepid = mep.get("PersId")
                        if mepid:
                            mepid = int(mepid)
                        else:
                            # no PersId attribute: resolve by name and group
                            mepid = db.getMep(name, v['ts'], abbr=g)
                        if mepid:
                            m['mepid'] = mepid
                            #if int(mep.get('MepId')) in ambiguous_meps:
                            #    oid = int(mep.get('MepId'))
                            #    ambiguous_meps.remove(oid)
                            #    log(2,'found mepid for previously ambigous obscure_id: "%s": %s' % (oid, mepid))
                        else:
                            # last resort: manually curated mapping of lost MEPs
                            mepid = lost_meps.get(mep.get('MepId'))
                            if mepid:
                                m['mepid'] = mepid
                            else:
                                m['name'] = name
                                m['obscure_id'] = int(mep.get('MepId')) # it's a totally useless and confusing id that is nowhere else used
                        v['votes'][stype]['groups'][g].append(m)
        # save
        process(v, v['voteid'], db.vote, 'ep_votes', v['title'])
        votes.append(v)
    return votes