def getactivities(mepid, terms=[8]):
    # collect a MEP's activity lists from the paginated see_more.html JSON endpoint,
    # keyed by activity type (the keys of activitymap) and parliamentary term
    urltpl = 'http://www.europarl.europa.eu/meps/en/%s/see_more.html?type=%s&leg=%s&index=%s'
    #ctjson={'content-type': 'application/json'}
    actions = {}
    for type in activitymap.keys():
        actions[type] = {}
        for term in terms:
            term = str(term)
            actions[type][term] = []
            idx = 0
            while True:
                _url = urltpl % (mepid, type, term, idx)
                try:
                    res = fetch_raw(_url, ignore=[500])  #, headers=ctjson)
                except:
                    logger.warn("failed to fetch %s" % _url)
                    break
                if res is None:
                    break
                if '<h2>Error while collecting data</h2>' in res:
                    break
                ret = json.loads(res)
                actions[type][term].extend(ret['documentList'])
                idx = ret['nextIndex']
                if idx in [-1, 0]:
                    break
            if not actions[type][term]:
                del actions[type][term]
        if not actions[type]:
            del actions[type]
    return actions
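# Hypothetical usage sketch (not part of the original module): the mapping returned by
# getactivities() is keyed first by activity type, then by parliamentary term, each value
# holding the accumulated 'documentList' entries from the JSON endpoint.
def _dump_activity_counts(mepid, terms=(8, 9)):
    acts = getactivities(mepid, terms=list(terms))
    for acttype, by_term in acts.items():
        for term, docs in by_term.items():
            print("%s term %s: %d documents" % (acttype, term, len(docs)))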
def get_all_dossiers(**kwargs):
    # enumerate all dossiers per year via the OEIL search widget and queue a scraping job for each
    for year in range(datetime.date.today().year, 1971, -1):
        tree = fetch(
            'https://oeil.secure.europarl.europa.eu/oeil/widgets/resultwidget.do?lang=en&noHeader=false&q=objectReferenceN:N-%s/*\(*\)'
            % (year))
        tmp = tree.xpath(
            '//span[@class="ep_name" and (starts-with(normalize-space(),"Results found :") or starts-with(normalize-space(),"Result found :"))]/text()'
        )
        if not tmp:
            log(1, "no dossiers found for %d" % year)
            raise ValueError("failed to find number of dossiers for year %d" % year)
        tmp = unws(tmp[0])
        count = int(tmp[tmp.index(":") + 1:])
        log(4, "year %d, count %d" % (year, count))
        #tree=fetch('https://oeil.secure.europarl.europa.eu/oeil/popups/printresultlist.xml?q=objectReferenceN:N-%s/????\(*\)&lang=en&s1&all&limit=%s&lang=en'
        #           % (year, count), prune_xml=True)
        tree = fromstring(
            fetch_raw(
                'https://oeil.secure.europarl.europa.eu/oeil/popups/printresultlist.xml?q=objectReferenceN:N-%s/*\(*\)&lang=en&s1&all&limit=%s&lang=en'
                % (year, count)).encode("utf8"))
        items = tree.xpath('//item')
        i = 0
        for item in items:
            url = html.unescape(
                urljoin(BASE_URL, str(item.xpath('./link/text()')[0])))
            ref = unws(item.xpath('./reference/text()')[0])
            if '*' in ref:
                ref = ref[:ref.index('*')]
            log(4, 'adding dossier scraping job %s' % url)
            payload = dict(kwargs)
            payload['url'] = url
            add_job('dossier', payload=payload)
            i += 1
        if i != count:
            log(1, "total %d, expected %d" % (i, count))
def getraw(pdf):
    (fd, fname) = mkstemp()
    fd = os.fdopen(fd, 'wb')
    fd.write(fetch_raw(pdf).encode('utf-8'))
    fd.close()
    text = pdftotext('-nopgbrk', '-layout', fname, '-')
    os.unlink(fname)
    return text
def getXML(url):
    try:
        raw = fetch_raw(url, binary=True)
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 404:
            return None
        log(1, "failed to fetch xml from url: %s" % url)
        raise
    try:
        return fromstring(raw)
    except:
        log(1, "failed to parse xml from url: %s" % url)
        raise
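# Hypothetical usage sketch (URL and element name are assumptions, not taken from the module):
# getXML() maps a 404 to None, so callers can treat "document not published yet" as a normal
# condition and only propagate real fetch/parse failures.
def _count_elements(url, tag='RollCallVote.Result'):
    xml = getXML(url)
    if xml is None:
        return 0  # nothing published at this URL (yet)
    return len(xml.xpath('//%s' % tag))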
def getraw(pdf):
    log(5, "fetching url: %s" % pdf)
    (fd, fname) = mkstemp()
    fd = os.fdopen(fd, 'wb')
    fd.write(fetch_raw(pdf, binary=True))
    fd.close()
    text = pdftotext('-nopgbrk', '-layout', fname, '-')
    os.unlink(fname)
    return text
def crawl(year, term, **kwargs):
    # page through the RegistreWeb search API and queue a 'pvote' job for each
    # not-yet-seen roll-call vote (RCV) XML document of the given year and term
    url = 'https://www.europarl.europa.eu/RegistreWeb/services/search'
    params = {
        "references": [],
        "authors": [],
        "typesDoc": ["PPVD"],
        "eurovoc": None,
        "codeAuthor": None,
        "fulltext": None,
        "searchLanguages": ["EN"],
        "relations": [],
        "allAuthorities": [],
        "dateCriteria": {
            "field": "DATE_DOCU",
            "startDate": None,
            "endDate": None
        },
        "sortAndOrder": "DATE_DOCU_ASC"
    }
    params['year'] = str(year)
    params['leg'] = str(term)
    params["currentPage"] = 1
    params["nbRows"] = 10
    res = fetch_raw(url, asjson=params, res=True).json()
    while len(res.get('documents', [])) > 0:
        for d in res.get('documents'):
            if d.get("fragDocu") != "RCV":
                continue
            for f in d.get('formatDocs', []):
                if f.get('typeDoc', '') != 'text/xml':
                    continue
                if f['url'] in seen:  # 'seen' is a module-level set of already-queued URLs
                    continue
                seen.add(f['url'])
                payload = dict(kwargs)
                payload['url'] = f['url']
                #print(payload)
                add_job('pvote', payload=payload)
        params["currentPage"] += 1
        res = fetch_raw(url, asjson=params, res=True).json()
def _get(template, term, _date):
    url = template % (_date.strftime("%Y/%m-%d"), term, _date.strftime("(%Y)%m-%d"))
    try:
        raw = fetch_raw(url, binary=True)
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 404:
            return None, None
        log(1, "failed to fetch xml from url: %s" % url)
        raise
    try:
        xml = fromstring(raw)
    except:
        log(1, "failed to parse xml from url: %s" % url)
        raise
    return url, xml
def getraw(pdf):
    (fd, fname) = mkstemp()
    fd = os.fdopen(fd, 'wb')
    fd.write(fetch_raw(pdf, binary=True))
    fd.close()
    text = pdftotext(#'-nopgbrk',
                     '-layout',
                     #'-x', x,
                     #'-y', y,
                     #'-H', h,
                     #'-W', w,
                     fname,
                     '-')
    os.unlink(fname)
    # remove pagebreaks and footers
    return unpaginate(text, pdf)
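# unpaginate() is defined elsewhere in the repo; a minimal sketch of the idea
# ("remove pagebreaks and footers"), assuming pdftotext separates pages with
# form-feed characters, could look like this (illustration only, not the real code):
def _strip_pagebreaks(text):
    lines = []
    for page in text.split('\x0c'):          # one chunk per PDF page
        body = page.rstrip().split('\n')
        if body and body[-1].strip().isdigit():
            body = body[:-1]                  # drop a trailing bare page-number footer
        lines.extend(body)
    return '\n'.join(lines)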
def scrape(id, **kwargs):
    # we ignore the /meps/en/<id>/<name>/home path, since we can get all info also from other pages
    url = "http://www.europarl.europa.eu/meps/en/%s/name/cv" % id
    xml = fetch_raw(url)  # we have to patch up the returned html...
    xml = xml.replace("</br>", "<br/>")  # ...it contains some bad tags..
    root = fromstring(xml)  # ...which make the lxml soup parser drop some branches in the DOM
    sidebar_check(root, url)
    mep = {
        'UserID': id,
        'Name': mangleName(
            unws(' '.join(root.xpath('//span[@class="sln-member-name"]/text()'))), id),
        'Photo': "https://www.europarl.europa.eu/mepphoto/%s.jpg" % id,
        'meta': {'url': url},
        'Twitter': [
            unws(x.replace("http:// ", "")) for x in root.xpath(
                '//section[@id="presentationmep"]//a[@data-original-title="Twitter"]/@href')
        ],
        'Homepage': [
            unws(x.replace("http:// ", "")) for x in root.xpath(
                '//section[@id="presentationmep"]//a[@data-original-title="Website"]/@href')
        ],
        'Facebook': [
            unws(x.replace("http:// ", "")) for x in root.xpath(
                '//section[@id="presentationmep"]//a[@data-original-title="Facebook"]/@href')
        ],
        'Instagram': [
            unws(x.replace("http:// ", "")) for x in root.xpath(
                '//section[@id="presentationmep"]//a[@data-original-title="Instagram"]/@href')
        ],
        'Mail': [
            deobfus_mail(x) for x in root.xpath(
                '//section[@id="presentationmep"]//a[@data-original-title="E-mail"]/@href')
        ],
        'Addresses': parse_addr(root),
        'active': False,
    }
    mep = addchangednames(mep)

    birthdate = root.xpath('//time[@id="birthDate"]/text()')
    if len(birthdate) > 0:
        mep['Birth'] = {
            'date': datetime.strptime(unws(birthdate[0]), u"%d-%m-%Y")
        }
        place = root.xpath('//time[@id="birthDate"]/following-sibling::text()')
        if len(place) > 0:
            tmp = unws(' '.join(place))
            if tmp.startswith(", "):
                tmp = tmp[2:]
            mep['Birth']['place'] = tmp

    death = root.xpath('//time[@id="deathDate"]/text()')
    if death:
        mep['Death'] = datetime.strptime(unws(death[0]), u"%d-%m-%Y")

    body = root.xpath('//span[@id="detailedcardmep"]/following-sibling::section')[0]
    if body.xpath('.//h1[text()="Curriculum vitae "]'):
        if not body.xpath('.//h3[@id="no_cv_available"]'):
            mep['CV'] = {
                'updated': datetime.strptime(
                    unws(body.xpath(
                        './/p[@class="small"]/strong[contains(text(),"Updated: ")]/text()')[0]),
                    u"Updated: %d/%m/%Y")
            }
            mep['CV'].update({
                unws(''.join(title.xpath(".//text()"))): [
                    unws(''.join(item.xpath(".//text()"))).replace("-...", "- ...")
                    for item in title.xpath("following-sibling::ul/li")
                ]
                for title in body.xpath('.//h4')
                #if not unws(''.join(title.xpath(".//text()"))).startswith("Original version : ")
            })

    # assistants
    url = "http://www.europarl.europa.eu/meps/en/%s/name/assistants" % id
    root = fetch(url)
    body = root.xpath('//span[@id="detailedcardmep"]/following-sibling::section')[0]
    if unws(' '.join(body.xpath(".//h1/text()"))) == "Assistants":
        for h4 in body.xpath('.//h4'):
            title = unws(''.join(h4.xpath(".//text()")))
            assistants = [
                unws(''.join(item.xpath(".//text()")))
                for item in h4.xpath("../div//span")
            ]
            if title in ['Accredited assistants', 'Local assistants']:
                if not 'assistants' in mep:
                    mep['assistants'] = {}
                title = title.lower().split()[0]
                if assistants:
                    mep['assistants'][title] = assistants
            elif title in [
                    'Accredited assistants (grouping)', 'Local assistants (grouping)',
                    'Service providers', 'Trainees', 'Paying agents (grouping)',
                    'Paying agents',
                    'Assistants to the Vice-Presidency/to the Quaestorate'
            ]:
                if not 'assistants' in mep:
                    mep['assistants'] = {}
                title = title.lower()
                if assistants:
                    mep['assistants'][title] = assistants
            else:
                log(2, 'unknown title for assistants "{}" {}'.format(title, url))
                raise ValueError

    # declarations
    root = fetch("http://www.europarl.europa.eu/meps/en/%s/name/declarations" % id)
    body = root.xpath('//span[@id="detailedcardmep"]/following-sibling::section')[0]
    if unws(' '.join(body.xpath(".//h1/text()"))) == "Declarations":
        for title in body.xpath('.//h4'):
            key = unws(''.join(title.xpath('.//text()')))
            if key == 'Declaration of financial interests':
                key = 'Financial Declarations'
                mep[key] = []
                for pdf in title.xpath('./following-sibling::ul/li/a'):
                    url = pdf.xpath('./@href')[0]
                    try:
                        mep[key].append(findecl.scrape(url))
                    except:
                        log(1, "failed to extract findecl from %s" % url)
            elif key == 'Declarations of participation by Members in events organised by third parties':
                key = 'Declarations of Participation'
                mep[key] = []
                # reversed order, otherwise newer ones get prepended and mess up the diff
                for pdf in title.xpath('./following-sibling::ul/li/a')[::-1]:
                    url = pdf.xpath('./@href')[0]
                    name = unws(''.join(pdf.xpath('.//text()')))
                    mep[key].append({'title': name, 'url': url})
            elif key in [
                    'Declaration of good conduct',
                    'Voluntary confirmation on the use of the General Expenditure Allowance'
            ]:
                mep[key] = []
                # reversed order, otherwise newer ones get prepended and mess up the diff
                for pdf in title.xpath('./following-sibling::ul/li/a')[::-1]:
                    url = pdf.xpath('./@href')[0]
                    name = unws(''.join(pdf.xpath('.//text()')))
                    mep[key].append({'title': name, 'url': url})
            else:
                log(2,
                    'unknown type of declaration: "%s" http://www.europarl.europa.eu/meps/en/%s/name/declarations'
                    % (key, id))
                key = None
                raise ValueError

    # history
    parse_history(id, root, mep)
    process(mep, id, db.mep, 'ep_meps', mep['Name']['full'],
            nopreserve=(['Addresses'], ['assistants']), onchanged=onchanged)

    # when run stand-alone, return the scraped record for inspection; otherwise process() has stored it
    if __name__ == '__main__':
        return mep
    del mep