def mp(id, term):
    """Parse MP from his profile webpage."""
    if term and term not in terms.keys():
        raise ValueError("unknown term '%s'" % term)

    url = 'http://www.nrsr.sk/web/Default.aspx?sid=poslanci/poslanec&PoslanecID=%s&CisObdobia=%s' % (id, term)
    content = scrapeutils.download(url)
    if 'Unexpected error!' in content:
        raise RuntimeError("MP with id '%s' does not exist in term '%s'" % (id, term))
    html = lxml.html.fromstring(content)

    result = {
        'id': str(id),
        'url': url
    }
    for div in html.findall('.//div[@class="mp_personal_data"]//div[strong]'):
        label = div.findtext('strong')
        value = div.find('span')
        result[label.lower()] = value.text_content() if value is not None else ''

    image_url = html.find('.//div[@class="mp_foto"]/img').get('src')
    image = requests.get(image_url).content
    with open(os.path.join(BASE_DIR, 'dummy-image.jpg'), 'rb') as f:
        dummy_image = f.read()
    result['fotka'] = image_url if image != dummy_image else ''

    result['členstvo'] = []
    ul = html.find('.//span[@id="_sectionLayoutContainer_ctl01_ctlClenstvoLabel"]').getparent().getnext()
    for li in ul.findall('li'):
        m = re.search(r'(.*?)\s*\((.*?)\)', li.text)
        result['členstvo'].append({'meno': m.group(1), 'rola': m.group(2)})

    return scrapeutils.plaintext(result)
def current_term():
    """Return the current term of office (the term pre-selected on the MPs page)."""
    url = 'http://www.nrsr.sk/web/default.aspx?sid=poslanci'
    content = scrapeutils.download(url)
    html = lxml.html.fromstring(content)
    option = html.find('.//select[@id="_sectionLayoutContainer_ctl01__currentTerm"]/option[@selected]')
    return option.get('value')
def debate_of_term1(id):
    """Parse a debate transcript in term 1 format and return list of its
    paragraphs' text content."""
    # download the debate transcript or use a local fixed debate if there is one
    filename = os.path.join('fixed_debates', 'debate_%s.html' % id)
    if os.path.exists(filename):
        with open(filename, 'r') as f:
            content = f.read()
    else:
        url = 'http://www.nrsr.sk/dl/Browser/Document?documentId=%s' % id
        content = scrapeutils.download(url)
        if 'Unexpected error!' in content:
            raise RuntimeError("Debate with id '%s' does not exist" % id)

    # fix markup and parse to HTML tree
    content = content.replace('12. 9. 1995<o:p></o:p>', '12. septembra 1995')
    content = content.replace('<o:p></o:p>', '')
    html = lxml.html.fromstring(content)

    # extract paragraph texts, use blank line as paragraph separator
    result = []
    text = ''
    for par in html.findall('.//p'):
        line = scrapeutils.plaintext(par.text_content())
        if len(line) > 0 and not re.match(r'\w+ deň rokovania', line):
            text += '\n%s' % line
        else:
            if text:
                result.append(scrapeutils.clear_hyphens(text, '\n'))
            text = line
    result.append(scrapeutils.clear_hyphens(text, '\n'))

    return scrapeutils.plaintext(result)
def speaker():
    """Parse current speaker (predseda) of the chamber."""
    url = 'http://www.nrsr.sk/web/default.aspx?sid=predseda'
    content = scrapeutils.download(url)
    html = lxml.html.fromstring(content)

    div = html.find(".//div[@id='_sectionLayoutContainer__panelContent']")
    result = {
        'url': url,
        'meno': div.find(".//h1").text_content(),
    }
    image = div.find('.//img')
    if image is not None:
        result['fotka'] = 'http://www.nrsr.sk/web/' + image.get('src')
    born = div.find("div[@class='article']")
    if born is not None:
        result['narodený'] = re.search(r'Narodený: (.*)', born.text_content()).group(1)
    bio = div.find('table')
    if bio is not None:
        result['životopis'] = lxml.html.tostring(bio, encoding='unicode', with_tail=False)

    return scrapeutils.plaintext(result)
def session_list(term=None):
    """Parse list of sessions in one term of office of the parliament."""
    if term and term not in terms.keys():
        raise ValueError("unknown term '%s'" % term)

    url = 'http://www.nrsr.sk/web/default.aspx?sid=schodze/hlasovanie/schodze'
    content = scrapeutils.download(url)
    html = lxml.html.fromstring(content)

    # scraping for older terms requires another POST request to emulate selectbox choice
    if term:
        data = {
            '_sectionLayoutContainer$ctl01$_termsCombo': term,
            '__VIEWSTATE': html.find('.//input[@id="__VIEWSTATE"]').get('value'),
            '__EVENTVALIDATION': html.find('.//input[@id="__EVENTVALIDATION"]').get('value'),
        }
        ext = '|%s' % term
        content = scrapeutils.download(url, 'POST', data, ext)
        html = lxml.html.fromstring(content)

    # pick list items
    result = {
        'url': url,
        '_items': []
    }
    for li in html.findall('.//div[@id="_sectionLayoutContainer__panelContent"]//ul//li'):
        a = li.find('a')
        link = a.get('href')
        session = {
            'číslo': re.search(r'CisSchodze=(\d+)', link).group(1),
            'názov': a.text,
            'trvanie': re.search(r'\((.+?)\)', li.text_content()).group(1),
            'url': 'http://www.nrsr.sk/web/' + link,
        }
        result['_items'].append(session)

    return scrapeutils.plaintext(result)
def mp_list(term=None):
    """Parse list of MPs."""
    if term and term not in terms.keys():
        raise ValueError("unknown term '%s'" % term)
    term = term or max(terms.keys())

    url = 'http://www.nrsr.sk/web/Default.aspx?sid=poslanci/zoznam_abc&ListType=0&CisObdobia=%s' % term
    content = scrapeutils.download(url)
    html = lxml.html.fromstring(content)

    result = {
        'url': url,
        '_items': [{
            'id': re.search(r'PoslanecID=(\d+)', mp.get('href')).group(1),
            'meno': mp.text,
        } for mp in html.findall('.//div[@class="mps_list"]//li/a')]
    }
    return scrapeutils.plaintext(result)
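# A minimal usage sketch, not part of the scraper itself: it only calls functions
# defined in this module and assumes the module-level `terms` dict and imports used
# above are available, and that current_term() returns a key of `terms`. Profile
# fields are read with .get() because their presence depends on the scraped page.
def _example_scrape_mps():
    term = current_term()
    for item in mp_list(term)['_items']:
        profile = mp(item['id'], term)
        print(item['meno'], profile.get('fotka', ''))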
def debate_of_terms56(id):
    """Parse a debate transcript in terms 5-6 format and return its structure."""
    # download the debate transcript
    url = 'http://tv.nrsr.sk/transcript?id=%s' % id
    content = scrapeutils.download(url)

    # parse to HTML tree
    html = lxml.html.fromstring(content)
    main_block = html.find('body/div')
    # guard against a main block with no text at all
    if not len(main_block) and not (main_block.text or '').strip():
        result = {'riadky': []}
    else:
        # parse headings and individual lines used as paragraphs
        main_content = lxml.html.tostring(main_block, encoding='unicode', with_tail=False)
        main_content = main_content[len('<div>'):-len('</div>')]
        main_content = main_content.replace('<p>', '').replace('</p>', '')
        result = {'riadky': re.split(r'<br\s*/?>', main_content)}

    return scrapeutils.plaintext(result)
def debate_of_terms234(id):
    """Parse a debate transcript in terms 2-4 format and return list of its
    paragraphs' text content."""
    # download RTF file or use a local fixed debate if there is one
    filename = os.path.join('fixed_debates', 'debate_%s.rtf' % id)
    if not os.path.exists(filename):
        url = 'http://www.nrsr.sk/dl/Browser/Document?documentId=%s' % id
        rtf = scrapeutils.download(url)
        filename = os.path.join(scrapeutils.WEBCACHE_PATH, 'debate_%s.rtf' % id)
        with open(filename, 'w') as f:
            f.write(rtf)

    # convert from RTF to HTML with unoconv (uses LibreOffice)
    content = subprocess.check_output(['unoconv', '-f', 'html', '--stdout', filename])
    html = lxml.html.fromstring(content)

    result = []
    for par in html.findall('./body/p'):
        result.append(par.text_content())
    return scrapeutils.plaintext(result)
def old_debates_list(term):
    """Parse list of debates for the given term of office from NRSR Digital Library.
    Appropriate for older terms (1.-4.) where debates are not split by speaker."""
    if term not in ['1', '2', '3', '4']:
        raise ValueError("Old style transcripts are not available for term '%s'" % term)

    base_url = 'http://www.nrsr.sk/dl/Browser/Grid?nodeType=DocType&legId=13&chamberId=0' + \
        '&categoryId=1&committeeId=0&documentTypeId=5&folderId=0&meetingNr=' + \
        '&termNr=%s' % term
    result = {
        'url': base_url,
        '_items': []
    }
    page = 0
    while True:
        url = base_url + '&pageIndex=%s' % page
        content = scrapeutils.download(url)
        html = lxml.html.fromstring(content)

        # extract all debates from the current page
        for tr in html.findall('.//table[@class="resultTable"]//tr'):
            sequence_number = tr.findtext('td[1]/a')
            title = tr.find('td[2]/a')
            doc_id = re.search(r'documentId=(\d+)', title.get('href'))
            debate = {
                'časť': sequence_number,
                'názov': title.text,
                'url': 'http://www.nrsr.sk' + title.get('href'),
                'id': doc_id.group(1)
            }
            result['_items'].append(debate)

        page += 1
        pages = html.findtext('.//div[@class="pager"]/span[last()]')
        if page >= int(pages):
            break

    return scrapeutils.plaintext(result)
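# An illustrative sketch combining the debate-list and transcript parsers above for the
# old-format terms: debate_of_term1() for term 1 and debate_of_terms234() for terms 2-4,
# following their docstrings. The default term value is an arbitrary example.
def _example_old_debates(term='2'):
    parse = debate_of_term1 if term == '1' else debate_of_terms234
    for item in old_debates_list(term)['_items']:
        paragraphs = parse(item['id'])
        print(item['časť'], item['názov'], len(paragraphs))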
def deputy_speakers():
    """Parse current deputy speakers (podpredsedovia) of the chamber."""
    url = 'http://www.nrsr.sk/web/default.aspx?sid=podpredsedovia'
    content = scrapeutils.download(url)
    html = lxml.html.fromstring(content)

    result = []
    for div in html.findall(".//div[@class='vicechairman_bigbox']"):
        name = div.find('.//a')
        link = name.get('href')
        id = re.search(r'PoslanecID=(\d+)', link)
        description = div.find(".//div[@class='vicechairman_description']")
        result.append({
            'fotka': 'http://www.nrsr.sk/web/' + div.find('.//img').get('src'),
            'meno': name.text,
            'url': 'http://www.nrsr.sk/web/' + link,
            'id': id.group(1),
            'kandidoval(a) za': description.find('div[1]/strong').tail,
            'narodený(á):': description.find('div[2]/strong').tail,
            'národnosť': description.find('div[3]/strong').tail,
        })

    return scrapeutils.plaintext(result)
def new_debates_list(term, since_date=None, until_date=None):
    """Parse list of debate parts for the given term of office from NRSR web.
    Appropriate for newer terms (since 5th) where split debates are available.
    If `since_date` or `until_date` is given in ISO format only the debate parts
    since/until that date are returned.
    """
    if term not in ['5', '6', '7']:
        raise ValueError("Parsed transcripts are not available for term '%s'" % term)

    url = 'http://www.nrsr.sk/web/Default.aspx?sid=schodze/rozprava'
    content = scrapeutils.download(url)
    html = lxml.html.fromstring(content)

    # a POST request to emulate choice of term in second selectbox and pressing the button
    data = {
        '_sectionLayoutContainer$ctl01$_termNr': term,
        '_sectionLayoutContainer$ctl01$_search': 'Vyhľadať',
        '__VIEWSTATE': html.find('.//input[@id="__VIEWSTATE"]').get('value'),
        '__EVENTVALIDATION': html.find('.//input[@id="__EVENTVALIDATION"]').get('value'),
    }
    base_ext = '|new|%s' % term
    if since_date:
        data['_sectionLayoutContainer$ctl01$_dateFrom$dateInput'] = since_date + '-00-00-00'
        base_ext += '|s%s' % since_date
    if until_date:
        data['_sectionLayoutContainer$ctl01$_dateTo$dateInput'] = until_date + '-00-00-00'
        base_ext += '|u%s' % until_date
    content = scrapeutils.download(url, 'POST', data, base_ext)
    html = lxml.html.fromstring(content)

    result = {
        'url': url,
        '_items': []
    }
    page = 1
    while True:
        # extract all debate parts from the current page
        for tr in html.findall('.//table[@id="_sectionLayoutContainer_ctl01__newDebate"]/tr'):
            if tr.get('class') in ('pager', 'tab_zoznam_header'):
                continue
            session_number = tr.find('td[1]')
            date = tr.find('td[2]')
            time_interval = tr.find('td[3]')
            time = re.search(r'(.*?) - (.*)', time_interval.text)
            part_type = time_interval.find('em')
            speaker = tr.find('td[4]')
            speaker_label = speaker.find('br').tail.strip('( ')
            debate_part = {
                'schôdza': session_number.text.replace('.', ''),
                'dátum': date.text,
                'trvanie': {'od': time.group(1), 'do': time.group(2)},
                'druh': part_type.text or '',
                'osoba': {'meno': speaker.findtext('strong'), 'funkcia': speaker_label}
            }
            speaker_link = speaker.find('a')
            if speaker_link is not None:
                speaker_url = speaker_link.get('href')
                id = re.search(r'PoslanecID=(\d+)', speaker_url)
                debate_part['osoba']['url'] = speaker_url
                debate_part['osoba']['id'] = id.group(1)

            for a in tr.findall('td[5]/a'):
                link = a.get('href')
                src = a.find('img').get('src')
                if 'speak' in src:
                    id = re.search(r'id=(\d+)', link)
                    debate_part['video'] = {'url': link, 'id': id.group(1)}
                elif 'all' in src:
                    debate_part['video_rokovania'] = {'url': link}
                elif 'rewrite' in src:
                    id = re.search(r'id=(\d+)', link)
                    debate_part['prepis'] = {'url': link, 'id': id.group(1)}
                else:
                    raise RuntimeError('Unrecognized link in section %s/%s/%s' %
                        (session_number.text, date.text, time_interval.text))

            result['_items'].append(debate_part)

        # test if there is a link to next page
        current_page = html.find('.//table[@id="_sectionLayoutContainer_ctl01__newDebate"]//tr[1]//span')
        if current_page is None:
            break
        next_page = current_page.getparent().getnext()
        if next_page is None:
            break

        page += 1
        # a POST request to emulate pager click
        data = {
            '__EVENTTARGET': '_sectionLayoutContainer$ctl01$_newDebate',
            '__EVENTARGUMENT': 'Page$%s' % page,
            '_sectionLayoutContainer$ctl01$_termNr': term,
            '__VIEWSTATE': html.find('.//input[@id="__VIEWSTATE"]').get('value'),
            '__EVENTVALIDATION': html.find('.//input[@id="__EVENTVALIDATION"]').get('value'),
        }
        ext = base_ext + '|%s' % page
        content = scrapeutils.download(url, 'POST', data, ext)
        html = lxml.html.fromstring(content)

    return scrapeutils.plaintext(result)
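# An illustrative sketch tying new_debates_list() to debate_of_terms56(): for every
# debate part that links a transcript ('prepis'), fetch its lines. The term and the
# date range are arbitrary example values, not anything mandated by the scraper.
def _example_new_debates():
    parts = new_debates_list('6', since_date='2013-01-01', until_date='2013-01-31')
    for part in parts['_items']:
        if 'prepis' in part:
            transcript = debate_of_terms56(part['prepis']['id'])
            print(part['dátum'], part['osoba']['meno'], len(transcript['riadky']))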
try:
    # get all senators by districts and all political groups
    baseurl = 'http://senat.cz/'
    people = []
    groups = {}
    iid = 100001

    ## people
    for i in range(1, 82):
        print(i)
        url = ("http://senat.cz/senat/volby/hledani/o_obvodu.php?ke_dni=" +
            time.strftime("%d") + "." + time.strftime("%m") + "." + time.strftime("%Y") +
            "&O=10&kod=" + str(i))
        # note: senat.cz incorrectly sends headers as iso/latin-1, so requests decodes
        # the content incorrectly - fixing it here
        domtree = html.fromstring(
            bytes(scrapeutils.download(url), 'iso-8859-1').decode('utf-8')
        )
        tables = domtree.xpath('//table[@class="tHistory"]')
        for table in tables:
            image = baseurl[:-1] + table.xpath('tr/td/img/@src')[0].replace('_110', '')
            name = full_name2name(table.xpath('tr/td/img/@alt')[0])
            ident = re.search(
                'par_3=(\d{1,})', table.xpath('tr/td/a/@href')[0]).group(1).strip()
            people.append({
                'given_name': name['given_name'],
                'family_name': name['family_name'],
                'name':
    p2id[p['name']] = p['id']

def pp2id(name, date, p2id):
    if name == 'Jiří Dienstbier':
        if date < '2011-01-08':
            return '218'
        else:
            return '253'
    else:
        return p2id[name]

scrapeutils.USE_WEBCACHE = False
url = "http://senat.cz/xqw/xervlet/pssenat/hlasa?S=&T=&H=&N=&K=&ID=275&Str=1&Poc=20000"
domtree = html.fromstring(scrapeutils.download(url))
scrapeutils.USE_WEBCACHE = True

def result2result(r):
    if r == 'přijato':
        return 'pass'
    else:
        return 'fail'

# motions, vote-events, votes:
def guess_majority(quorum, present):
    if int(quorum) == 49:
        return 'two-thirds representatives majority'
    if int(quorum) == 41 and int(present) < 81:
def motion(id):
    """Parse a motion/vote-event with individual votes cast by MPs."""
    url = 'http://www.nrsr.sk/web/Default.aspx?sid=schodze/hlasovanie/hlasklub&ID=%s' % id
    content = scrapeutils.download(url)
    if 'Unexpected error!' in content:
        raise RuntimeError("Motion with id '%s' does not exist" % id)
    html = lxml.html.fromstring(content)

    panel = html.find('.//div[@id="_sectionLayoutContainer__panelContent"]')
    motion = panel.find('.//div[@class="voting_stats_summary_full"]')
    session_link = motion.find('div[1]//a').get('href')
    counts = panel.find('.//div[@id="_sectionLayoutContainer_ctl01_ctl00__resultsTablePanel"]/div')

    result = {
        'url': url,
        'schôdza': {
            'číslo': re.search(r'CisSchodze=(\d+)', session_link).group(1),
            'obdobie': re.search(r'CisObdobia=(\d+)', session_link).group(1),
            'url': 'http://www.nrsr.sk/web/' + session_link,
        },
        'dátum': motion.findtext('div[2]/span'),
        'číslo': motion.findtext('div[3]/span'),
        'názov': motion.findtext('div[4]/span'),
    }
    res = motion.findtext('div[5]/span')
    if res:
        result['výsledok'] = res
    if counts is not None:
        result['súčty'] = {
            'prítomní': counts.findtext('div[1]/span'),
            'hlasujúcich': counts.findtext('div[2]/span'),
            '[z] za': counts.findtext('div[3]/span'),
            '[p] proti': counts.findtext('div[4]/span'),
            '[?] zdržalo sa': counts.findtext('div[5]/span'),
            '[n] nehlasovalo': counts.findtext('div[6]/span'),
            '[0] neprítomní': counts.findtext('div[7]/span'),
        }

    mps = panel.find('.//div[@id="_sectionLayoutContainer_ctl01__bodyPanel"]')
    if mps is not None:
        result['hlasy'] = []
        for td in mps.findall('.//td'):
            if td.get('class') == 'hpo_result_block_title':
                parl_group = td.text.strip()
            else:
                if not td.text:
                    continue
                vote = td.text[1].lower()
                a = td.find('a')
                family_name, _, given_name = a.text.partition(',')
                link = a.get('href')
                id = re.search(r'PoslanecID=(\d+)', link)
                mp = {
                    'meno': given_name.strip() + ' ' + family_name.strip(),
                    'klub': parl_group,
                    'hlas': vote,
                    'id': id.group(1),
                    'url': 'http://www.nrsr.sk/web/' + link
                }
                result['hlasy'].append(mp)

    related_docs = panel.findall('./ul/li[img]/a')
    if related_docs:
        result['dokumenty'] = [{
            'názov': a.text.strip(),
            'url': 'http://www.nrsr.sk/web/' + a.get('href')
        } for a in related_docs]

    return scrapeutils.plaintext(result)
def session(session_number, term=None):
    """Parse a session, i.e. the list of voted motions."""
    if term and term not in terms.keys():
        raise ValueError("unknown term '%s'" % term)
    term = term or max(terms.keys())
    if not session_number.isdigit() or int(session_number) == 0:
        raise ValueError("Invalid session number '%s'" % session_number)

    url = 'http://www.nrsr.sk/web/Default.aspx?sid=schodze/hlasovanie/vyhladavanie_vysledok' + \
        '&ZakZborID=13&CisObdobia=%s&CisSchodze=%s&ShowCisloSchodze=False' % \
        (term, session_number)
    content = scrapeutils.download(url)

    result = {
        'url': url,
        '_items': []
    }
    if 'V systéme nie sú evidované žiadne hlasovania vyhovujúce zadanej požiadavke.' in content:
        return result

    html = lxml.html.fromstring(content)
    page = 1
    while True:
        # extract all motions from the current page
        for tr in html.findall('.//table[@id="_sectionLayoutContainer_ctl01__resultGrid2"]/tr'):
            if tr.get('class') in ('pager', 'tab_zoznam_header'):
                continue
            date = tr.find('td[1]')
            vote_event = tr.find('td[2]/a')
            vote_event_link = vote_event.get('href')
            id = re.search(r'ID=(\d+)', vote_event_link)
            motion = {
                'dátum': date.text_content(),
                'číslo': vote_event.text_content(),
                'názov': tr.findtext('td[4]'),
                'id': id.group(1),
                'url': {
                    'výsledok': 'http://www.nrsr.sk/web/' + vote_event_link,
                }
            }
            object = tr.find('td[3]/a')
            if object is not None:
                motion['čpt'] = {
                    'číslo': object.text_content(),
                    'url': 'http://www.nrsr.sk/web/' + object.get('href')
                }
            vote_link2 = tr.find('td[5]/a').get('href')
            if vote_link2:
                motion['url']['kluby'] = 'http://www.nrsr.sk/web/' + vote_link2
            result['_items'].append(motion)

        current_page = html.find('.//table[@id="_sectionLayoutContainer_ctl01__resultGrid2"]/tr[1]//span')
        if current_page is None:
            break
        next_page = current_page.getparent().getnext()
        if next_page is None:
            break

        # POST request to emulate pager click
        page += 1
        data = {
            '__EVENTTARGET': '_sectionLayoutContainer$ctl01$_resultGrid2',
            '__EVENTARGUMENT': 'Page$%s' % page,
            '__VIEWSTATE': html.find('.//input[@id="__VIEWSTATE"]').get('value'),
            '__EVENTVALIDATION': html.find('.//input[@id="__EVENTVALIDATION"]').get('value'),
        }
        ext = '|%s|%s' % (term, page)
        content = scrapeutils.download(url, 'POST', data, ext)
        html = lxml.html.fromstring(content)

    return scrapeutils.plaintext(result)
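# A small usage sketch for the voting functions above: list the sessions of a term,
# fetch the motions voted in the first listed session and then the individual votes
# of the first motion. The default term is a hypothetical example value.
def _example_votes(term='6'):
    sessions = session_list(term)['_items']
    motions = session(sessions[0]['číslo'], term)['_items']
    if motions:
        detail = motion(motions[0]['id'])
        print(detail['názov'], len(detail.get('hlasy', [])))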
                update = False
                print("not updating: " + r['_items'][0]['id'])
        except:
            nothing = 0
        if update:
            vpapi.delete("memberships", r['_items'][0]['id'])
            self['id'] = r['_items'][0]['id']
            r = vpapi.post('memberships', self)
            print("updating: " + self['id'])
            # r = vpapi.put('memberships/%s' % r['_items'][0]['id'], self)
        if r['_status'] != 'OK':
            raise Exception(self.name, r)

zfile = scrapeutils.download('http://www.psp.cz/eknih/cdrom/opendata/poslanci.zip', zipped=True)
zarazeni = scrapeutils.zipfile2rows(zfile, 'zarazeni.unl')

from datetime import datetime

i = 0
for row in zarazeni:
    r_org = vpapi.get('organizations', where={'identifiers': {'$elemMatch': {"identifier": row[1].strip(), "scheme": "psp.cz/organy"}}})
    if len(r_org["_items"]) > 0:
        r_pers = vpapi.get('people', where={'identifiers': {'$elemMatch': {"identifier": row[0].strip(), "scheme": "psp.cz/osoby"}}})
        if len(r_pers["_items"]) > 0:
            membership = {
                "label": "Člen",
                "role": "member",
                "person_id": r_pers["_items"][0]['id'],
                "organization_id": r_org["_items"][0]['id'],
                # "id": str(i),
"vote_event_id": r_voteevent["_items"][0]["id"], } try: votes[r_voteevent["_items"][0]["id"]] except: votes[r_voteevent["_items"][0]["id"]] = [] votes[r_voteevent["_items"][0]["id"]].append(vote.copy()) # for k in votes: # vpapi.post("votes",votes[k]) vpapi.post("votes", votes) j = 0 for term in terms: print(term) zfile = scrapeutils.download("http://www.psp.cz/eknih/cdrom/opendata/hl-" + str(term) + "ps.zip", zipped=True) # hl_hlasovani = scrapeutils.zipfile2rows(zfile,'hl'+str(term)+'s.unl') for i in range(1, 4): print(i) try: hl_poslanec = scrapeutils.zipfile2rows(zfile, "hl" + str(term) + "h" + str(i) + ".unl") # savevotes(hl_poslanec) # savevotes(hl_poslanec) votes = {} votesli = [] voteevents = {} people = {} organizations = {} terms = {} for rowp in hl_poslanec: if rowp[1] in rosnicka_vote_events:
logname = datetime.utcnow().strftime('%Y-%m-%d-%H%M%S') + '.log'
logname = os.path.join(LOGS_DIR, logname)
logname = os.path.abspath(logname)
logging.basicConfig(level=logging.DEBUG, format='%(message)s',
    handlers=[logging.FileHandler(logname, 'w', 'utf-8')])
logging.getLogger('requests').setLevel(logging.ERROR)
logging.info('Started')

db_log = vpapi.post('logs', {'status': 'running', 'file': logname, 'params': []})

terms = [1993, 1996, 1998, 2002, 2006, 2010, 2013]
terms = [2013]
test = {}
#terms = [2010]
for term in terms:
    zfile = scrapeutils.download('http://www.psp.cz/eknih/cdrom/opendata/hl-'+str(term)+'ps.zip', zipped=True)
    hl_hlasovani = scrapeutils.zipfile2rows(zfile, 'hl'+str(term)+'s.unl')
    saveallmotionsandvoteevents(hl_hlasovani)

#j = 0
#last_ve_id = 0
#voteevents = {}
#people = {}
#organizations = {}
#for term in terms:
#    logging.info('Started year ' + str(term))
#    print('http://www.psp.cz/eknih/cdrom/opendata/hl-'+str(term)+'ps.zip')
#    zfile = scrapeutils.download('http://www.psp.cz/eknih/cdrom/opendata/hl-'+str(term)+'ps.zip', zipped=True)
#    #hl_hlasovani = scrapeutils.zipfile2rows(zfile,'hl'+str(term)+'s.unl')
def change_list(term=None):
    """Parse list of chamber membership changes."""
    term = term or max(terms.keys())
    if term not in terms.keys():
        raise ValueError("unknown term '%s'" % term)

    url = 'http://www.nrsr.sk/web/default.aspx?sid=poslanci/zmeny'
    content = scrapeutils.download(url)
    html = lxml.html.fromstring(content)

    # POST request to emulate term selection
    data = {
        '__EVENTTARGET': '_sectionLayoutContainer$ctl01$_currentTerm',
        '_sectionLayoutContainer$ctl01$_currentTerm': term,
        '__VIEWSTATE': html.find('.//input[@id="__VIEWSTATE"]').get('value'),
        '__EVENTVALIDATION': html.find('.//input[@id="__EVENTVALIDATION"]').get('value'),
    }
    ext = '|%s|1' % term
    content = scrapeutils.download(url, 'POST', data, ext)
    html = lxml.html.fromstring(content)

    result = {
        'url': url,
        '_items': []
    }
    page = 1
    while True:
        # extract all changes from the current page
        for tr in html.findall('.//table[@id="_sectionLayoutContainer_ctl01__ResultGrid2"]/tr'):
            if tr.get('class') in ('pager', 'tab_zoznam_header'):
                continue
            date = tr.findtext('td[1]')
            poslanec = tr.find('td[2]')
            text = re.search(r'(\S.*?)\s*\((.*?)\)', poslanec.text_content())
            link = poslanec.find('a').get('href')
            id = re.search(r'PoslanecID=(\d+)', link)
            result['_items'].append({
                'dátum': date,
                'poslanec': {
                    'meno': text.group(1),
                    'url': 'http://www.nrsr.sk/web/' + link,
                    'id': id.group(1),
                    'klub': text.group(2),
                },
                'zmena': tr.findtext('td[3]'),
                'dôvod': tr.findtext('td[4]'),
            })

        current_page = html.find('.//table[@id="_sectionLayoutContainer_ctl01__ResultGrid2"]/tr[1]//span')
        if current_page is None:
            break
        next_page = current_page.getparent().getnext()
        if next_page is None:
            break

        # POST request to emulate pager click
        page += 1
        data = {
            '__EVENTTARGET': '_sectionLayoutContainer$ctl01$_ResultGrid2',
            '_sectionLayoutContainer$ctl01$_currentTerm': term,
            '__EVENTARGUMENT': 'Page$%s' % page,
            '__VIEWSTATE': html.find('.//input[@id="__VIEWSTATE"]').get('value'),
            '__EVENTVALIDATION': html.find('.//input[@id="__EVENTVALIDATION"]').get('value'),
        }
        ext = '|%s|%s' % (term, page)
        content = scrapeutils.download(url, 'POST', data, ext)
        html = lxml.html.fromstring(content)

    return scrapeutils.plaintext(result)
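# A brief sketch of iterating the membership changes parsed above; it relies only on
# the structure built by change_list() itself. The default term is an example value.
def _example_changes(term='6'):
    for change in change_list(term)['_items']:
        print(change['dátum'], change['poslanec']['meno'], change['zmena'])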
def group(type, id):
    """Parse group of a given type (committee, parliamentary group, delegation,
    friendship group) from its profile webpage."""
    types = {
        'committee': {
            'url': 'http://www.nrsr.sk/web/Default.aspx?sid=vybory/vybor&ID=',
            'members_xpath': './/table[@class="tab_zoznam"]//tr',
            'name_xpath': 'td[1]/a/strong',
        },
        'parliamentary group': {
            'url': 'http://www.nrsr.sk/web/Default.aspx?sid=poslanci/kluby/klub&ID=',
            'members_xpath': './/table[@class="tab_zoznam"]//tr',
            'name_xpath': 'td[1]/a/strong',
        },
        'delegation': {
            'url': 'http://www.nrsr.sk/web/Default.aspx?sid=eu/delegacie/delegacia&ID=',
            'members_xpath': './/table[@class="tab_details"]//tr',
            'name_xpath': 'td[1]/strong/a',
        },
        'friendship group': {
            'url': 'http://www.nrsr.sk/web/Default.aspx?sid=eu/sp/sp&SkupinaId=',
            'members_xpath': './/table[@class="tab_details"]//tr',
            'name_xpath': 'td[1]/strong/a',
        },
    }
    if type not in types:
        raise ValueError("unknown type of group '%s'" % type)

    url = types[type]['url'] + str(id)
    content = scrapeutils.download(url)
    if 'Unexpected error!' in content:
        raise RuntimeError("group of type '%s' with id '%s' not found" % (type, id))
    content = content.replace('member_vez', 'member')  # exception in committee with id=119
    html = lxml.html.fromstring(content)

    result = {
        'id': str(id),
        'url': url
    }
    result['názov'] = html.findtext('.//h1')
    podnadpis = html.find('.//h2/span')
    if podnadpis is not None:
        p = podnadpis.text_content()
        if p not in ('', 'Zoznam členov'):
            result['podnadpis'] = p
    opis = html.find('.//span[@id="_sectionLayoutContainer_ctl01__basicInfoText"]')
    if opis is not None:
        result['opis'] = lxml.html.tostring(opis, encoding='unicode', with_tail=False)

    # current term and older terms are displayed differently
    if 'Zoznam členov' in content:
        # scraping current term - contact information and member cards
        for tr in html.findall('.//table[@class="tab_details"]//tr'):
            label = tr.findtext('td[1]/span')
            if not label:
                continue
            value = tr.find('td[2]/span')
            result[label.lower().rstrip('.:')] = value.text_content() if value is not None else ''
        other_docs = html.find('.//a[@id="_sectionLayoutContainer_ctl01__otherDocumentsLink"]')
        if other_docs is not None:
            result['ďalšie dokumenty'] = other_docs.get('href')

        result['členovia'] = []
        for div in html.findall('.//div[@class="member"]'):
            member = {
                'id': re.search(r'PoslanecID=(\d+)', div.find('.//a').get('href')).group(1),
                'fotka': 'http://www.nrsr.sk/web/' + div.find('.//img').get('src'),
                'meno': div.findtext('.//a/strong'),
                'obdobia': [{'rola': div.findtext('.//span[1]').lower()}],
            }
            if type != 'parliamentary group':
                member['klub'] = div.findtext('.//em')[1:-1]
                if member['klub'] in ('-', 'nie je členom poslaneckého klubu'):
                    member['klub'] = None
                elif not member['klub'].startswith('Klub '):
                    member['klub'] = 'Klub ' + member['klub']
            result['členovia'].append(member)
    else:
        # scraping older terms - list of members with membership roles and durations
        result['členovia'] = []
        for i, tr in enumerate(html.findall(types[type]['members_xpath'])):
            if type in ('parliamentary group', 'committee') and i < 2:
                continue
            member = {
                'id': re.search(r'PoslanecID=(\d+)', tr.find('td[1]//a').get('href')).group(1),
                'meno': tr.findtext(types[type]['name_xpath']),
                'obdobia': [],
            }
            for period in tr.findtext('td[2]').split(', '):
                membership = re.search(r'([^\(]*)\((.+?) - (.+?)\)', period, re.DOTALL)
                if membership:
                    member['obdobia'].append({
                        'rola': membership.group(1),
                        'od': membership.group(2),
                        'do': membership.group(3),
                    })
            result['členovia'].append(member)

    return scrapeutils.plaintext(result, ['opis'])
# scrape current representatives from praha.eu and save them in temp datafiles
import scrapeutils
from lxml import html, etree
import csv
import re

outfile = open('tempdata/current_people.csv', 'w')
outwriter = csv.writer(outfile, quoting=csv.QUOTE_NONNUMERIC)

url = "http://www.praha.eu/jnp/cz/o_meste/primator_a_volene_organy/zastupitelstvo/seznam_zastupitelu/index.html?size=100"
domtree = html.fromstring(scrapeutils.download(url))
trs = domtree.xpath('//tbody/tr')

# for each person
for tr in trs:
    row = []
    tds = tr.xpath('td')
    row.append(re.search('memberId=(\d{1,})', tds[0].xpath('a/@href')[0]).group(1).strip())
    row.append(tds[0].xpath('a')[0].text.strip())
    try:
        row.append(tds[1].text.strip())
    except:
        row.append('')
    row.append(tds[2].xpath('a')[0].text.strip())
    outwriter.writerow(row)

outfile.close()
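# A minimal read-back sketch for the temp datafile written above; the columns are
# exactly those the writer produces (member id, name and two further text columns
# from the listing table), nothing more is assumed about their meaning.
def read_current_people(path='tempdata/current_people.csv'):
    with open(path) as f:
        return list(csv.reader(f))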
def group_list(type, term=None):
    """Parse list of groups of a given type (committee, parliamentary group,
    delegation, friendship group)."""
    types = {
        'committee': {
            'url': 'http://www.nrsr.sk/web/default.aspx?SectionId=77',
            'term_param_name': '_sectionLayoutContainer$ctl02$_currentTerm',
        },
        'parliamentary group': {
            'url': 'http://www.nrsr.sk/web/default.aspx?SectionId=69',
            'term_param_name': '_sectionLayoutContainer$ctl02$_currentTerm',
        },
        'delegation': {
            'url': 'http://www.nrsr.sk/web/default.aspx?sid=eu/delegacie/zoznam',
            'term_param_name': '_sectionLayoutContainer$ctl01$_currentTerm',
        },
        'friendship group': {
            'url': 'http://www.nrsr.sk/web/default.aspx?sid=eu/sp/zoznam',
            'term_param_name': '_sectionLayoutContainer$ctl01$_currentTerm',
        },
    }
    if type not in types:
        raise ValueError("unknown type of group '%s'" % type)
    if term and term not in terms.keys():
        raise ValueError("unknown term '%s'" % term)

    content = scrapeutils.download(types[type]['url'])
    html = lxml.html.fromstring(content)

    # scraping for older terms requires another POST request to emulate selectbox choice
    if term:
        data = {
            types[type]['term_param_name']: term,
            '__VIEWSTATE': html.find('.//input[@id="__VIEWSTATE"]').get('value'),
            '__EVENTVALIDATION': html.find('.//input[@id="__EVENTVALIDATION"]').get('value'),
        }
        ext = '|%s' % term
        content = scrapeutils.download(types[type]['url'], 'POST', data, ext)
        html = lxml.html.fromstring(content)

    # pick list items
    result = {
        'url': types[type]['url'],
        '_items': []
    }
    for li in html.findall('.//ul[@class="longlist"]//li'):
        a = li.find('a')
        group = {
            'id': re.search(r'(ID|SkupinaId)=(\d+)', a.get('href')).group(2),
            'názov': a.text,
        }
        line = li.text_content()
        info = re.search(group['názov'] + r'\s*(\((.+?) - (.+?)\))?\s*(\S.*)?$', line, re.DOTALL)
        if info:
            if info.group(2):
                group['od'] = info.group(2)
                group['do'] = info.group(3)
            if info.group(4):
                group['poznámka'] = info.group(4)
        result['_items'].append(group)

    return scrapeutils.plaintext(result)
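# A hedged sketch combining group_list() and group(): list all committees of the
# current term and fetch the members of each; 'committee' is one of the group types
# handled above, no other values are assumed.
def _example_committees():
    for item in group_list('committee')['_items']:
        detail = group('committee', item['id'])
        print(detail['názov'], len(detail['členovia']))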
    }
    return name

# get all senators by districts and all political groups
baseurl = 'http://senat.cz/'
people = []
groups = {}
iid = 100001
for i in range(1, 82):
    print(i)
    url = "http://senat.cz/senat/volby/hledani/o_obvodu.php?ke_dni=23.02.2015&O=10&kod=" + str(i)
    # note: senat.cz incorrectly sends headers as iso/latin-1, so requests decodes
    # the content incorrectly - fixing it here
    domtree = html.fromstring(
        bytes(scrapeutils.download(url), 'iso-8859-1').decode('utf-8')
    )
    tables = domtree.xpath('//table[@class="tHistory"]')
    for table in tables:
        image = baseurl[:-1] + table.xpath('tr/td/img/@src')[0].replace('_110', '')
        name = full_name2name(table.xpath('tr/td/img/@alt')[0])
        ident = re.search('par_3=(\d{1,})', table.xpath('tr/td/a/@href')[0]).group(1).strip()
        people.append({
            'given_name': name['given_name'],
            'family_name': name['family_name'],
            'name': name['name'],