import datetime
import re

import bs4
import xlrd

from pk_db import db, cur
from pk_logging import logging, logger_workaround
from pk_namedtuples import *
from pk_tools import urlopen, canonical_party_name

##############################################################################
# HTML Parsing
##############################################################################
logger_html = logging.getLogger('html_parser')


class StenogramsHTMLParser(bs4.BeautifulSoup):
    # Parser for one parliamentary stenogram page.  Subclasses
    # bs4.BeautifulSoup, so the full parsed document tree is `self`.
    def __init__(self, text):
        # `text`: raw HTML of a stenogram page.
        super(StenogramsHTMLParser, self).__init__(text)
        # Session date, read from the page's 'dateclass' div (dd/mm/yyyy).
        self.date = datetime.datetime.strptime(
            self.find('div', class_='dateclass').string.strip(),
            '%d/%m/%Y')
        # All non-empty text fragments of the stenogram body, in order.
        self.data_list = list(
            self.find('div', class_='markcontent').stripped_strings)
        self.votes_indices = []  # filled in elsewhere, presumably — empty here
        # The marker regex must permit a number of spelling errors that can
        # be present in the stenograms ("Гласували N народни представители:").
        how_many_have_voted_marker = u'Гласувал[и]?[ ]*\d*[ ]*народни[ ]*представители:'
        # NOTE(review): `return i` inside __init__ raises TypeError at runtime
        # whenever the marker matches (__init__ must return None).  This loop
        # most likely belonged to a separate helper function/method whose
        # `def` line was lost when this chunk was mangled — confirm against
        # the original source before relying on this reconstruction.
        for i, l in enumerate(self.data_list):
            if re.search(how_many_have_voted_marker, l):
                return i
        # Reached only when no vote-count marker exists in the stenogram.
        raise ValueError

##############################################################################
# Load templates.
##############################################################################
# NOTE(review): TemplateLookup is not brought in by any visible import —
# presumably `from mako.lookup import TemplateLookup` lives in a part of the
# original file outside this chunk; verify.
templates = TemplateLookup(directories=["mako_templates"],
                           input_encoding="utf-8",
                           output_encoding="utf-8",
                           strict_undefined=True)

##############################################################################
# Prepare loggers.
##############################################################################
# NOTE(review): this rebinds `logger_html` (bound to 'html_parser' above), so
# the first logger handle becomes unreachable — consistent with two separate
# modules having been concatenated into this chunk; confirm intent.
logger_html = logging.getLogger("static_html_gen")

##############################################################################
# Set up sitemap.
##############################################################################
class Sitemap(object):
    # Accumulates (loc, priority, images) tuples and holds the XML format
    # strings used to render sitemap.xml for parlamentaren-kontrol.com.
    def __init__(self):
        # %s receives the concatenated <url> entries.
        self.base_string = '<?xml version="1.0" encoding="UTF-8"?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">\n%s\n</urlset>'
        # Placeholders: relative URL, priority (one decimal place), images.
        self.url_string = "<url><loc>http://www.parlamentaren-kontrol.com/%s</loc><priority>%0.1f</priority>%s</url>"
        # Placeholders: image location, image caption.
        self.image_string = "<image:image><image:loc>http://www.parlamentaren-kontrol.com/%s</image:loc><image:caption>%s</image:caption></image:image>"
        self.content_tuples = []  # list of (loc, priority, images) tuples

    def add(self, loc, priority, images=[]):
        # Register one page (and optional images) for the sitemap.
        # NOTE(review): mutable default argument — the same list object is
        # shared by every call that omits `images`.  Harmless while nothing
        # mutates it (it is only stored here), but `images=None` with a
        # fresh list per call would be safer.
        self.content_tuples.append((loc, priority, images))
# -*- coding: utf-8 -*- import xml import xmltodict from pk_db import db, cur from pk_logging import logging from pk_tools import urlopen, canonical_party_name logger_mps = logging.getLogger("mps_data") names_list = [] forces_list = [] mails_list = [] url_list = [] # TODO hardcoded value: id of the first mp from the current assembly indices = map(int, open("data/IDs_MPs").readlines()) cur.execute("""SELECT original_url FROM mps""") urls_already_in_db = set(zip(*cur.fetchall())[0]) for i in range(835, max(indices) + 1): original_url = unicode("http://www.parliament.bg/bg/MP/%d" % i) if original_url in urls_already_in_db: continue logger_mps.info("Parsing data for MP id %s" % i) xml_file = unicode("http://www.parliament.bg/export.php/bg/xml/MP/%d" % i) xml_str = urlopen(xml_file).read() try:
    # (continuation — the opening of this dict literal, which maps Bulgarian
    # bill-status phrases to internal status codes, lies before this chunk)
    u'отхвърлен(зала второ четене)': 'rejected_2nd',
    u'оттеглен от вносителя(оттеглен)': 'retracted',
    u'наложено вето(вето президент)': 'vetoed',
    u'внесен(преразглеждане зала (след вето))': 'proposed_after_veto',
    u'повторно приемане(преразглеждане зала (след вето))': 'accepted_after_veto',
    # TODO the next few are unclear in their definition (raise a warning)
    u'оспорени текстове(преразглеждане зала (след вето))': 'challenged_after_veto',
    u'оспорен по принцип(преразглеждане зала (след вето))': 'challenged_after_veto',
    #u'обсъждане(зала първо четене)': 'proposed_1st', see signature 002-02-50
}

##############################################################################
# Gather bills.
##############################################################################
logger_html_bills = logging.getLogger('html_parser_bills')

# URLs of bills already stored, so their pages are not fetched again.
origurlcur = db.cursor()
origurlcur.execute("""SELECT original_url FROM bills""")
urls_already_in_db = set(u[0] for u in origurlcur)

logger_html_bills.info('Opening calendar.')
base_url = 'http://www.parliament.bg'
parser_calendar = bs4.BeautifulSoup(urlopen(base_url + '/bg/bills/').read())
# Each <a> inside the calendar div links one month page of submitted bills;
# the href's last path component is 'YYYY-M'.
for month in parser_calendar.find('div', id='calendar').find_all('a'):
    href = month.get('href')
    y, m = map(int, href.split('/')[-1].split('-'))
    if y < 2009 or (y == 2009 and m < 7):
        continue  # XXX hardcoded check (only last parliament)
    logger_html_bills.info('Opening calendar %d %d.' % (y, m))
    month_page = bs4.BeautifulSoup(urlopen(base_url + href).read())
    # NOTE(review): chunk is truncated here — the body of this inner loop
    # (one iteration per bill link in the month view) lies outside the
    # visible source.
    for a in month_page.find('div', id='monthview').find_all('a'):