no = int(sheet.cell_value(rowx=row, colx=2)) abstained = int(sheet.cell_value(rowx=row, colx=3)) total = int(sheet.cell_value(rowx=row, colx=4)) votes_by_party_dict[party] = vote_stats_per_party_tuple(yes, no, abstained, total) sessions.append(session_tuple(description, time, None, votes_by_party_dict)) row += 1 return reg_by_party_dict, sessions ############################################################################## # Parse and save to disc. ############################################################################## logger_to_db = logging.getLogger('to_db') cur.execute("""SELECT original_url FROM stenograms""") urls_already_in_db = set(_[0] for _ in cur.fetchall()) stenogram_IDs = [(i, u'http://www.parliament.bg/bg/plenaryst/ns/7/ID/'+i) for i in map(str.strip, open('data/IDs_plenary_stenograms_41').readlines())] stenogram_IDs += [(i, u'http://www.parliament.bg/bg/plenaryst/ns/50/ID/'+i) for i in map(str.strip, open('data/IDs_plenary_stenograms_42').readlines())] for i, (ID, original_url) in enumerate(stenogram_IDs[-5:]): problem_by_name = False problem_by_party = False logger_to_db.info("Parsing stenogram %s - %d of %d." % (ID, i+1, len(stenogram_IDs))) try: f = urlopen(original_url) complete_stenogram_page = f.read().decode('utf-8') parser = StenogramsHTMLParser(complete_stenogram_page) date_string = parser.date.strftime('%d%m%y')
from pk_logging import logging from pk_tools import urlopen, canonical_party_name logger_mps = logging.getLogger("mps_data") names_list = [] forces_list = [] mails_list = [] url_list = [] # TODO hardcoded value: id of the first mp from the current assembly indices = map(int, open("data/IDs_MPs").readlines()) cur.execute("""SELECT original_url FROM mps""") urls_already_in_db = set(zip(*cur.fetchall())[0]) for i in range(835, max(indices) + 1): original_url = unicode("http://www.parliament.bg/bg/MP/%d" % i) if original_url in urls_already_in_db: continue logger_mps.info("Parsing data for MP id %s" % i) xml_file = unicode("http://www.parliament.bg/export.php/bg/xml/MP/%d" % i) xml_str = urlopen(xml_file).read() try: r = xmltodict.parse(xml_str) name = ( " ".join( [ r["schema"]["Profile"]["Names"]["FirstName"]["@value"], r["schema"]["Profile"]["Names"]["SirName"]["@value"],