# --- Chunk: scrape MP (Member of Parliament) profiles from parliament.bg ---
# NOTE(review): Python 2 code (`unicode`, list-returning `map`, `zip(...)[0]`).
# NOTE(review): this chunk is cut off mid-`try` — the matching `except` for the
# parse below lies outside the visible lines.
forces_list = []
mails_list = []
url_list = []


# TODO hardcoded value: id of the first mp from the current assembly
# Under Python 2 `map` returns a list, so `max(indices)` below is valid.
# NOTE(review): the file handle is never closed — consider `with open(...)`.
indices = map(int, open("data/IDs_MPs").readlines())
# Skip MPs whose profile URL is already stored in the `mps` table.
cur.execute("""SELECT original_url FROM mps""")
# Python 2 only: `zip(...)` returns a list, so `[0]` selects the first column.
urls_already_in_db = set(zip(*cur.fetchall())[0])
for i in range(835, max(indices) + 1):
    original_url = unicode("http://www.parliament.bg/bg/MP/%d" % i)
    if original_url in urls_already_in_db:
        continue
    logger_mps.info("Parsing data for MP id %s" % i)
    # Parliament's XML export endpoint for a single MP profile.
    xml_file = unicode("http://www.parliament.bg/export.php/bg/xml/MP/%d" % i)
    xml_str = urlopen(xml_file).read()
    try:
        r = xmltodict.parse(xml_str)
        # Full name = first + middle ("SirName") + family name, upper-cased.
        name = (
            " ".join(
                [
                    r["schema"]["Profile"]["Names"]["FirstName"]["@value"],
                    r["schema"]["Profile"]["Names"]["SirName"]["@value"],
                    r["schema"]["Profile"]["Names"]["FamilyName"]["@value"],
                ]
            )
            .encode("UTF-8")
            .upper()
            .strip()
        )
        # Political force with its trailing token dropped — presumably an
        # abbreviation suffix; TODO confirm against the XML export format.
        force = " ".join(r["schema"]["Profile"]["PoliticalForce"]["@value"].split(" ")[:-1])
# --- Chunk: download and parse plenary stenograms ---
logger_to_db = logging.getLogger('to_db')


# Skip stenograms whose URL is already stored in the `stenograms` table.
cur.execute("""SELECT original_url FROM stenograms""")
urls_already_in_db = set(_[0] for _ in cur.fetchall())
# Build (ID, URL) pairs for the 41st (ns/7) and 42nd (ns/50) National
# Assembly stenogram lists.
# NOTE(review): the two ID files are never closed — consider `with open(...)`.
stenogram_IDs = [(i, u'http://www.parliament.bg/bg/plenaryst/ns/7/ID/'+i)
                 for i in map(str.strip, open('data/IDs_plenary_stenograms_41').readlines())]
stenogram_IDs += [(i, u'http://www.parliament.bg/bg/plenaryst/ns/50/ID/'+i)
                  for i in map(str.strip, open('data/IDs_plenary_stenograms_42').readlines())]
# NOTE(review): only the LAST FIVE stenograms are processed ([-5:]), yet the
# progress message reports len(stenogram_IDs) as the total — looks like a
# leftover debug/incremental-run limit; confirm intent.
for i, (ID, original_url) in enumerate(stenogram_IDs[-5:]):
    problem_by_name = False
    problem_by_party = False
    logger_to_db.info("Parsing stenogram %s - %d of %d." % (ID, i+1, len(stenogram_IDs)))

    try:
        f = urlopen(original_url)
        complete_stenogram_page = f.read().decode('utf-8')
        parser = StenogramsHTMLParser(complete_stenogram_page)
        # ddmmyy stamp used below to locate the per-session votes .xls file.
        date_string = parser.date.strftime('%d%m%y')
    except Exception as e:
        logger_to_db.error("Parsing problem with ID %s. %s"%(ID,str(e)))
        continue


    try:
        # The stenogram page links an .xls with votes by MP name; pull its
        # filename out of the page by the date-stamped naming convention.
        filename = re.search(r"/pub/StenD/(\d*iv%s.xls)" % date_string, complete_stenogram_page).groups()[0]
        by_name_web = urlopen("http://www.parliament.bg/pub/StenD/%s" % filename)
        by_name_temp = open('/tmp/temp.excel', 'wb')
        by_name_temp.write(by_name_web.read())
        by_name_temp.close()
        if ID == '2766': # XXX Workaround malformated excel file.
# NOTE(review): chunk seam — the `if` above has its body outside the visible
# lines, and the two lines below are residue of a dict literal defined
# elsewhere in the file (a bill-status mapping); left byte-identical.
#u'обсъждане(зала първо четене)':                       'proposed_1st', see signature 002-02-50
}


##############################################################################
# Gather bills.
##############################################################################
# NOTE(review): chunk truncated — the inner loop body continues past the
# visible lines.
logger_html_bills = logging.getLogger('html_parser_bills')

# Dedicated cursor so the dedup query does not disturb the shared `cur`.
origurlcur = db.cursor()
origurlcur.execute("""SELECT original_url FROM bills""")
urls_already_in_db = set(u[0] for u in origurlcur)

logger_html_bills.info('Opening calendar.')
base_url = 'http://www.parliament.bg'
# NOTE(review): BeautifulSoup without an explicit parser argument picks the
# "best" installed parser, which can differ between machines — consider
# naming one (e.g. 'html.parser') for reproducibility.
parser_calendar = bs4.BeautifulSoup(urlopen(base_url + '/bg/bills/').read())
# Walk every month link in the bills calendar sidebar.
for month in parser_calendar.find('div', id='calendar').find_all('a'):
    href = month.get('href')
    # Calendar hrefs end in ".../YYYY-MM"; parse out year and month.
    y,m = map(int, href.split('/')[-1].split('-'))
    if y<2009 or (y==2009 and m<7): continue # XXX hardcoded check (only last parliament)
    logger_html_bills.info('Opening calendar %d %d.'%(y, m))
    month_page = bs4.BeautifulSoup(urlopen(base_url + href).read())
    for a in month_page.find('div', id='monthview').find_all('a'):
        original_url = base_url + a.get('href')
        # Skip bills already persisted in the database.
        if original_url in urls_already_in_db:
            continue
        bill_page = bs4.BeautifulSoup(urlopen(original_url).read())
        table = bill_page.find('table', class_='bills')

        # Bill title: text after the u'Законопроект за' prefix in row 0.
        name = table.find_all('tr')[0].find('strong').string.split(u'Законопроект за')[-1].strip()
        # Registration signature from the second row, second cell.
        sig = table.find_all('tr')[1].find_all('td')[1].string.strip()