def import_governingparties(self):
    """This requires that governments & parties have already been imported."""
    path = os.path.dirname(os.path.realpath(__file__))
    f = open(os.path.join(path, self.GPS_FILENAME))
    for line in f.readlines():
        line = line.strip().decode('utf8')
        if not line or line[0] == '#':
            continue
        (party, government, begin, end) = line.split('\t')
        try:
            party = Party.objects.get(abbreviation=party)
        except Party.DoesNotExist:
            raise ParseError(
                'Invalid party %s in initial governing party data' % party)
        try:
            government = Government.objects.get(name=government)
        except Government.DoesNotExist:
            raise ParseError(
                'Invalid government %s in initial governing party data' % government)
        try:
            gp = GoverningParty.objects.get(party=party, government=government)
            if not self.replace:
                continue
        except GoverningParty.DoesNotExist:
            gp = GoverningParty(party=party, government=government)
        gp.begin = begin
        if end == "None":
            gp.end = None
        else:
            gp.end = end
        self.logger.info("importing governing party %s / %s - %s" %
                         (gp.party, gp.begin, gp.end))
        gp.save()
    f.close()
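# Illustrative only: a sketch of what a line in the governing-party data file
# (self.GPS_FILENAME) is expected to look like, based on the parsing above.
# Fields are tab-separated: party abbreviation, government name, begin date and
# end date, where the literal string "None" marks a still-sitting government.
# Lines starting with '#' are comments. The example values are hypothetical.
#
#   # party<TAB>government<TAB>begin<TAB>end
#   kok	Katainen	2011-06-22	None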
def get_field_el(doc, field):
    # Get "doclist-items" elements listed by table headers (th)
    el_list = doc.xpath(
        '//div[@class="doclist-items"]/div[@class="listborder"]/table//th')
    for el in el_list:
        s = el.text.split(':')[0].strip()
        if s == FIELD_MAP[field]:
            # td follows th, so positional selection can be used
            td = el.getnext()
            if td.tag != 'td':
                raise ParseError('expecting a td element')
            return td
    return None
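# Illustrative only: get_field_el() maps a logical field name to the Finnish
# table-header label through FIELD_MAP and returns the matching <td> element.
# The sketch below shows the expected shape of FIELD_MAP and a typical call;
# the label strings are hypothetical placeholders, not the real values defined
# elsewhere in this module.
#
#   FIELD_MAP = {
#       'name': 'Täydellinen nimi',   # hypothetical label
#       'phone': 'Puhelin',           # hypothetical label
#       'email': 'Sähköposti',        # hypothetical label
#   }
#
#   td = get_field_el(doc, 'phone')
#   if td is not None:
#       phone = td.text.strip()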
def _import_one(self, vote_id):
    (year, plsess, nr) = vote_id.split('/')
    url = self.URL_BASE + self.VOTE_URL % (int(year), plsess, int(nr))
    el_list, next_link = self.read_listing(self.CACHE_DIR, url)
    if len(el_list) != 1:
        raise ParseError("vote with id %s not found" % vote_id, url=url)
    el = el_list[0]
    vote_id_str = "%s/%s/%s" % (plsess, year, nr)
    got_id = "%s/%d" % (el['plsess'], el['number'])
    if vote_id_str != got_id:
        raise ParseError("invalid vote returned (wanted %s, got %s)" %
                         (vote_id_str, got_id), url=url)
    info = {'plsess': el['plsess'], 'number': el['number']}
    info['link'] = el['results_link']
    try:
        plv = self.import_session(info)
    except ParseError as e:
        e.url = url
        raise
    db.reset_queries()
    return plv
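# Illustrative only: _import_one() expects a vote id of the form
# "<year>/<plenary session number>/<vote number>" and reorders it to the
# "<plsess>/<year>/<number>" form reported by the listing page before
# comparing. The id below is hypothetical.
#
#   importer._import_one('2014/85/3')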
def import_sgml_doc(self, info, current_version):
    url = DOC_DL_URL % (info['type'], info['id'])
    if not current_version:
        current_version = '0'
    xml_fn = self.download_sgml_doc(info, url, current_version=current_version)
    if not xml_fn:
        return None

    f = open(xml_fn, 'r')
    root = html.fromstring(f.read())
    f.close()

    el_list = root.xpath('.//ident/nimike')
    assert len(el_list) >= 1
    el = el_list[0]
    text = self.clean_text(el.text)
    self.logger.info('%s %s: %s' % (info['type'], info['id'], text))
    info['subject'] = text

    if info['type'] == 'KK':
        self.parse_question_text(root, info)
    else:
        if info['type'].endswith('VM'):
            xpath_list = ('.//asianvir', './/emasianv')
        else:
            xpath_list = ('.//peruste', './/paasis', './/yleisper')
        for xpath in xpath_list:
            l = root.xpath(xpath)
            if not len(l):
                continue
            assert len(l) == 1
            target_el = l[0]
            break
        else:
            raise ParseError('Summary section not found')
        info['summary'] = self.parse_te_paragraphs(target_el)

    if info['type'] in ('KK', 'LA'):
        if info['type'] == 'KK':
            author_root = root.xpath(".//kysosa[@kieli='suomi']")
            assert len(author_root) == 1
            author_root = author_root[0]
        else:
            author_root = root
        info['author'] = self.parse_author(author_root)
        info['signatures'] = self.parse_signatures(author_root)

    return info
def import_members(self, **args):
    if not MemberActivityType.objects.count():
        import_activity_types()
        self.logger.info("%d activity types imported" %
                         MemberActivityType.objects.count())

    self.logger.debug("fetching MP list")
    if args.get('full', False):
        date_str = '24.03.1999'
    else:
        term = Term.objects.latest()
        date_str = term.begin.strftime('%d.%m.%Y')
    list_url = self.URL_BASE + self.LIST_URL % date_str
    s = self.open_url(list_url, 'member')
    doc = html.fromstring(s)
    doc.make_links_absolute(list_url)

    link_list = doc.xpath("//a[@target='edus_main']")
    for l in link_list:
        name = l.text.strip().replace(' ', '')
        url = l.attrib['href']
        if 'single' in args and not args['single'].lower() in name.lower():
            continue
        self.logger.debug("fetching MP %s" % name)
        name = re.sub(r'\s*\([\w\d. ,]+\)\s*', '', name)
        last_name, given_names = name.split(',')
        given_names = given_names.strip()
        last_name = last_name.strip()
        try:
            Member.objects.get(surname=last_name, given_names=given_names)
            if not self.replace:
                continue
        except Member.DoesNotExist:
            pass

        s = self.open_url(url, 'member')
        doc = html.fromstring(s)
        el = doc.xpath("//frame[@name='vasen2']")
        if len(el) != 1:
            raise ParseError("Invalid MP info frame")
        s = el[0].attrib['src']
        m = re.search(r'hnro=(\d+)', s)
        if not m:
            raise ParseError("MP ID not found")
        mp_id = int(m.groups()[0])

        # FIXME: New MPs that replace the euro-MEPs -- remove this later
        if mp_id in (1275, 1276, 1277):
            if datetime.now() < datetime(year=2014, month=7, day=4):
                continue

        mp_info = self.fetch_member(mp_id)
        if not mp_info:
            continue

        if 'dry_run' in args and not args['dry_run']:
            self.save_member(mp_info)
        elif 'dry_run' in args and args['dry_run']:
            pprint.pprint(mp_info)

    self.logger.info('Imported {0} MPs'.format(len(link_list)))

    self.logger.info(
        "Adding Carl Haglund as a pseudo-MP for purposes of minister counting")
    haglund_info = {
        'birthdate': '1979-03-29',
        'birthplace': 'Espoo',
        'email': '*****@*****.**',
        'given_names': 'Carl Christoffer',
        'home_county': 'Espoo',
        'id': "nonmp_0001",
        'info_url': 'http://valtioneuvosto.fi/hallitus/jasenet/puolustusministeri/fi.jsp',
        'name': 'Carl Haglund',
        # The following two are a minimal hack to make Carl only show up in the
        # minister calculations. See save_member.
        'party': 'r',
        'parties': {},
        'phone': '09 1608 8284',
        'portrait': 'http://valtioneuvosto.fi/documents/10184/143444/Carl+Haglund/694456f8-8ce7-453a-bf8d-161c4a4d01ca?t=1404465254000&width=500',
        'districts': {},
        'posts': [{
            'begin': datetime(year=2012, month=7, day=5).date(),
            'end': None,
            'label': 'puolustusministeri',
            'role': 'minister',
        }],
        'surname': 'Haglund',
    }
    self.save_member(haglund_info)
def fetch_member(self, mp_id):
    url = self.URL_BASE + self.MP_INFO_URL % mp_id
    s = self.open_url(url, 'member')
    doc = html.fromstring(s)
    doc.make_links_absolute(url)

    mp_info = {'id': mp_id, 'info_url': url}

    name_el = doc.xpath('//div[@id="content"]/div[@class="header"]/h1')
    if len(name_el) != 1:
        raise ParseError("MP name not found")
    name, pg = name_el[0].text.strip().split('/')
    name = name.strip()
    pg = pg.strip()
    names = name.split()
    surname, first_names = names[-1], ' '.join(names[0:-1])
    mp_info['name'] = "%s %s" % (surname, first_names)

    name_el = get_field_el(doc, 'name')
    if name_el is None:
        self.logger.warning("MP info element not found")
        return None
    surname, given_names = name_el.text.strip().split(', ')
    if '(' in given_names:
        given_names = given_names.split('(')[0].strip()
    if not given_names and surname == 'Saarikangas':
        given_names = 'Martin'
    mp_info['surname'] = surname
    mp_info['given_names'] = given_names.strip()

    td = get_field_el(doc, 'phone')
    if td is not None:
        mp_info['phone'] = td.text.strip()
    td = get_field_el(doc, 'email')
    if td is not None:
        mp_info['email'] = td.text_content().strip().replace('[at]', '@')

    td = get_field_el(doc, 'birth')
    text = td.text.strip()
    # First try to match the birthplace, too
    m = re.match(self.DATE_MATCH + r'\s+(\w+)', text, re.U)
    if not m:
        m = re.match(self.DATE_MATCH, text, re.U)
        if not m:
            raise ParseError("Invalid MP birth date")
    (day, mon, year) = m.groups()[0:3]
    mp_info['birthdate'] = '-'.join((year, mon, day))
    if len(m.groups()) == 4:
        mp_info['birthplace'] = m.groups()[3]

    # Electorate
    td = get_field_el(doc, 'home_county')
    if td is not None:
        mp_info['home_county'] = td.text.strip()

    td = get_field_el(doc, 'districts')
    el_list = td.xpath('ul/li')
    da_list = []
    for el in el_list:
        district, date_range = el.text.strip().split(' ')
        dates = date_range.split(' - ')
        da = {'district': district, 'begin': self.convert_date(dates[0])}
        if len(dates) > 1:
            da['end'] = self.convert_date(dates[1])
        da_list.append(da)
    mp_info['districts'] = da_list

    # Party memberships
    td = get_field_el(doc, 'parties')
    el_list = td.xpath('ul/li')
    pa_list = []
    for el in el_list:
        a_el = el.xpath('a')
        if not a_el:
            # Strip text within parentheses
            m = re.match(r'([^\(]*)\([^\),]+\)(.+)', el.text)
            if m:
                text = ' '.join(m.groups())
            else:
                text = el.text
            m = re.match(r'(\D+)\s+([\d\.,\s\-]+)$', text.strip())
            party, date_ranges = (m.groups()[0], m.groups()[1])
        else:
            a_el = a_el[0]
            party, date_ranges = (a_el.text.strip(), a_el.tail.strip())
            # Strip text within parentheses
            m = re.match(r'([^\(]*)\([^\)]+\)(.+)', date_ranges)
            if m:
                date_ranges = ' '.join(m.groups())
        for dr in date_ranges.split(','):
            pa = {'party': party}
            dates = dr.strip().split(' - ')
            pa['begin'] = self.convert_date(dates[0])
            if len(dates) > 1:
                pa['end'] = self.convert_date(dates[1])
            pa_list.append(pa)
    mp_info['parties'] = pa_list

    img_el = doc.xpath('//div[@id="submenu"]//img[@class="portrait"]')
    mp_info['portrait'] = img_el[0].attrib['src']

    # Committee memberships
    mp_info['posts'] = self.resolve_memberships(doc)

    mp_info['gender'] = figure_mp_gender(name)

    return mp_info
def handle_processing_stages(self, info, html_doc):
    doc_name = "%s %s" % (info['type'], info['id'])
    status_map = {1: 'upcoming', 2: 'in_progress', 3: 'finished'}
    names = {
        'vireil': ('intro', ('Annettu eduskunnalle', 'Aloite jätetty')),
        'lahete': ('debate', 'Lähetekeskustelu'),
        'valiok': ('committee', 'Valiokuntakäsittely'),
        'poydal': ('agenda', 'Valiokunnan mietinnön pöydällepano'),
        '1kasit': ('1stread', 'Ensimmäinen käsittely'),
        '2kasit': ('2ndread', 'Toinen käsittely'),
        '3kasit': ('3ndread', 'Kolmas käsittely'),
        'paat': ('finished', None),
        'akasit': ('onlyread', 'Ainoa käsittely'),
        'akja2k': ('only2read', 'Ainoa ja toinen käsittely'),
        '3kjaak': ('only3read', 'Kolmas ja ainoa käsittely'),
        'peru': ('cancelled', 'Ilmoitus peruuttamisesta'),
        'rauennut': ('lapsed', None),
        'raue': ('lapsed', None),
        'jatlep': ('suspended', None),
    }
    finishing_phases = ('3ndread', '2ndread', 'onlyread', 'only2read',
                        'only3read', 'cancelled', 'lapsed')

    img_els = html_doc.xpath("//div[@id='vepsasia-kasittely']/img")
    assert len(img_els)
    phases = []
    for img in img_els:
        s = img.attrib['src']
        m = re.match(r'/thwfakta/yht/kuvat/palkki/ve([a-z0-9]+)_(\d)\.gif', s)
        phase = m.groups()[0]
        status = int(m.groups()[1])
        status = status_map[status]
        if phase not in names:
            raise ParseError("unknown processing phase %s" % phase)
        l = names[phase]
        phase = l[0]
        phases.append((phase, status, l[1]))
    last_phase = phases[-1][0]

    phase_list = []
    # Tracks whether an explicit 'finished' stage was encountered.
    finishing_phase = None
    for idx, (phase, status, el_name) in enumerate(phases):
        if not el_name or status not in ('in_progress', 'finished'):
            continue
        box_el_list = html_doc.xpath("//div[@class='listborder']//h2")

        # quirks
        if doc_name in ('HE 25/2009', 'HE 57/2014', 'HE 29/2014') and phase == '2ndread':
            el_name = names['akja2k'][1]
        if doc_name in ('HE 29/2014', 'HE 3/2014', 'HE 215/2013', 'HE 203/2013',
                        'HE 288/2014', 'HE 297/2014') and phase == 'only2read':
            phase = '2ndread'
            el_name = names['2kasit'][1]
        if doc_name == 'HE 112/2011':
            if phase == '2ndread':
                continue
            if phase == '1stread':
                phase = 'onlyread'
                el_name = names['akasit'][1]

        for box_el in box_el_list:
            s = box_el.text_content().strip().strip('.')
            if isinstance(el_name, tuple):
                if s not in el_name:
                    continue
            else:
                if el_name != s:
                    continue
            parent_el = box_el.getparent().getparent()
            break
        else:
            if phase == 'committee' and last_phase in ('cancelled', 'lapsed'):
                continue
            self.logger.warning("processing stage '%s' not found" % el_name)
            continue

        phase_info = {}
        if phase == 'committee':
            el_list = parent_el.xpath(".//div[contains(., 'Valmistunut')]")
            date_list = []
            for date_el in el_list:
                date = date_el.tail.strip()
                (d, m, y) = date.split('.')
                date = '-'.join((y, m, d))
                date_list.append(date)
            if not date_list and last_phase in ('cancelled', 'lapsed'):
                continue
            if not date_list:
                self.logger.warning("date not found for committee phase")
                continue
            date = max(date_list)
        else:
            date_el = parent_el.xpath(".//div[.='Pvm']")
            assert len(date_el) == 1
            arr = date_el[0].getparent().text_content().strip().split()
            assert len(arr) >= 2
            (d, m, y) = arr[-1].split('.')
            date = '-'.join((y, m, d))

        min_el = parent_el.xpath(".//div[.='Istuntopöytäkirja']")
        if min_el and phase != 'cancelled':
            links = min_el[0].getparent().xpath('a')
            assert len(links) >= 1
            plsess_list = []
            for l in links:
                href = l.attrib['href']
                m = re.search(r'\{KEY\}=PTK\+(\d+/\d{4})', href)
                assert m, 'Plenary session id not found (phase %s)' % phase
                plsess_id = m.groups()[0]
                m = re.search(r'\{KNRO\}=(\d+)', href)
                assert m, 'Plenary session item number not found (phase %s)' % phase
                plitem_nr = m.groups()[0]
                plsess_list.append({'plsess': plsess_id, 'index': plitem_nr})
            phase_info['plsess_items'] = plsess_list

        if phase == 'finished':
            finishing_phase = idx
        phase_info.update({'index': idx, 'phase': phase, 'date': date})
        phase_list.append(phase_info)
        #print "%s: %s" % (phase, date)

    if not finishing_phase:
        for p in phases:
            if p[0] == 'finished' and p[1] != 'upcoming':
                is_finished = True
                break
        else:
            is_finished = False
        if is_finished:
            for p in phase_list:
                if p['phase'] in finishing_phases:
                    finishing_phase = p
                    break
            assert finishing_phase, 'Finishing phase not found'
            idx = finishing_phase['index'] + 1
            max_idx = max([x['index'] for x in phase_list])
            assert max_idx < idx
            phase_list.append({
                'index': idx,
                'phase': 'finished',
                'date': finishing_phase['date'],
            })

    info['phases'] = phase_list
def import_doc(self, info):
    url = DOC_DL_URL % (info['type'], info['id'])
    info['info_link'] = url
    self.fix_id_quirks(info)

    if not should_download_doc(info):
        self.logger.warning("skipping %s %s" % (info['type'], info['id']))
        return None

    origin_id = "%s %s" % (info['type'], info['id'])
    try:
        doc = Document.objects.get(origin_id=origin_id)
    except Document.DoesNotExist:
        doc = Document(origin_id=origin_id)

    if 'update_time' in info:
        doc.mark_checked()
        if doc.last_modified_time and doc.last_modified_time >= info['update_time'] \
                and not self.replace:
            self.logger.debug("%s %s not updated" % (info['type'], info['id']))
            doc.save(update_fields=['last_checked_time'])
            return None
        else:
            self.logger.debug("%s %s updated %s (checked %s)" %
                              (info['type'], info['id'], info['update_time'],
                               doc.last_modified_time))
    else:
        if doc.pk and not self.replace:
            return doc

    doc.type = DOC_TYPES[info['type']]
    doc.name = origin_id

    info = self.fetch_processing_info(info)
    if info['type'] == 'HE':
        self.import_he(info)
    else:
        ret = self.import_sgml_doc(info, current_version=doc.version)
        if not ret:
            return None

    doc.version = info.get('doc_version', None)
    doc.subject = info['subject']
    for attr_name in ('summary', 'question', 'answer', 'answerer_name',
                      'answerer_title'):
        if attr_name in info:
            setattr(doc, attr_name, info[attr_name])
    if 'error' in info:
        doc.error = info['error']
    else:
        doc.error = None

    # Figure out the document date through the intro stage.
    for st in info['phases']:
        if st['phase'] == 'intro':
            doc.date = st['date']
            break
    if doc.date is None:
        raise ParseError("Document date could not be determined")

    doc.info_link = info['info_link']
    if 'sgml_link' in info:
        doc.sgml_link = info['sgml_link']
    if 'author' in info:
        doc.author = Member.objects.get(origin_id=info['author']['id'])

    doc.mark_modified()
    doc.save()

    self.save_stages(doc, info)
    self.save_keywords(doc, info)
    if 'signatures' in info:
        self.save_signatures(doc, info)

    # The keywords are saved only at this point. We'll save the document again
    # in order to create the proper KeywordActivity objects.
    doc._updated = True
    doc.save()

    return doc
def import_session(self, info):
    if info['plsess'] not in self.plsess_by_id:
        try:
            plsess = PlenarySession.objects.get(origin_id=info['plsess'])
        except PlenarySession.DoesNotExist:
            raise Exception("Vote %s refers to nonexistent plenary session %s" %
                            (info['number'], info['plsess']))
        self.plsess_by_id[info['plsess']] = plsess
    plsess = self.plsess_by_id[info['plsess']]

    try:
        pv = PlenaryVote.objects.get(plsess=plsess, number=info['number'])
        if not self.replace:
            return
    except PlenaryVote.DoesNotExist:
        pv = PlenaryVote(plsess=plsess, number=info['number'])

    self.logger.info('processing plenary vote %s/%d' % (plsess.name, info['number']))
    s = self.open_url(info['link'], self.CACHE_DIR)
    doc = html.fromstring(s)

    hdr_el = doc.xpath('//table[@class="voteResults"]')
    if len(hdr_el) < 1:
        raise ParseError('vote header not found')
    hdr_el = hdr_el[0]
    s = self.clean_text(hdr_el.xpath('caption')[0].text)
    m = re.match(r'Äänestys (\d+) klo (\d{2}\.\d{2})', s, re.U)
    info['time'] = m.groups()[1]

    el = hdr_el.xpath('tbody/tr')[0].xpath('td')[1]
    s = self.clean_text(el.text)
    info['subject'] = s

    el = hdr_el.xpath('tbody/tr/td/strong')[0]
    s = self.clean_text(el.text)
    step = PROCESSING_STEP[s]

    el = doc.xpath("//th[contains(., 'nestysasettelu')]")[0]
    s = self.clean_text(el.getnext().text)
    info['setting'] = s

    vote_list_el = doc.xpath('//table[@class="statistics"]/tbody/tr')
    # The table lists two MPs per row, so a full 200-member parliament yields
    # roughly 98-100 rows.
    if len(vote_list_el) < 196 / 2 or len(vote_list_el) > 200 / 2:
        raise ParseError('vote list not found')
    votes = []
    for row_el in vote_list_el:
        td_list = row_el.xpath('td')
        if len(td_list) != 5:
            raise ParseError('invalid vote row')
        votes.append(parse_vote(td_list[0].text, td_list[1].text))
        if td_list[3].text:
            votes.append(parse_vote(td_list[3].text, td_list[4].text))
    info['votes'] = votes

    pv.mark_modified()
    pv.mark_checked()

    self.updated += 1
    return self.save_session(pv, info)