def get_session(self, session_url=None, session_id=None):
    """
    Load session details for the given detail page URL or numeric ID.

    Fetches the session detail page, extracts title, committee, identifier,
    date, address, description, agenda items and session-level attachments
    from it and stores the resulting Session via self.db.save_session().

    Args:
        session_url: URL of the session detail page. Used to derive the
            numeric ID when session_id is not given.
        session_id: Numeric session ID. Takes precedence over session_url;
            the URL is then built from SESSION_DETAIL_PRINT_PATTERN.

    Returns:
        None. Returns early, without saving anything, when the page shows
        a server error, a permission error or "Keine Daten gefunden".

    Raises:
        ValueError: if neither argument is given, if session_url does not
            match SESSION_DETAIL_PARSE_PATTERN, or if an agenda item info
            row carries an unknown label.
        TemplateError: if an expected page element cannot be located with
            the configured XPath expressions.
    """
    # Derive whichever of session_id / session_url was not supplied
    if session_id is not None:
        session_url = self.urls['SESSION_DETAIL_PRINT_PATTERN'] % session_id
    elif session_url is not None:
        parsed = parse.search(self.urls['SESSION_DETAIL_PARSE_PATTERN'],
                              session_url)
        if parsed is None:
            raise ValueError(
                'Cannot extract session ID from URL %r' % session_url)
        session_id = parsed['session_id']
    else:
        raise ValueError('Either session_url or session_id is required')

    logging.info("Getting session %d from %s", session_id, session_url)

    session = Session(numeric_id=session_id)

    time.sleep(self.config.WAIT_TIME)
    response = self.user_agent.open(session_url)
    # forms for later attachment download
    mechanize_forms = mechanize.ParseResponse(response,
                                              backwards_compat=False)
    # seek(0) is necessary to reset response pointer.
    response.seek(0)
    html = response.read()
    # NOTE(review): presumably normalises non-breaking spaces (e.g. &nbsp;)
    # to plain spaces - confirm the first literal is the intended character.
    html = html.replace(' ', ' ')
    parser = etree.HTMLParser()
    dom = etree.parse(StringIO(html), parser)

    # check for page errors
    try:
        page_title = dom.xpath('//h1')[0].text
    except IndexError:
        # page has no <h1>, so there is no error headline to inspect
        page_title = None
    if page_title is not None:
        if 'Fehlermeldung' in page_title:
            logging.info("Page %s cannot be accessed due to server error",
                         session_url)
            if self.options.verbose:
                print("Page %s cannot be accessed due to server error"
                      % session_url)
            return
        if 'Berechtigungsfehler' in page_title:
            logging.info("Page %s cannot be accessed due to permissions",
                         session_url)
            if self.options.verbose:
                print("Page %s cannot be accessed due to permissions"
                      % session_url)
            return
    try:
        error_h3 = dom.xpath('//h3[@class="smc_h3"]')[0].text.strip()
    except (IndexError, AttributeError):
        # no such heading, or the heading has no leading text
        error_h3 = ''
    if 'Keine Daten gefunden' in error_h3:
        logging.info("Page %s does not contain any agenda items",
                     session_url)
        if self.options.verbose:
            print("Page %s does not contain agenda items" % session_url)
        return

    session.original_url = session_url

    # Session title
    try:
        session.title = dom.xpath(self.xpath['SESSION_DETAIL_TITLE'])[0].text
    except Exception:
        logging.critical(
            'Cannot find session title element using XPath SESSION_DETAIL_TITLE')
        raise TemplateError(
            'Cannot find session title element using XPath SESSION_DETAIL_TITLE')

    # Committee link
    try:
        links = dom.xpath(self.xpath['SESSION_DETAIL_COMMITTEE_LINK'])
        for link in links:
            href = link.get('href')
            parsed = parse.search(
                self.urls['COMMITTEE_DETAIL_PARSE_PATTERN'], href)
            if parsed is not None:
                session.committee_id = parsed['committee_id']
    except Exception:
        logging.critical(
            'Cannot find link to committee detail page using SESSION_DETAIL_COMMITTEE_LINK_XPATH')
        raise TemplateError(
            'Cannot find link to committee detail page using SESSION_DETAIL_COMMITTEE_LINK_XPATH')

    # Session identifier, date, address etc
    tds = dom.xpath(self.xpath['SESSION_DETAIL_IDENTIFIER_TD'])
    if len(tds) == 0:
        logging.critical(
            'Cannot find table fields using SESSION_DETAIL_IDENTIFIER_TD_XPATH')
        raise TemplateError(
            'Cannot find table fields using SESSION_DETAIL_IDENTIFIER_TD_XPATH')
    for n in range(len(tds)):
        try:
            tdcontent = tds[n].text.strip()
            nextcontent = tds[n + 1].text.strip()
        except (AttributeError, IndexError):
            # empty cell or last cell: no label/value pair starts here
            continue
        if tdcontent == 'Sitzung:':
            session.identifier = nextcontent
        elif tdcontent == 'Gremium:':
            session.committee_name = nextcontent
        elif tdcontent == 'Datum:':
            datestring = nextcontent
            # A 'Zeit:' label/value pair may follow the date. The time has
            # to be assigned back to datestring (it used to be computed and
            # thrown away) and the cells must exist before being indexed.
            if (len(tds) > n + 3
                    and tds[n + 2].text == 'Zeit:'
                    and tds[n + 3].text is not None):
                datestring = datestring + ' ' + tds[n + 3].text.strip()
            session.date_start = datestring
        elif tdcontent == 'Raum:':
            session.address = " ".join(tds[n + 1].xpath('./text()'))
        elif tdcontent == 'Bezeichnung:':
            session.description = nextcontent
    if not hasattr(session, 'identifier'):
        logging.critical(
            'Cannot find session identifier using XPath SESSION_DETAIL_IDENTIFIER_TD')
        raise TemplateError(
            'Cannot find session identifier using XPath SESSION_DETAIL_IDENTIFIER_TD')

    # Agendaitems
    # IDs of attachments seen in agenda rows; only used for membership tests
    found_attachments = set()
    rows = dom.xpath(self.xpath['SESSION_DETAIL_AGENDA_ROWS'])
    if len(rows) == 0:
        logging.critical(
            'Cannot find agenda using XPath SESSION_DETAIL_AGENDA_ROWS')
        raise TemplateError(
            'Cannot find agenda using XPath SESSION_DETAIL_AGENDA_ROWS')
    agendaitems = {}
    agendaitem_id = None
    # flips to False once the "Nicht öffentlich" subheading has been passed
    public = True
    for row in rows:
        row_id = row.get('id')
        # rows without a class attribute are treated as having no classes
        row_classes = (row.get('class') or '').split(' ')
        fields = row.xpath('td')
        number_texts = fields[0].xpath('./text()')
        number = number_texts[0] if number_texts else None
        if row_id is not None:
            # Agendaitem main row
            agendaitem_id = row_id.rsplit('_', 1)[1]
            agendaitems[agendaitem_id] = {}
            agendaitems[agendaitem_id]['id'] = int(agendaitem_id)
            if number is not None:
                agendaitems[agendaitem_id]['number'] = number
            agendaitems[agendaitem_id]['subject'] = "; ".join(
                fields[1].xpath('./text()'))
            agendaitems[agendaitem_id]['public'] = public
            # submission links
            links = row.xpath(
                self.xpath['SESSION_DETAIL_AGENDA_ROWS_SUBMISSION_LINK'])
            submissions = []
            for link in links:
                href = link.get('href')
                if href is None:
                    continue
                parsed = parse.search(
                    self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
                if parsed is not None:
                    submission = Submission(
                        numeric_id=int(parsed['submission_id']),
                        identifier=link.text)
                    submissions.append(submission)
                    # Add submission to submission queue
                    if hasattr(self, 'submission_queue'):
                        self.submission_queue.add(
                            int(parsed['submission_id']))
            if len(submissions):
                agendaitems[agendaitem_id]['submissions'] = submissions
            # Note: we don't scrape agendaitem-related attachments for now,
            # based on the assumption that they are all found via submission
            # detail pages. All we do here is collect their attachment IDs
            # in found_attachments.
            for form in row.xpath('.//form'):
                for hidden_field in form.xpath('input'):
                    if hidden_field.get('name') != 'DT':
                        continue
                    found_attachments.add(hidden_field.get('value'))
        elif 'smc_tophz' in row_classes:
            # additional (optional) row for the current agendaitem
            label = fields[1].text
            value = fields[2].text
            if label is not None and value is not None:
                label = label.strip()
                value = value.strip()
                if label in ['Ergebnis:', 'Beschluss:']:
                    if value in self.config.RESULT_STRINGS:
                        agendaitems[agendaitem_id]['result'] = \
                            self.config.RESULT_STRINGS[value]
                    else:
                        logging.warning(
                            "String '%s' not found in configured RESULT_STRINGS",
                            value)
                        if self.options.verbose:
                            print("WARNING: String '%s' not found in RESULT_STRINGS\n"
                                  % value)
                        # fall back to the raw string from the page
                        agendaitems[agendaitem_id]['result'] = value
                elif label == 'Bemerkung:':
                    agendaitems[agendaitem_id]['result_note'] = value
                elif label == 'Abstimmung:':
                    agendaitems[agendaitem_id]['voting'] = value
                else:
                    logging.critical(
                        "Agendaitem info label '%s' is unknown", label)
                    raise ValueError(
                        'Agendaitem info label "%s" is unknown' % label)
        elif 'smcrowh' in row_classes:
            # Subheading (public / nonpublic part)
            if fields[0].text is not None and \
                    "Nicht öffentlich" in fields[0].text.encode('utf-8'):
                public = False
    session.agendaitems = agendaitems.values()

    # session-related attachments
    containers = dom.xpath(self.xpath['SESSION_DETAIL_ATTACHMENTS'])
    for container in containers:
        classes = container.get('class')
        if classes is None:
            continue
        classes = classes.split(' ')
        if self.xpath['SESSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME'] \
                not in classes:
            continue
        attachments = []
        for row in container.xpath('.//tr'):
            for form in row.xpath('.//form'):
                name = " ".join(row.xpath('./td/text()')).strip()
                for hidden_field in form.xpath('input'):
                    if hidden_field.get('name') != 'DT':
                        continue
                    attachment_id = hidden_field.get('value')
                    # make sure to add only those which aren't
                    # agendaitem-related (or already handled)
                    if attachment_id in found_attachments:
                        continue
                    attachment = Attachment(identifier=attachment_id,
                                            name=name)
                    # Traversing the whole mechanize response to find the
                    # form that has to be submitted for this attachment
                    for mform in mechanize_forms:
                        for control in mform.controls:
                            if (control.name == 'DT'
                                    and control.value == attachment_id):
                                attachment = self.get_attachment_file(
                                    attachment, mform)
                    attachments.append(attachment)
                    found_attachments.add(attachment_id)
        if len(attachments):
            session.attachments = attachments

    oid = self.db.save_session(session)
    if self.options.verbose:
        logging.info("Session %d stored with _id %s", session_id, oid)
def get_session(self, session_url=None, session_id=None):
    """
    Load session details for the given detail page URL or numeric ID.

    Fetches the session detail page, extracts title, committee, identifier,
    date, address, description, agenda items and session-level attachments
    from it and stores the resulting Session via self.db.save_session().

    Args:
        session_url: URL of the session detail page. Used to derive the
            numeric ID when session_id is not given.
        session_id: Numeric session ID. Takes precedence over session_url;
            the URL is then built from SESSION_DETAIL_PRINT_PATTERN.

    Returns:
        None. Returns early, without saving anything, when the page shows
        a server error, a permission error or "Keine Daten gefunden".

    Raises:
        ValueError: if neither argument is given, if session_url does not
            match SESSION_DETAIL_PARSE_PATTERN, or if an agenda item info
            row carries an unknown label.
        TemplateError: if an expected page element cannot be located with
            the configured XPath expressions.
    """
    # Derive whichever of session_id / session_url was not supplied
    if session_id is not None:
        session_url = self.urls['SESSION_DETAIL_PRINT_PATTERN'] % session_id
    elif session_url is not None:
        parsed = parse.search(self.urls['SESSION_DETAIL_PARSE_PATTERN'],
                              session_url)
        if parsed is None:
            raise ValueError(
                'Cannot extract session ID from URL %r' % session_url)
        session_id = parsed['session_id']
    else:
        raise ValueError('Either session_url or session_id is required')

    logging.info("Getting session %d from %s", session_id, session_url)

    session = Session(numeric_id=session_id)

    time.sleep(self.config.WAIT_TIME)
    response = self.user_agent.open(session_url)
    # forms for later attachment download
    mechanize_forms = mechanize.ParseResponse(response,
                                              backwards_compat=False)
    # seek(0) is necessary to reset response pointer.
    response.seek(0)
    html = response.read()
    # NOTE(review): presumably normalises non-breaking spaces (e.g. &nbsp;)
    # to plain spaces - confirm the first literal is the intended character.
    html = html.replace(' ', ' ')
    parser = etree.HTMLParser()
    dom = etree.parse(StringIO(html), parser)

    # check for page errors
    try:
        page_title = dom.xpath('//h1')[0].text
    except IndexError:
        # page has no <h1>, so there is no error headline to inspect
        page_title = None
    if page_title is not None:
        if 'Fehlermeldung' in page_title:
            logging.info("Page %s cannot be accessed due to server error",
                         session_url)
            if self.options.verbose:
                print("Page %s cannot be accessed due to server error"
                      % session_url)
            return
        if 'Berechtigungsfehler' in page_title:
            logging.info("Page %s cannot be accessed due to permissions",
                         session_url)
            if self.options.verbose:
                print("Page %s cannot be accessed due to permissions"
                      % session_url)
            return
    try:
        error_h3 = dom.xpath('//h3[@class="smc_h3"]')[0].text.strip()
    except (IndexError, AttributeError):
        # no such heading, or the heading has no leading text
        error_h3 = ''
    if 'Keine Daten gefunden' in error_h3:
        logging.info("Page %s does not contain any agenda items",
                     session_url)
        if self.options.verbose:
            print("Page %s does not contain agenda items" % session_url)
        return

    session.original_url = session_url

    # Session title
    try:
        session.title = dom.xpath(self.xpath['SESSION_DETAIL_TITLE'])[0].text
    except Exception:
        logging.critical(
            'Cannot find session title element using XPath SESSION_DETAIL_TITLE')
        raise TemplateError(
            'Cannot find session title element using XPath SESSION_DETAIL_TITLE')

    # Committee link
    try:
        links = dom.xpath(self.xpath['SESSION_DETAIL_COMMITTEE_LINK'])
        for link in links:
            href = link.get('href')
            parsed = parse.search(
                self.urls['COMMITTEE_DETAIL_PARSE_PATTERN'], href)
            if parsed is not None:
                session.committee_id = parsed['committee_id']
    except Exception:
        logging.critical(
            'Cannot find link to committee detail page using SESSION_DETAIL_COMMITTEE_LINK_XPATH')
        raise TemplateError(
            'Cannot find link to committee detail page using SESSION_DETAIL_COMMITTEE_LINK_XPATH')

    # Session identifier, date, address etc
    tds = dom.xpath(self.xpath['SESSION_DETAIL_IDENTIFIER_TD'])
    if len(tds) == 0:
        logging.critical(
            'Cannot find table fields using SESSION_DETAIL_IDENTIFIER_TD_XPATH')
        raise TemplateError(
            'Cannot find table fields using SESSION_DETAIL_IDENTIFIER_TD_XPATH')
    for n in range(len(tds)):
        try:
            tdcontent = tds[n].text.strip()
            nextcontent = tds[n + 1].text.strip()
        except (AttributeError, IndexError):
            # empty cell or last cell: no label/value pair starts here
            continue
        if tdcontent == 'Sitzung:':
            session.identifier = nextcontent
        elif tdcontent == 'Gremium:':
            session.committee_name = nextcontent
        elif tdcontent == 'Datum:':
            datestring = nextcontent
            # A 'Zeit:' label/value pair may follow the date. The time has
            # to be assigned back to datestring (it used to be computed and
            # thrown away) and the cells must exist before being indexed.
            if (len(tds) > n + 3
                    and tds[n + 2].text == 'Zeit:'
                    and tds[n + 3].text is not None):
                datestring = datestring + ' ' + tds[n + 3].text.strip()
            session.date_start = datestring
        elif tdcontent == 'Raum:':
            session.address = " ".join(tds[n + 1].xpath('./text()'))
        elif tdcontent == 'Bezeichnung:':
            session.description = nextcontent
    if not hasattr(session, 'identifier'):
        logging.critical(
            'Cannot find session identifier using XPath SESSION_DETAIL_IDENTIFIER_TD')
        raise TemplateError(
            'Cannot find session identifier using XPath SESSION_DETAIL_IDENTIFIER_TD')

    # Agendaitems
    # IDs of attachments seen in agenda rows; only used for membership tests
    found_attachments = set()
    rows = dom.xpath(self.xpath['SESSION_DETAIL_AGENDA_ROWS'])
    if len(rows) == 0:
        logging.critical(
            'Cannot find agenda using XPath SESSION_DETAIL_AGENDA_ROWS')
        raise TemplateError(
            'Cannot find agenda using XPath SESSION_DETAIL_AGENDA_ROWS')
    agendaitems = {}
    agendaitem_id = None
    # flips to False once the "Nicht öffentlich" subheading has been passed
    public = True
    for row in rows:
        row_id = row.get('id')
        # rows without a class attribute are treated as having no classes
        row_classes = (row.get('class') or '').split(' ')
        fields = row.xpath('td')
        number_texts = fields[0].xpath('./text()')
        number = number_texts[0] if number_texts else None
        if row_id is not None:
            # Agendaitem main row
            agendaitem_id = row_id.rsplit('_', 1)[1]
            agendaitems[agendaitem_id] = {}
            agendaitems[agendaitem_id]['id'] = int(agendaitem_id)
            if number is not None:
                agendaitems[agendaitem_id]['number'] = number
            agendaitems[agendaitem_id]['subject'] = "; ".join(
                fields[1].xpath('./text()'))
            agendaitems[agendaitem_id]['public'] = public
            # submission links
            links = row.xpath(
                self.xpath['SESSION_DETAIL_AGENDA_ROWS_SUBMISSION_LINK'])
            submissions = []
            for link in links:
                href = link.get('href')
                if href is None:
                    continue
                parsed = parse.search(
                    self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
                if parsed is not None:
                    submission = Submission(
                        numeric_id=int(parsed['submission_id']),
                        identifier=link.text)
                    submissions.append(submission)
                    # Add submission to submission queue
                    if hasattr(self, 'submission_queue'):
                        self.submission_queue.add(
                            int(parsed['submission_id']))
            if len(submissions):
                agendaitems[agendaitem_id]['submissions'] = submissions
            # Note: we don't scrape agendaitem-related attachments for now,
            # based on the assumption that they are all found via submission
            # detail pages. All we do here is collect their attachment IDs
            # in found_attachments.
            for form in row.xpath('.//form'):
                for hidden_field in form.xpath('input'):
                    if hidden_field.get('name') != 'DT':
                        continue
                    found_attachments.add(hidden_field.get('value'))
        elif 'smc_tophz' in row_classes:
            # additional (optional) row for the current agendaitem
            label = fields[1].text
            value = fields[2].text
            if label is not None and value is not None:
                label = label.strip()
                value = value.strip()
                if label in ['Ergebnis:', 'Beschluss:']:
                    if value in self.config.RESULT_STRINGS:
                        agendaitems[agendaitem_id]['result'] = \
                            self.config.RESULT_STRINGS[value]
                    else:
                        logging.warning(
                            "String '%s' not found in configured RESULT_STRINGS",
                            value)
                        if self.options.verbose:
                            print("WARNING: String '%s' not found in RESULT_STRINGS\n"
                                  % value)
                        # fall back to the raw string from the page
                        agendaitems[agendaitem_id]['result'] = value
                elif label == 'Bemerkung:':
                    agendaitems[agendaitem_id]['result_note'] = value
                elif label == 'Abstimmung:':
                    agendaitems[agendaitem_id]['voting'] = value
                else:
                    logging.critical(
                        "Agendaitem info label '%s' is unknown", label)
                    raise ValueError(
                        'Agendaitem info label "%s" is unknown' % label)
        elif 'smcrowh' in row_classes:
            # Subheading (public / nonpublic part)
            if fields[0].text is not None and \
                    "Nicht öffentlich" in fields[0].text.encode('utf-8'):
                public = False
    session.agendaitems = agendaitems.values()

    # session-related attachments
    containers = dom.xpath(self.xpath['SESSION_DETAIL_ATTACHMENTS'])
    for container in containers:
        classes = container.get('class')
        if classes is None:
            continue
        classes = classes.split(' ')
        if self.xpath['SESSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME'] \
                not in classes:
            continue
        attachments = []
        for row in container.xpath('.//tr'):
            for form in row.xpath('.//form'):
                name = " ".join(row.xpath('./td/text()')).strip()
                for hidden_field in form.xpath('input'):
                    if hidden_field.get('name') != 'DT':
                        continue
                    attachment_id = hidden_field.get('value')
                    # make sure to add only those which aren't
                    # agendaitem-related (or already handled)
                    if attachment_id in found_attachments:
                        continue
                    attachment = Attachment(identifier=attachment_id,
                                            name=name)
                    # Traversing the whole mechanize response to find the
                    # form that has to be submitted for this attachment
                    for mform in mechanize_forms:
                        for control in mform.controls:
                            if (control.name == 'DT'
                                    and control.value == attachment_id):
                                attachment = self.get_attachment_file(
                                    attachment, mform)
                    attachments.append(attachment)
                    found_attachments.add(attachment_id)
        if len(attachments):
            session.attachments = attachments

    oid = self.db.save_session(session)
    if self.options.verbose:
        logging.info("Session %d stored with _id %s", session_id, oid)