Example #1
0
    def get_session(self, session_url=None, session_id=None):
        """
        Load session details for the given detail page URL or numeric ID.

        Fetches the session detail page, extracts the title, committee
        link, identifier/date/location fields, agenda items (with their
        linked submissions) and session-level attachments, and stores the
        result via ``self.db.save_session``.

        Parameters:
            session_url: URL of the session detail page. Only used when
                ``session_id`` is not given; the ID is then parsed from it
                with ``self.urls['SESSION_DETAIL_PARSE_PATTERN']``.
            session_id: Session ID. Takes precedence over ``session_url``;
                the URL is built from
                ``self.urls['SESSION_DETAIL_PRINT_PATTERN']``.

        Returns:
            None. Returns early, without saving anything, when the page
            shows a server error, a permission error or a "no data" notice.

        Raises:
            ValueError: if neither argument is given, if ``session_url``
                does not match the parse pattern, or if an agenda detail
                row carries an unknown label.
            TemplateError: if an expected page element cannot be located
                with the configured XPath expressions.
        """
        # Derive whichever of session_id / session_url was not supplied.
        if session_id is not None:
            session_url = self.urls['SESSION_DETAIL_PRINT_PATTERN'] % session_id
        elif session_url is not None:
            parsed = parse.search(self.urls['SESSION_DETAIL_PARSE_PATTERN'],
                                  session_url)
            if parsed is None:
                raise ValueError(
                    'Cannot extract session ID from URL %r' % (session_url,))
            session_id = parsed['session_id']
        else:
            raise ValueError('Either session_url or session_id is required')

        logging.info("Getting session %s from %s", session_id, session_url)

        session = Session(numeric_id=session_id)

        time.sleep(self.config.WAIT_TIME)
        response = self.user_agent.open(session_url)
        # Forms are parsed up front; they are needed further down to
        # download attachments by submitting the matching form.
        mechanize_forms = mechanize.ParseResponse(response,
                                                  backwards_compat=False)
        # seek(0) is necessary to reset the response pointer after parsing.
        response.seek(0)
        html = response.read()
        # NOTE(review): presumably meant to normalise non-breaking spaces
        # (or the '&nbsp;' entity) - confirm the first literal is intact.
        html = html.replace(' ', ' ')
        parser = etree.HTMLParser()
        dom = etree.parse(StringIO(html), parser)

        # Error pages are logged and skipped rather than raised.
        headings = dom.xpath('//h1')
        page_title = headings[0].text if headings else None
        if page_title is not None:
            if 'Fehlermeldung' in page_title:
                logging.info("Page %s cannot be accessed due to server error",
                             session_url)
                if self.options.verbose:
                    print("Page %s cannot be accessed due to server error" % session_url)
                return
            if 'Berechtigungsfehler' in page_title:
                logging.info("Page %s cannot be accessed due to permissions",
                             session_url)
                if self.options.verbose:
                    print("Page %s cannot be accessed due to permissions" % session_url)
                return
        notices = dom.xpath('//h3[@class="smc_h3"]')
        notice = notices[0].text if notices else None
        if notice is not None and 'Keine Daten gefunden' in notice:
            logging.info("Page %s does not contain any agenda items",
                         session_url)
            if self.options.verbose:
                print("Page %s does not contain agenda items" % session_url)
            return

        session.original_url = session_url

        # Session title
        try:
            session.title = dom.xpath(
                self.xpath['SESSION_DETAIL_TITLE'])[0].text
        except Exception:
            logging.critical(
                'Cannot find session title element using XPath SESSION_DETAIL_TITLE'
            )
            raise TemplateError(
                'Cannot find session title element using XPath SESSION_DETAIL_TITLE'
            )

        # Committee link; when several links match, the last one wins.
        try:
            links = dom.xpath(self.xpath['SESSION_DETAIL_COMMITTEE_LINK'])
            for link in links:
                href = link.get('href')
                parsed = parse.search(
                    self.urls['COMMITTEE_DETAIL_PARSE_PATTERN'], href)
                if parsed is not None:
                    session.committee_id = parsed['committee_id']
        except Exception:
            logging.critical(
                'Cannot find link to committee detail page using SESSION_DETAIL_COMMITTEE_LINK_XPATH'
            )
            raise TemplateError(
                'Cannot find link to committee detail page using SESSION_DETAIL_COMMITTEE_LINK_XPATH'
            )

        # Session identifier, date, address etc. The cells form label/value
        # pairs: a cell holding a known label is followed by its value cell.
        tds = dom.xpath(self.xpath['SESSION_DETAIL_IDENTIFIER_TD'])
        if len(tds) == 0:
            logging.critical(
                'Cannot find table fields using SESSION_DETAIL_IDENTIFIER_TD_XPATH'
            )
            raise TemplateError(
                'Cannot find table fields using SESSION_DETAIL_IDENTIFIER_TD_XPATH'
            )
        for n, td in enumerate(tds):
            try:
                tdcontent = td.text.strip()
                nextcontent = tds[n + 1].text.strip()
            except (AttributeError, IndexError):
                # Cell without text, or no following cell: not a label.
                continue
            if tdcontent == 'Sitzung:':
                session.identifier = nextcontent
            elif tdcontent == 'Gremium:':
                session.committee_name = nextcontent
            elif tdcontent == 'Datum:':
                datestring = nextcontent
                # Append the time when a 'Zeit:' label and a non-empty
                # value cell directly follow the date value.
                if n + 3 < len(tds):
                    time_label = tds[n + 2].text
                    time_value = tds[n + 3].text
                    if (time_label is not None and
                            time_label.strip() == 'Zeit:' and
                            time_value is not None):
                        datestring = datestring + ' ' + time_value.strip()
                session.date_start = datestring
            elif tdcontent == 'Raum:':
                session.address = " ".join(tds[n + 1].xpath('./text()'))
            elif tdcontent == 'Bezeichnung:':
                session.description = nextcontent
        if not hasattr(session, 'identifier'):
            logging.critical(
                'Cannot find session identifier using XPath SESSION_DETAIL_IDENTIFIER_TD'
            )
            raise TemplateError(
                'Cannot find session identifier using XPath SESSION_DETAIL_IDENTIFIER_TD'
            )

        # Agenda items
        # IDs of attachments seen in agenda rows; only used for membership
        # tests when collecting session-level attachments below.
        found_attachments = set()
        rows = dom.xpath(self.xpath['SESSION_DETAIL_AGENDA_ROWS'])
        if len(rows) == 0:
            logging.critical(
                'Cannot find agenda using XPath SESSION_DETAIL_AGENDA_ROWS')
            raise TemplateError(
                'Cannot find agenda using XPath SESSION_DETAIL_AGENDA_ROWS')
        agendaitems = {}
        agendaitem_id = None
        # Items are public until a "Nicht öffentlich" subheading is seen.
        public = True
        for row in rows:
            row_id = row.get('id')
            # A row without a class attribute yields an empty class list.
            row_classes = (row.get('class') or '').split(' ')
            fields = row.xpath('td')
            number_texts = fields[0].xpath('./text()')
            number = number_texts[0] if number_texts else None
            if row_id is not None:
                # Agenda item main row; the item ID is the part of the
                # row's id attribute after the last underscore.
                agendaitem_id = row_id.rsplit('_', 1)[1]
                item = {'id': int(agendaitem_id)}
                agendaitems[agendaitem_id] = item
                if number is not None:
                    item['number'] = number
                item['subject'] = "; ".join(fields[1].xpath('./text()'))
                item['public'] = public
                # submission links
                links = row.xpath(
                    self.xpath['SESSION_DETAIL_AGENDA_ROWS_SUBMISSION_LINK'])
                submissions = []
                for link in links:
                    href = link.get('href')
                    if href is None:
                        continue
                    parsed = parse.search(
                        self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
                    if parsed is not None:
                        submission_id = int(parsed['submission_id'])
                        submissions.append(
                            Submission(numeric_id=submission_id,
                                       identifier=link.text))
                        # Add submission to submission queue
                        if hasattr(self, 'submission_queue'):
                            self.submission_queue.add(submission_id)
                if len(submissions):
                    item['submissions'] = submissions
                # Note: we don't scrape agendaitem-related attachments for
                # now, based on the assumption that they are all found via
                # submission detail pages. All we do here is collect their
                # attachment IDs in found_attachments.
                for form in row.xpath('.//form'):
                    for hidden_field in form.xpath('input'):
                        if hidden_field.get('name') != 'DT':
                            continue
                        found_attachments.add(hidden_field.get('value'))

            elif 'smc_tophz' in row_classes:
                # Additional (optional) detail row for the current item.
                label = fields[1].text
                value = fields[2].text
                if label is not None and value is not None:
                    label = label.strip()
                    value = value.strip()
                    if label in ['Ergebnis:', 'Beschluss:']:
                        if value in self.config.RESULT_STRINGS:
                            agendaitems[agendaitem_id][
                                'result'] = self.config.RESULT_STRINGS[value]
                        else:
                            logging.warning(
                                "String '%s' not found in configured RESULT_STRINGS",
                                value)
                            if self.options.verbose:
                                print("WARNING: String '%s' not found in RESULT_STRINGS\n" % value)
                            # Fall back to the raw, unmapped string.
                            agendaitems[agendaitem_id]['result'] = value
                    elif label == 'Bemerkung:':
                        agendaitems[agendaitem_id]['result_note'] = value
                    elif label == 'Abstimmung:':
                        agendaitems[agendaitem_id]['voting'] = value
                    else:
                        logging.critical(
                            "Agendaitem info label '%s' is unknown", label)
                        raise ValueError(
                            'Agendaitem info label "%s" is unknown' % label)

            elif 'smcrowh' in row_classes:
                # Subheading (public / nonpublic part); all following
                # items are flagged as non-public.
                if fields[
                        0].text is not None and "Nicht öffentlich" in fields[
                            0].text.encode('utf-8'):
                    public = False
        session.agendaitems = list(agendaitems.values())

        # session-related attachments
        containers = dom.xpath(self.xpath['SESSION_DETAIL_ATTACHMENTS'])
        for container in containers:
            classes = container.get('class')
            if classes is None:
                continue
            classes = classes.split(' ')
            if self.xpath[
                    'SESSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME'] not in classes:
                continue
            attachments = []
            for row in container.xpath('.//tr'):
                for form in row.xpath('.//form'):
                    name = " ".join(row.xpath('./td/text()')).strip()
                    for hidden_field in form.xpath('input'):
                        if hidden_field.get('name') != 'DT':
                            continue
                        attachment_id = hidden_field.get('value')
                        # Only add those which aren't agendaitem-related
                        # and haven't been collected already.
                        if attachment_id in found_attachments:
                            continue
                        attachment = Attachment(identifier=attachment_id,
                                                name=name)
                        # Find the mechanize form whose DT control carries
                        # this attachment ID and submit it for download.
                        for mform in mechanize_forms:
                            for control in mform.controls:
                                if control.name == 'DT' and control.value == attachment_id:
                                    attachment = self.get_attachment_file(
                                        attachment, mform)
                        attachments.append(attachment)
                        found_attachments.add(attachment_id)
            if len(attachments):
                session.attachments = attachments

        oid = self.db.save_session(session)
        if self.options.verbose:
            logging.info("Session %s stored with _id %s", session_id, oid)
Example #2
0
    def get_session(self, session_url=None, session_id=None):
        """
        Load session details for the given detail page URL or numeric ID.

        Fetches the session detail page, extracts the title, committee
        link, identifier/date/location fields, agenda items (with their
        linked submissions) and session-level attachments, and stores the
        result via ``self.db.save_session``.

        Parameters:
            session_url: URL of the session detail page. Only used when
                ``session_id`` is not given; the ID is then parsed from it
                with ``self.urls['SESSION_DETAIL_PARSE_PATTERN']``.
            session_id: Session ID. Takes precedence over ``session_url``;
                the URL is built from
                ``self.urls['SESSION_DETAIL_PRINT_PATTERN']``.

        Returns:
            None. Returns early, without saving anything, when the page
            shows a server error, a permission error or a "no data" notice.

        Raises:
            ValueError: if neither argument is given, if ``session_url``
                does not match the parse pattern, or if an agenda detail
                row carries an unknown label.
            TemplateError: if an expected page element cannot be located
                with the configured XPath expressions.
        """
        # Derive whichever of session_id / session_url was not supplied.
        if session_id is not None:
            session_url = self.urls['SESSION_DETAIL_PRINT_PATTERN'] % session_id
        elif session_url is not None:
            parsed = parse.search(self.urls['SESSION_DETAIL_PARSE_PATTERN'], session_url)
            if parsed is None:
                raise ValueError('Cannot extract session ID from URL %r' % (session_url,))
            session_id = parsed['session_id']
        else:
            raise ValueError('Either session_url or session_id is required')

        logging.info("Getting session %s from %s", session_id, session_url)

        session = Session(numeric_id=session_id)

        time.sleep(self.config.WAIT_TIME)
        response = self.user_agent.open(session_url)
        # Forms are parsed up front; they are needed further down to
        # download attachments by submitting the matching form.
        mechanize_forms = mechanize.ParseResponse(response, backwards_compat=False)
        # seek(0) is necessary to reset the response pointer after parsing.
        response.seek(0)
        html = response.read()
        # NOTE(review): presumably meant to normalise non-breaking spaces
        # (or the '&nbsp;' entity) - confirm the first literal is intact.
        html = html.replace(' ', ' ')
        parser = etree.HTMLParser()
        dom = etree.parse(StringIO(html), parser)

        # Error pages are logged and skipped rather than raised.
        headings = dom.xpath('//h1')
        page_title = headings[0].text if headings else None
        if page_title is not None:
            if 'Fehlermeldung' in page_title:
                logging.info("Page %s cannot be accessed due to server error", session_url)
                if self.options.verbose:
                    print("Page %s cannot be accessed due to server error" % session_url)
                return
            if 'Berechtigungsfehler' in page_title:
                logging.info("Page %s cannot be accessed due to permissions", session_url)
                if self.options.verbose:
                    print("Page %s cannot be accessed due to permissions" % session_url)
                return
        notices = dom.xpath('//h3[@class="smc_h3"]')
        notice = notices[0].text if notices else None
        if notice is not None and 'Keine Daten gefunden' in notice:
            logging.info("Page %s does not contain any agenda items", session_url)
            if self.options.verbose:
                print("Page %s does not contain agenda items" % session_url)
            return

        session.original_url = session_url

        # Session title
        try:
            session.title = dom.xpath(self.xpath['SESSION_DETAIL_TITLE'])[0].text
        except Exception:
            logging.critical('Cannot find session title element using XPath SESSION_DETAIL_TITLE')
            raise TemplateError('Cannot find session title element using XPath SESSION_DETAIL_TITLE')

        # Committee link; when several links match, the last one wins.
        try:
            links = dom.xpath(self.xpath['SESSION_DETAIL_COMMITTEE_LINK'])
            for link in links:
                href = link.get('href')
                parsed = parse.search(self.urls['COMMITTEE_DETAIL_PARSE_PATTERN'], href)
                if parsed is not None:
                    session.committee_id = parsed['committee_id']
        except Exception:
            logging.critical('Cannot find link to committee detail page using SESSION_DETAIL_COMMITTEE_LINK_XPATH')
            raise TemplateError('Cannot find link to committee detail page using SESSION_DETAIL_COMMITTEE_LINK_XPATH')

        # Session identifier, date, address etc. The cells form label/value
        # pairs: a cell holding a known label is followed by its value cell.
        tds = dom.xpath(self.xpath['SESSION_DETAIL_IDENTIFIER_TD'])
        if len(tds) == 0:
            logging.critical('Cannot find table fields using SESSION_DETAIL_IDENTIFIER_TD_XPATH')
            raise TemplateError('Cannot find table fields using SESSION_DETAIL_IDENTIFIER_TD_XPATH')
        for n, td in enumerate(tds):
            try:
                tdcontent = td.text.strip()
                nextcontent = tds[n + 1].text.strip()
            except (AttributeError, IndexError):
                # Cell without text, or no following cell: not a label.
                continue
            if tdcontent == 'Sitzung:':
                session.identifier = nextcontent
            elif tdcontent == 'Gremium:':
                session.committee_name = nextcontent
            elif tdcontent == 'Datum:':
                datestring = nextcontent
                # Append the time when a 'Zeit:' label and a non-empty
                # value cell directly follow the date value.
                if n + 3 < len(tds):
                    time_label = tds[n + 2].text
                    time_value = tds[n + 3].text
                    if time_label is not None and time_label.strip() == 'Zeit:' and time_value is not None:
                        datestring = datestring + ' ' + time_value.strip()
                session.date_start = datestring
            elif tdcontent == 'Raum:':
                session.address = " ".join(tds[n + 1].xpath('./text()'))
            elif tdcontent == 'Bezeichnung:':
                session.description = nextcontent
        if not hasattr(session, 'identifier'):
            logging.critical('Cannot find session identifier using XPath SESSION_DETAIL_IDENTIFIER_TD')
            raise TemplateError('Cannot find session identifier using XPath SESSION_DETAIL_IDENTIFIER_TD')

        # Agenda items
        # IDs of attachments seen in agenda rows; only used for membership
        # tests when collecting session-level attachments below.
        found_attachments = set()
        rows = dom.xpath(self.xpath['SESSION_DETAIL_AGENDA_ROWS'])
        if len(rows) == 0:
            logging.critical('Cannot find agenda using XPath SESSION_DETAIL_AGENDA_ROWS')
            raise TemplateError('Cannot find agenda using XPath SESSION_DETAIL_AGENDA_ROWS')
        agendaitems = {}
        agendaitem_id = None
        # Items are public until a "Nicht öffentlich" subheading is seen.
        public = True
        for row in rows:
            row_id = row.get('id')
            # A row without a class attribute yields an empty class list.
            row_classes = (row.get('class') or '').split(' ')
            fields = row.xpath('td')
            number_texts = fields[0].xpath('./text()')
            number = number_texts[0] if number_texts else None
            if row_id is not None:
                # Agenda item main row; the item ID is the part of the
                # row's id attribute after the last underscore.
                agendaitem_id = row_id.rsplit('_', 1)[1]
                item = {'id': int(agendaitem_id)}
                agendaitems[agendaitem_id] = item
                if number is not None:
                    item['number'] = number
                item['subject'] = "; ".join(fields[1].xpath('./text()'))
                item['public'] = public
                # submission links
                links = row.xpath(self.xpath['SESSION_DETAIL_AGENDA_ROWS_SUBMISSION_LINK'])
                submissions = []
                for link in links:
                    href = link.get('href')
                    if href is None:
                        continue
                    parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
                    if parsed is not None:
                        submission_id = int(parsed['submission_id'])
                        submissions.append(Submission(numeric_id=submission_id, identifier=link.text))
                        # Add submission to submission queue
                        if hasattr(self, 'submission_queue'):
                            self.submission_queue.add(submission_id)
                if len(submissions):
                    item['submissions'] = submissions
                # Note: we don't scrape agendaitem-related attachments for
                # now, based on the assumption that they are all found via
                # submission detail pages. All we do here is collect their
                # attachment IDs in found_attachments.
                for form in row.xpath('.//form'):
                    for hidden_field in form.xpath('input'):
                        if hidden_field.get('name') != 'DT':
                            continue
                        found_attachments.add(hidden_field.get('value'))

            elif 'smc_tophz' in row_classes:
                # Additional (optional) detail row for the current item.
                label = fields[1].text
                value = fields[2].text
                if label is not None and value is not None:
                    label = label.strip()
                    value = value.strip()
                    if label in ['Ergebnis:', 'Beschluss:']:
                        if value in self.config.RESULT_STRINGS:
                            agendaitems[agendaitem_id]['result'] = self.config.RESULT_STRINGS[value]
                        else:
                            logging.warning("String '%s' not found in configured RESULT_STRINGS", value)
                            if self.options.verbose:
                                print("WARNING: String '%s' not found in RESULT_STRINGS\n" % value)
                            # Fall back to the raw, unmapped string.
                            agendaitems[agendaitem_id]['result'] = value
                    elif label == 'Bemerkung:':
                        agendaitems[agendaitem_id]['result_note'] = value
                    elif label == 'Abstimmung:':
                        agendaitems[agendaitem_id]['voting'] = value
                    else:
                        logging.critical("Agendaitem info label '%s' is unknown", label)
                        raise ValueError('Agendaitem info label "%s" is unknown' % label)

            elif 'smcrowh' in row_classes:
                # Subheading (public / nonpublic part); all following
                # items are flagged as non-public.
                if fields[0].text is not None and "Nicht öffentlich" in fields[0].text.encode('utf-8'):
                    public = False
        session.agendaitems = list(agendaitems.values())

        # session-related attachments
        containers = dom.xpath(self.xpath['SESSION_DETAIL_ATTACHMENTS'])
        for container in containers:
            classes = container.get('class')
            if classes is None:
                continue
            classes = classes.split(' ')
            if self.xpath['SESSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME'] not in classes:
                continue
            attachments = []
            for row in container.xpath('.//tr'):
                for form in row.xpath('.//form'):
                    name = " ".join(row.xpath('./td/text()')).strip()
                    for hidden_field in form.xpath('input'):
                        if hidden_field.get('name') != 'DT':
                            continue
                        attachment_id = hidden_field.get('value')
                        # Only add those which aren't agendaitem-related
                        # and haven't been collected already.
                        if attachment_id in found_attachments:
                            continue
                        attachment = Attachment(
                            identifier=attachment_id,
                            name=name
                        )
                        # Find the mechanize form whose DT control carries
                        # this attachment ID and submit it for download.
                        for mform in mechanize_forms:
                            for control in mform.controls:
                                if control.name == 'DT' and control.value == attachment_id:
                                    attachment = self.get_attachment_file(attachment, mform)
                        attachments.append(attachment)
                        found_attachments.add(attachment_id)
            if len(attachments):
                session.attachments = attachments

        oid = self.db.save_session(session)
        if self.options.verbose:
            logging.info("Session %s stored with _id %s", session_id, oid)