Python Session.description Examples

Programming Language: Python

Namespace/Package Name: model.session

Class/Type: Session

Method/Function: description

Examples at hotexamples.com: 2

Python Session.description - 2 examples found. These are the top rated real world Python examples of model.session.Session.description extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

Session(17)

from_db_row(3)

query(2)

put(2)

objects(2)

from_binary(1)

subject(1)

save_to_db(1)

proposal(1)

original_url(1)

load_from_db(1)

is_valid(1)

is_authorized(1)

identifier(1)

finish_time(1)

add(1)

find_one_by_text_token(1)

elapse_time(1)

dni(1)

dict_to_vault(1)

description(1)

date_start(1)

committee_name(1)

committee_id(1)

commit(1)

comments(1)

attachments(1)

agendaitems(1)

address(1)

title(1)

Example #1

Show file

    def get_session(self, session_url=None, session_id=None):
        """
        Load session details for the given detail page URL or numeric ID
        """
        # Read either session_id or session_url from the opposite
        if session_id is not None:
            session_url = self.urls['SESSION_DETAIL_PRINT_PATTERN'] % session_id
        elif session_url is not None:
            parsed = parse.search(self.urls['SESSION_DETAIL_PARSE_PATTERN'],
                                  session_url)
            session_id = parsed['session_id']

        logging.info("Getting session %d from %s", session_id, session_url)

        session = Session(numeric_id=session_id)

        time.sleep(self.config.WAIT_TIME)
        response = self.user_agent.open(session_url)
        # forms for later attachment download
        mechanize_forms = mechanize.ParseResponse(response,
                                                  backwards_compat=False)
        # seek(0) is necessary to reset response pointer.
        response.seek(0)
        html = response.read()
        html = html.replace('&nbsp;', ' ')
        parser = etree.HTMLParser()
        dom = etree.parse(StringIO(html), parser)

        # check for page errors
        try:
            page_title = dom.xpath('//h1')[0].text
            if 'Fehlermeldung' in page_title:
                logging.info("Page %s cannot be accessed due to server error",
                             session_url)
                if self.options.verbose:
                    print "Page %s cannot be accessed due to server error" % session_url
                return
            if 'Berechtigungsfehler' in page_title:
                logging.info("Page %s cannot be accessed due to permissions",
                             session_url)
                if self.options.verbose:
                    print "Page %s cannot be accessed due to permissions" % session_url
                return
        except:
            pass
        try:
            error_h3 = dom.xpath('//h3[@class="smc_h3"]')[0].text.strip()
            if 'Keine Daten gefunden' in error_h3:
                logging.info("Page %s does not contain any agenda items",
                             session_url)
                if self.options.verbose:
                    print "Page %s does not contain agenda items" % session_url
                return
        except:
            pass

        session.original_url = session_url

        # Session title
        try:
            session.title = dom.xpath(
                self.xpath['SESSION_DETAIL_TITLE'])[0].text
        except:
            logging.critical(
                'Cannot find session title element using XPath SESSION_DETAIL_TITLE'
            )
            raise TemplateError(
                'Cannot find session title element using XPath SESSION_DETAIL_TITLE'
            )

        # Committe link
        try:
            links = dom.xpath(self.xpath['SESSION_DETAIL_COMMITTEE_LINK'])
            for link in links:
                href = link.get('href')
                parsed = parse.search(
                    self.urls['COMMITTEE_DETAIL_PARSE_PATTERN'], href)
                if parsed is not None:
                    session.committee_id = parsed['committee_id']
        except:
            logging.critical(
                'Cannot find link to committee detail page using SESSION_DETAIL_COMMITTEE_LINK_XPATH'
            )
            raise TemplateError(
                'Cannot find link to committee detail page using SESSION_DETAIL_COMMITTEE_LINK_XPATH'
            )

        # Session identifier, date, address etc
        tds = dom.xpath(self.xpath['SESSION_DETAIL_IDENTIFIER_TD'])
        if len(tds) == 0:
            logging.critical(
                'Cannot find table fields using SESSION_DETAIL_IDENTIFIER_TD_XPATH'
            )
            raise TemplateError(
                'Cannot find table fields using SESSION_DETAIL_IDENTIFIER_TD_XPATH'
            )
        else:
            for n in range(0, len(tds)):
                try:
                    tdcontent = tds[n].text.strip()
                    nextcontent = tds[n + 1].text.strip()
                except:
                    continue
                if tdcontent == 'Sitzung:':
                    session.identifier = nextcontent
                elif tdcontent == 'Gremium:':
                    session.committee_name = nextcontent
                elif tdcontent == 'Datum:':
                    datestring = nextcontent
                    if tds[n + 2].text == 'Zeit:':
                        if (n + 3) in tds and tds[n + 3].text is not None:
                            datestring + ' ' + tds[n + 3].text
                    session.date_start = datestring
                elif tdcontent == 'Raum:':
                    session.address = " ".join(tds[n + 1].xpath('./text()'))
                elif tdcontent == 'Bezeichnung:':
                    session.description = nextcontent
            if not hasattr(session, 'identifier'):
                logging.critical(
                    'Cannot find session identifier using XPath SESSION_DETAIL_IDENTIFIER_TD'
                )
                raise TemplateError(
                    'Cannot find session identifier using XPath SESSION_DETAIL_IDENTIFIER_TD'
                )

        # Agendaitems
        found_attachments = []
        rows = dom.xpath(self.xpath['SESSION_DETAIL_AGENDA_ROWS'])
        if len(rows) == 0:
            logging.critical(
                'Cannot find agenda using XPath SESSION_DETAIL_AGENDA_ROWS')
            raise TemplateError(
                'Cannot find agenda using XPath SESSION_DETAIL_AGENDA_ROWS')
        else:
            agendaitems = {}
            agendaitem_id = None
            public = True
            for row in rows:
                row_id = row.get('id')
                row_classes = row.get('class').split(' ')
                fields = row.xpath('td')
                number = fields[0].xpath('./text()')
                if len(number) > 0:
                    number = number[0]
                if number == []:
                    number = None
                #print "number: %s" % number
                if row_id is not None:
                    # Agendaitem main row
                    agendaitem_id = row_id.rsplit('_', 1)[1]
                    agendaitems[agendaitem_id] = {}
                    agendaitems[agendaitem_id]['id'] = int(agendaitem_id)
                    if number is not None:
                        agendaitems[agendaitem_id]['number'] = number
                    agendaitems[agendaitem_id]['subject'] = "; ".join(
                        fields[1].xpath('./text()'))
                    agendaitems[agendaitem_id]['public'] = public
                    # submission links
                    links = row.xpath(
                        self.
                        xpath['SESSION_DETAIL_AGENDA_ROWS_SUBMISSION_LINK'])
                    submissions = []
                    for link in links:
                        href = link.get('href')
                        if href is None:
                            continue
                        parsed = parse.search(
                            self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
                        if parsed is not None:
                            submission = Submission(numeric_id=int(
                                parsed['submission_id']),
                                                    identifier=link.text)
                            submissions.append(submission)
                            # Add submission to submission queue
                            if hasattr(self, 'submission_queue'):
                                self.submission_queue.add(
                                    int(parsed['submission_id']))
                    if len(submissions):
                        agendaitems[agendaitem_id]['submissions'] = submissions
                    """
                    Note: we don't scrape agendaitem-related attachments for now,
                    based on the assumption that they are all found via submission
                    detail pages. All we do here is get a list of attachment IDs
                    in found_attachments
                    """
                    #attachments = []
                    forms = row.xpath('.//form')
                    for form in forms:
                        for hidden_field in form.xpath('input'):
                            if hidden_field.get('name') != 'DT':
                                continue
                            attachment_id = hidden_field.get('value')
                            #attachments.append(attachment_id)
                            found_attachments.append(attachment_id)
                    #if len(attachments):
                    #    agendaitems[agendaitem_id]['attachments'] = attachments

                elif 'smc_tophz' in row_classes:
                    # additional (optional row for agendaitem)
                    label = fields[1].text
                    value = fields[2].text
                    if label is not None and value is not None:
                        label = label.strip()
                        value = value.strip()
                        #print (label, value)
                        if label in ['Ergebnis:', 'Beschluss:']:
                            if value in self.config.RESULT_STRINGS:
                                agendaitems[agendaitem_id][
                                    'result'] = self.config.RESULT_STRINGS[
                                        value]
                            else:
                                logging.warn(
                                    "String '%s' not found in configured RESULT_STRINGS",
                                    value)
                                if self.options.verbose:
                                    print "WARNING: String '%s' not found in RESULT_STRINGS\n" % value
                                agendaitems[agendaitem_id]['result'] = value
                        elif label == 'Bemerkung:':
                            agendaitems[agendaitem_id]['result_note'] = value
                        elif label == 'Abstimmung:':
                            agendaitems[agendaitem_id]['voting'] = value
                        else:
                            logging.critical(
                                "Agendaitem info label '%s' is unknown", label)
                            raise ValueError(
                                'Agendaitem info label "%s" is unknown' %
                                label)

                elif 'smcrowh' in row_classes:
                    # Subheading (public / nonpublic part)
                    if fields[
                            0].text is not None and "Nicht öffentlich" in fields[
                                0].text.encode('utf-8'):
                        public = False
            #print json.dumps(agendaitems, indent=2)
            session.agendaitems = agendaitems.values()

        # session-related attachments
        containers = dom.xpath(self.xpath['SESSION_DETAIL_ATTACHMENTS'])
        for container in containers:
            classes = container.get('class')
            if classes is None:
                continue
            classes = classes.split(' ')
            if self.xpath[
                    'SESSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME'] not in classes:
                continue
            attachments = []
            rows = container.xpath('.//tr')
            for row in rows:
                forms = row.xpath('.//form')
                for form in forms:
                    #print "Form: ", form
                    name = " ".join(row.xpath('./td/text()')).strip()
                    for hidden_field in form.xpath('input'):
                        if hidden_field.get('name') != 'DT':
                            continue
                        attachment_id = hidden_field.get('value')
                        # make sure to add only those which aren't agendaitem-related
                        if attachment_id not in found_attachments:
                            attachment = Attachment(identifier=attachment_id,
                                                    name=name)
                            # Traversing the whole mechanize response to submit this form
                            for mform in mechanize_forms:
                                #print "Form found: '%s'" % mform
                                for control in mform.controls:
                                    if control.name == 'DT' and control.value == attachment_id:
                                        #print "Found matching form: ", control.name, control.value
                                        attachment = self.get_attachment_file(
                                            attachment, mform)
                            attachments.append(attachment)
                            found_attachments.append(attachment_id)
            if len(attachments):
                session.attachments = attachments

        oid = self.db.save_session(session)
        if self.options.verbose:
            logging.info("Session %d stored with _id %s", session_id, oid)

Example #2

Show file

File: scraper.py Project: CodeforChemnitz/scrape-a-ris

    def get_session(self, session_url=None, session_id=None):
        """
        Load session details for the given detail page URL or numeric ID
        """
        # Read either session_id or session_url from the opposite
        if session_id is not None:
            session_url = self.urls['SESSION_DETAIL_PRINT_PATTERN'] % session_id
        elif session_url is not None:
            parsed = parse.search(self.urls['SESSION_DETAIL_PARSE_PATTERN'], session_url)
            session_id = parsed['session_id']

        logging.info("Getting session %d from %s", session_id, session_url)

        session = Session(numeric_id=session_id)

        time.sleep(self.config.WAIT_TIME)
        response = self.user_agent.open(session_url)
        # forms for later attachment download
        mechanize_forms = mechanize.ParseResponse(response, backwards_compat=False)
        # seek(0) is necessary to reset response pointer.
        response.seek(0)
        html = response.read()
        html = html.replace('&nbsp;', ' ')
        parser = etree.HTMLParser()
        dom = etree.parse(StringIO(html), parser)

        # check for page errors
        try:
            page_title = dom.xpath('//h1')[0].text
            if 'Fehlermeldung' in page_title:
                logging.info("Page %s cannot be accessed due to server error", session_url)
                if self.options.verbose:
                    print "Page %s cannot be accessed due to server error" % session_url
                return
            if 'Berechtigungsfehler' in page_title:
                logging.info("Page %s cannot be accessed due to permissions", session_url)
                if self.options.verbose:
                    print "Page %s cannot be accessed due to permissions" % session_url
                return
        except:
            pass
        try:
            error_h3 = dom.xpath('//h3[@class="smc_h3"]')[0].text.strip()
            if 'Keine Daten gefunden' in error_h3:
                logging.info("Page %s does not contain any agenda items", session_url)
                if self.options.verbose:
                    print "Page %s does not contain agenda items" % session_url
                return
        except:
            pass

        session.original_url = session_url

        # Session title
        try:
            session.title = dom.xpath(self.xpath['SESSION_DETAIL_TITLE'])[0].text
        except:
            logging.critical('Cannot find session title element using XPath SESSION_DETAIL_TITLE')
            raise TemplateError('Cannot find session title element using XPath SESSION_DETAIL_TITLE')

        # Committe link
        try:
            links = dom.xpath(self.xpath['SESSION_DETAIL_COMMITTEE_LINK'])
            for link in links:
                href = link.get('href')
                parsed = parse.search(self.urls['COMMITTEE_DETAIL_PARSE_PATTERN'], href)
                if parsed is not None:
                    session.committee_id = parsed['committee_id']
        except:
            logging.critical('Cannot find link to committee detail page using SESSION_DETAIL_COMMITTEE_LINK_XPATH')
            raise TemplateError('Cannot find link to committee detail page using SESSION_DETAIL_COMMITTEE_LINK_XPATH')

        # Session identifier, date, address etc
        tds = dom.xpath(self.xpath['SESSION_DETAIL_IDENTIFIER_TD'])
        if len(tds) == 0:
            logging.critical('Cannot find table fields using SESSION_DETAIL_IDENTIFIER_TD_XPATH')
            raise TemplateError('Cannot find table fields using SESSION_DETAIL_IDENTIFIER_TD_XPATH')
        else:
            for n in range(0, len(tds)):
                try:
                    tdcontent = tds[n].text.strip()
                    nextcontent = tds[n + 1].text.strip()
                except:
                    continue
                if tdcontent == 'Sitzung:':
                    session.identifier = nextcontent
                elif tdcontent == 'Gremium:':
                    session.committee_name = nextcontent
                elif tdcontent == 'Datum:':
                    datestring = nextcontent
                    if tds[n + 2].text == 'Zeit:':
                        if (n + 3) in tds and tds[n + 3].text is not None:
                            datestring + ' ' + tds[n + 3].text
                    session.date_start = datestring
                elif tdcontent == 'Raum:':
                    session.address = " ".join(tds[n + 1].xpath('./text()'))
                elif tdcontent == 'Bezeichnung:':
                    session.description = nextcontent
            if not hasattr(session, 'identifier'):
                logging.critical('Cannot find session identifier using XPath SESSION_DETAIL_IDENTIFIER_TD')
                raise TemplateError('Cannot find session identifier using XPath SESSION_DETAIL_IDENTIFIER_TD')

        # Agendaitems
        found_attachments = []
        rows = dom.xpath(self.xpath['SESSION_DETAIL_AGENDA_ROWS'])
        if len(rows) == 0:
            logging.critical('Cannot find agenda using XPath SESSION_DETAIL_AGENDA_ROWS')
            raise TemplateError('Cannot find agenda using XPath SESSION_DETAIL_AGENDA_ROWS')
        else:
            agendaitems = {}
            agendaitem_id = None
            public = True
            for row in rows:
                row_id = row.get('id')
                row_classes = row.get('class').split(' ')
                fields = row.xpath('td')
                number = fields[0].xpath('./text()')
                if len(number) > 0:
                    number = number[0]
                if number == []:
                    number = None
                #print "number: %s" % number
                if row_id is not None:
                    # Agendaitem main row
                    agendaitem_id = row_id.rsplit('_', 1)[1]
                    agendaitems[agendaitem_id] = {}
                    agendaitems[agendaitem_id]['id'] = int(agendaitem_id)
                    if number is not None:
                        agendaitems[agendaitem_id]['number'] = number
                    agendaitems[agendaitem_id]['subject'] = "; ".join(fields[1].xpath('./text()'))
                    agendaitems[agendaitem_id]['public'] = public
                    # submission links
                    links = row.xpath(self.xpath['SESSION_DETAIL_AGENDA_ROWS_SUBMISSION_LINK'])
                    submissions = []
                    for link in links:
                        href = link.get('href')
                        if href is None:
                            continue
                        parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
                        if parsed is not None:
                            submission = Submission(numeric_id=int(parsed['submission_id']),
                                                    identifier=link.text)
                            submissions.append(submission)
                            # Add submission to submission queue
                            if hasattr(self, 'submission_queue'):
                                self.submission_queue.add(int(parsed['submission_id']))
                    if len(submissions):
                        agendaitems[agendaitem_id]['submissions'] = submissions
                    """
                    Note: we don't scrape agendaitem-related attachments for now,
                    based on the assumption that they are all found via submission
                    detail pages. All we do here is get a list of attachment IDs
                    in found_attachments
                    """
                    #attachments = []
                    forms = row.xpath('.//form')
                    for form in forms:
                        for hidden_field in form.xpath('input'):
                            if hidden_field.get('name') != 'DT':
                                continue
                            attachment_id = hidden_field.get('value')
                            #attachments.append(attachment_id)
                            found_attachments.append(attachment_id)
                    #if len(attachments):
                    #    agendaitems[agendaitem_id]['attachments'] = attachments

                elif 'smc_tophz' in row_classes:
                    # additional (optional row for agendaitem)
                    label = fields[1].text
                    value = fields[2].text
                    if label is not None and value is not None:
                        label = label.strip()
                        value = value.strip()
                        #print (label, value)
                        if label in ['Ergebnis:', 'Beschluss:']:
                            if value in self.config.RESULT_STRINGS:
                                agendaitems[agendaitem_id]['result'] = self.config.RESULT_STRINGS[value]
                            else:
                                logging.warn("String '%s' not found in configured RESULT_STRINGS", value)
                                if self.options.verbose:
                                    print "WARNING: String '%s' not found in RESULT_STRINGS\n" % value
                                agendaitems[agendaitem_id]['result'] = value
                        elif label == 'Bemerkung:':
                            agendaitems[agendaitem_id]['result_note'] = value
                        elif label == 'Abstimmung:':
                            agendaitems[agendaitem_id]['voting'] = value
                        else:
                            logging.critical("Agendaitem info label '%s' is unknown", label)
                            raise ValueError('Agendaitem info label "%s" is unknown' % label)

                elif 'smcrowh' in row_classes:
                    # Subheading (public / nonpublic part)
                    if fields[0].text is not None and "Nicht öffentlich" in fields[0].text.encode('utf-8'):
                        public = False
            #print json.dumps(agendaitems, indent=2)
            session.agendaitems = agendaitems.values()

        # session-related attachments
        containers = dom.xpath(self.xpath['SESSION_DETAIL_ATTACHMENTS'])
        for container in containers:
            classes = container.get('class')
            if classes is None:
                continue
            classes = classes.split(' ')
            if self.xpath['SESSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME'] not in classes:
                continue
            attachments = []
            rows = container.xpath('.//tr')
            for row in rows:
                forms = row.xpath('.//form')
                for form in forms:
                    #print "Form: ", form
                    name = " ".join(row.xpath('./td/text()')).strip()
                    for hidden_field in form.xpath('input'):
                        if hidden_field.get('name') != 'DT':
                            continue
                        attachment_id = hidden_field.get('value')
                        # make sure to add only those which aren't agendaitem-related
                        if attachment_id not in found_attachments:
                            attachment = Attachment(
                                identifier=attachment_id,
                                name=name
                            )
                            # Traversing the whole mechanize response to submit this form
                            for mform in mechanize_forms:
                                #print "Form found: '%s'" % mform
                                for control in mform.controls:
                                    if control.name == 'DT' and control.value == attachment_id:
                                        #print "Found matching form: ", control.name, control.value
                                        attachment = self.get_attachment_file(attachment, mform)
                            attachments.append(attachment)
                            found_attachments.append(attachment_id)
            if len(attachments):
                session.attachments = attachments

        oid = self.db.save_session(session)
        if self.options.verbose:
            logging.info("Session %d stored with _id %s", session_id, oid)