コード例 #1
0
    def parse_inquiry_response(self, response):
        """
        Callback function for parsing the inquiry responses.

        Extracts the response fields from the page, makes sure the Category
        named on the page exists and looks up the Person that sent the
        response.

        :param response: the downloaded inquiry response page. Its ``meta``
            may carry the parent inquiry under the key ``'inquiry_item'``.
        :return: None. Returns early (after logging a warning) when the
            sender cannot be looked up.
        """
        # 'inquiry_item' is optional: this allows testing single urls for
        # parsing errors without a parent inquiry attached to the request.
        inquiry_item = response.meta.get('inquiry_item', None)
        # NOTE(review): source_link, title, description, cat and
        # sender_object are not used in the visible part of this method;
        # presumably the method continues beyond this excerpt - confirm
        # before removing any of them.
        source_link = response.url
        # The id is taken from the second-to-last '/'-separated url segment.
        parl_id = response.url.split('/')[-2]
        title = INQUIRY.TITLE.xt(response)
        description = INQUIRY.RESPONSEDESCRIPTION.xt(response)
        LLP = inquiry_item.legislative_period if inquiry_item else None
        category = INQUIRY.CATEGORY.xt(response)

        # Get or create Category object for the inquiry and log to screen if
        # a new category is created.
        cat, created = Category.objects.get_or_create(title=category)
        if created:
            log.msg(u"Created category {}".format(
                green(u'[{}]'.format(category))), level=log.DEBUG)

        # Extract the sender id once so the lookup and the warning below are
        # guaranteed to talk about the same value.
        sender_parl_id = INQUIRY.RESPONSESENDER.xt(response)
        try:
            sender_object = Person.objects.get(parl_id=sender_parl_id)
        except Exception:
            # Any lookup failure is treated as "sender unknown": the response
            # is skipped instead of aborting the crawl.
            log.warning(red(u'Sender "{}" was not found in database, skipping Inquiry {} in LLP {}'.format(
                sender_parl_id, parl_id, LLP)))
            return
コード例 #2
0
ファイル: petitions.py プロジェクト: vmnet04/OffenesParlament
    def parse_signatures(self, response):
        """
        Parse the public signatures and store the ones not saved yet.

        Signatures dated exactly on the latest saved signature date are
        stored one by one with ``get_or_create``. Signatures dated after it
        are de-duplicated and stored with a single ``bulk_create``.
        Signatures dated before it are ignored.

        :param response: the downloaded signatures page; ``response.meta``
            must contain the petition under the key ``'petition_item'``.
        """
        petition = response.meta['petition_item']

        signatures = PETITION.SIGNATURES.xt(response)
        log.msg(u"Creating or updating {} signatures".format(
            green(u'{}'.format(len(signatures)))),
                level=log.INFO)

        # Find the latest saved signature date. Without any saved signature
        # the date of timestamp 0 is used as the lower bound.
        last_signature_date = datetime.date.fromtimestamp(0)
        try:
            last_signature_date = petition.petition_signatures.latest(
                'date').date
        except Exception:
            # Deliberately best-effort, but no longer a bare except, so
            # SystemExit / KeyboardInterrupt are not swallowed.
            log.warning(u'No latest signature date found')
        else:
            self.logger.debug(u'Latest signature date saved: {}'.format(
                green(u'{}'.format(last_signature_date))))

        # Signatures on the latest saved date may already be stored, so they
        # are checked individually.
        count_created = 0
        signatures_ondate = [
            sig for sig in signatures if sig['date'] == last_signature_date
        ]
        for signature in signatures_ondate:
            _, created = PetitionSignature.objects.get_or_create(
                petition=petition, **signature)
            if created:
                count_created += 1

        # Signatures after the latest saved date are bulk-created. Duplicates
        # are removed first; a frozenset of the items makes the comparison
        # independent of the iteration order of the individual dicts.
        unique_afterdate = set(
            frozenset(sig.items())
            for sig in signatures
            if sig['date'] > last_signature_date
        )
        signature_items = [
            PetitionSignature(petition=petition, **dict(items))
            for items in unique_afterdate
        ]
        PetitionSignature.objects.bulk_create(signature_items)
        count_bulk_create = len(signature_items)

        self.logger.debug(u"Created {} and bulk created {} signatures".format(
            green(u'{}'.format(count_created)),
            green(u'{}'.format(count_bulk_create))))
コード例 #3
0
    def parse_parliament_steps(self, response):
        """
        Callback function to parse the additional 'Parlamentarisches Verfahren'
        page.

        For every extracted phase the Phase object is fetched or created, and
        its steps are stored as Step objects linked to the inquiry found in
        ``response.meta['inquiry_item']``. Statements attached to a step are
        created or updated when exactly one Person matches the statement's
        source link; otherwise a warning is logged and the statement is
        skipped.

        :param response: the downloaded page; ``response.meta`` must contain
            the inquiry under the key ``'inquiry_item'``.
        """
        inquiry_item = response.meta['inquiry_item']

        phases = INQUIRY.PHASES.xt(response)

        for phase in phases:
            # Create phase if we don't have it yet
            phase_item, created = Phase.objects.get_or_create(
                title=phase['title'])
            if created:
                log.msg(u"Created Phase {}".format(
                    green(u'[{}]'.format(phase_item.title))), level=log.DEBUG)

            # Create steps
            for step in phase['steps']:
                step_item, created = Step.objects.update_or_create(
                    title=step['title']['text'],
                    sortkey=step['sortkey'],
                    date=step['date'],
                    protocol_url=step['protocol_url'],
                    law=inquiry_item,
                    phase=phase_item,
                    source_link=response.url
                )
                # NOTE(review): update_or_create already persists the object;
                # this extra save() is kept in case Step.save() or its
                # signals have side effects - confirm and drop if not.
                step_item.save()
                if created:
                    log.msg(u"Created Step {}".format(
                        green(u'[{}]'.format(step_item.title))), level=log.DEBUG)

                # Save statements for this step, if applicable
                if 'statements' not in step['title']:
                    continue

                for stmnt in step['title']['statements']:
                    # Find the person; the statement can only be stored when
                    # the source link identifies exactly one Person.
                    pq = Person.objects.filter(
                        source_link__endswith=stmnt['person_source_link'])
                    match_count = pq.count()

                    if match_count != 1:
                        # Either nobody matched ("does not exist") or the
                        # link is ambiguous ("does exist, but N persons
                        # matching found!").
                        log.warning(
                            red(u"Skipping Statement by {}: Person with source_link {} does{} exist{}").format(
                                green(
                                    u'[{}]'.format(stmnt['person_name'])),
                                blue(
                                    u"[{}]".format(stmnt['person_source_link'])),
                                red("{}").format(
                                    "" if match_count else " not"),
                                ", but {} persons matching found!".format(
                                    match_count) if match_count > 1 else ""
                            ))
                        continue

                    person_item = pq.first()
                    st_data = {
                        'speech_type': stmnt['statement_type'],
                        'protocol_url': stmnt['protocol_link']
                    }
                    st_item, st_created = Statement.objects.update_or_create(
                        index=stmnt['index'],
                        person=person_item,
                        step=step_item,
                        defaults=st_data)

                    if st_created:
                        st_logtext = u"Created Statement by {} on {}"
                    else:
                        st_logtext = u"Updated Statement by {} on {}"
                    log.msg(st_logtext.format(
                        green(u'[{}]'.format(person_item.full_name)),
                        step_item.date), level=log.DEBUG)
コード例 #4
0
    def parse(self, response):
        """
        Parse an inquiry page and create or update the matching Inquiry.

        The inquiry is skipped when ``has_changes`` reports no changes
        (unless ``IGNORE_TIMESTAMP`` is set) or when one of its senders or
        its receiver cannot be looked up as a Person.

        :param response: the downloaded inquiry page.
        :return: a list with the follow-up request for the inquiry response
            page (empty when there is none), or None when the inquiry is
            skipped.
        """
        self.SCRAPED_COUNTER += 1

        source_link = response.url
        category = INQUIRY.CATEGORY.xt(response)
        # The id is taken from the second-to-last '/'-separated url segment.
        parl_id = response.url.split('/')[-2]
        title = INQUIRY.TITLE.xt(response)
        description = INQUIRY.DESCRIPTION.xt(response)
        sender_objects = []
        callback_requests = []
        ts = GENERIC.TIMESTAMP.xt(response)

        # Inquiries from Bundesrat don't have an LLP => set None
        if "BR" in category:
            LLP = None
        else:
            # Otherwise the roman numeral of the period is the fourth-to-last
            # url segment.
            LLP = LegislativePeriod.objects.get(
                roman_numeral=response.url.split('/')[-4])

        if not self.IGNORE_TIMESTAMP and not self.has_changes(
                parl_id, LLP, response.url, ts):
            self.logger.debug(
                green(u"[{} of {}] Skipping Inquiry, no changes: {}".format(
                    self.SCRAPED_COUNTER,
                    self.TOTAL_COUNTER,
                    title)))
            return

        # Get or create Category object for the inquiry and log to screen if
        # a new category is created.
        cat, created = Category.objects.get_or_create(title=category)
        if created:
            log.debug(u"Created category {}".format(
                green(u'[{}]'.format(category))))

        # An inquiry can have multiple senders, but only a single recipient.
        # The lookups are best-effort: if a person cannot be fetched from the
        # database the whole inquiry is skipped with a warning.
        sender_parl_ids = INQUIRY.SENDER.xt(response)
        try:
            sender_objects = [
                Person.objects.get(parl_id=sender_parl_id)
                for sender_parl_id in sender_parl_ids
            ]
        except Exception:
            log.warning(red(u'Sender "{}" was not found in database, skipping Inquiry {} in LLP {}'.format(
                sender_parl_ids, parl_id, LLP)))
            return

        receiver_parl_id = INQUIRY.RECEIVER.xt(response)
        try:
            receiver_object = Person.objects.get(parl_id=receiver_parl_id)
        except Exception:
            log.warning(red(u'Receiver "{}" was not found in database, skipping Inquiry {} in LLP {}'.format(
                receiver_parl_id, parl_id, LLP)))
            return

        # Create or update Inquiry item
        inquiry_item, inquiry_created = Inquiry.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=LLP,
            defaults={
                'title': title,
                'source_link': source_link,
                'description': description,
                'receiver': receiver_object,
                'ts': ts
            }
        )

        if inquiry_created:
            inquiry_item.status = 'offen'

        # Attach foreign keys
        inquiry_item.keywords = self.parse_keywords(response)
        inquiry_item.documents = self.parse_docs(response)
        inquiry_item.category = cat
        inquiry_item.sender = sender_objects

        # Make the inquiry available to the step parsers via the response.
        response.meta['inquiry_item'] = inquiry_item

        # Dringliche / Urgent inquiries have a different structure for steps
        # and history. This case distinction accommodates these different
        # structures.
        if any("Dringliche" in '{}'.format(s) for s in inquiry_item.keywords.all()):
            if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'):
                self.parse_parliament_steps(response)
        else:
            response_link = self.parse_steps(response)
            if response_link:
                # dont_filter=True: request the response page even if the
                # duplicate filter has already seen this url.
                post_req = scrapy.Request("{}{}".format(BASE_HOST, response_link),
                                          callback=self.parse_inquiry_response,
                                          dont_filter=True)
                post_req.meta['inquiry_item'] = inquiry_item

                callback_requests.append(post_req)

        # Save Inquiry item and log to terminal if created or updated.
        inquiry_item.save()

        if inquiry_created:
            logtext = u"[{} of {}] Created Inquiry {} with ID {}, LLP {} @ {}"
        else:
            logtext = u"[{} of {}] Updated Inquiry {} with ID {}, LLP {} @ {}"

        # Exactly one argument per placeholder.
        logtext = logtext.format(
            self.SCRAPED_COUNTER,
            self.TOTAL_COUNTER,
            cyan(title),
            cyan(u"{}".format(parl_id)),
            green(u"{}".format(LLP)),
            blue(response.url)
        )
        # Every 1000th inquiry is logged at INFO level as a progress marker,
        # all others at DEBUG level.
        if self.SCRAPED_COUNTER % 1000 == 0:
            log_level = log.INFO
        else:
            log_level = log.DEBUG
        log.msg(logtext, level=log_level)

        return callback_requests