def parse_inquiry_response(self, response):
    """
    Callback function for parsing the inquiry responses.

    Extracts title, description and category from the response page,
    makes sure a Category row exists for the category, and looks up the
    Person that sent the response. If that lookup fails, a warning is
    logged and the callback returns without doing anything else.

    The originating inquiry is read from ``response.meta['inquiry_item']``;
    it may be missing, in which case the legislative period is None.
    """
    # allow testing single urls for parsing errors
    inquiry_item = response.meta.get('inquiry_item', None)

    # NOTE(review): several of the values below are not used further in the
    # part of this callback visible here; they are kept untouched.
    source_link = response.url
    parl_id = response.url.split('/')[-2]
    title = INQUIRY.TITLE.xt(response)
    description = INQUIRY.RESPONSEDESCRIPTION.xt(response)
    LLP = inquiry_item.legislative_period if inquiry_item else None
    category = INQUIRY.CATEGORY.xt(response)

    # Get or create Category object for the inquiry and log to screen if new
    # category is created.
    cat, created = Category.objects.get_or_create(title=category)
    if created:
        log.msg(u"Created category {}".format(
            green(u'[{}]'.format(category))), level=log.DEBUG)

    # Extract the sender id once; it is needed both for the lookup and for
    # the warning message.
    sender_parl_id = INQUIRY.RESPONSESENDER.xt(response)
    try:
        sender_object = Person.objects.get(parl_id=sender_parl_id)
    except Exception:
        # Deliberately broad: any failed lookup means the response is
        # skipped rather than aborting the crawl.
        log.warning(red(u'Sender "{}" was not found in database, skipping Inquiry {} in LLP {}'.format(
            sender_parl_id, parl_id, LLP)))
        return
def parse_signatures(self, response):
    """
    Parse the public signatures

    The petition is taken from ``response.meta['petition_item']``.
    Signatures dated exactly on the latest already-saved signature date are
    stored one by one with ``get_or_create`` (that day may be only partly
    saved), while signatures with a later date are de-duplicated and
    inserted with a single ``bulk_create``. Signatures older than the latest
    saved date are ignored.
    """
    petition = response.meta['petition_item']
    signatures = PETITION.SIGNATURES.xt(response)
    log.msg(u"Creating or updating {} signatures".format(
        green(u'{}'.format(len(signatures)))), level=log.INFO)

    # find latest saved signature date; fall back to the epoch date when
    # the lookup fails (e.g. nothing has been saved yet)
    last_signature_date = datetime.date.fromtimestamp(0)
    try:
        last_signature_date = petition.petition_signatures.latest(
            'date').date
    except Exception:
        log.warning(u'No latest signature date found')
    else:
        self.logger.debug(u'Latest signature date saved: {}'.format(
            green(u'{}'.format(last_signature_date))))

    # signatures on the latest saved date
    count_created = 0
    for signature in signatures:
        if signature['date'] != last_signature_date:
            continue
        petition_signature, created = PetitionSignature.objects.get_or_create(
            petition=petition, **signature)
        if created:
            count_created += 1

    # remove duplicates as pre-processing step for bulk_create
    # code for de-duplication for list of dicts adapted from:
    # http://stackoverflow.com/a/6281063/331559
    # The items are sorted so that equal dicts always map to the same tuple,
    # independent of dict iteration order.
    unique_new_signatures = set(
        tuple(sorted(signature.items()))
        for signature in signatures
        if signature['date'] > last_signature_date
    )
    signature_items = [
        PetitionSignature(petition=petition, **dict(items))
        for items in unique_new_signatures
    ]
    PetitionSignature.objects.bulk_create(signature_items)
    count_bulk_create = len(signature_items)

    self.logger.debug(u"Created {} and bulk created {} signatures".format(
        green(u'{}'.format(count_created)),
        green(u'{}'.format(count_bulk_create))))
def parse_parliament_steps(self, response):
    """
    Callback function to parse the additional 'Parlamentarisches Verfahren'
    page.

    For every phase extracted from the page the Phase row is fetched or
    created, then each of its steps is stored and linked to the inquiry
    found in ``response.meta['inquiry_item']``. If a step's title carries
    statements, a Statement is created or updated for each one whose person
    can be resolved to exactly one Person via its source link; all other
    statements are skipped with a warning.
    """
    inquiry_item = response.meta['inquiry_item']
    phases = INQUIRY.PHASES.xt(response)

    for phase in phases:
        # Create phase if we don't have it yet
        phase_item, created = Phase.objects.get_or_create(
            title=phase['title'])
        if created:
            log.msg(u"Created Phase {}".format(
                green(u'[{}]'.format(phase_item.title))), level=log.DEBUG)

        # Create steps
        for step in phase['steps']:
            step_item, created = Step.objects.update_or_create(
                title=step['title']['text'],
                sortkey=step['sortkey'],
                date=step['date'],
                protocol_url=step['protocol_url'],
                law=inquiry_item,
                phase=phase_item,
                source_link=response.url
            )
            # NOTE(review): update_or_create already persists the row; this
            # extra save() is kept from the original - confirm whether it
            # is still needed.
            step_item.save()
            if created:
                log.msg(u"Created Step {}".format(
                    green(u'[{}]'.format(step_item.title))), level=log.DEBUG)

            # Save statements for this step, if applicable
            if 'statements' not in step['title']:
                continue

            for stmnt in step['title']['statements']:
                # Find the person; count once and reuse the number for both
                # the decision and the warning text.
                pq = Person.objects.filter(
                    source_link__endswith=stmnt['person_source_link'])
                match_count = pq.count()

                if match_count != 1:
                    # We can't save statements if we can't find the
                    # Person (either none or more than one matched).
                    existence_note = "" if match_count else " not"
                    # Only mention the number of matches when the lookup
                    # was ambiguous.
                    if match_count > 1:
                        ambiguity_note = ", but {} persons matching found!".format(
                            match_count)
                    else:
                        ambiguity_note = ""
                    log.warning(
                        red(u"Skipping Statement by {}: Person with source_link {} does{} exist{}").format(
                            green(u'[{}]'.format(stmnt['person_name'])),
                            blue("[{}]".format(stmnt['person_source_link'])),
                            red("{}").format(existence_note),
                            ambiguity_note))
                    continue

                person_item = pq.first()
                st_data = {
                    'speech_type': stmnt['statement_type'],
                    'protocol_url': stmnt['protocol_link']
                }
                st_item, st_created = Statement.objects.update_or_create(
                    index=stmnt['index'],
                    person=person_item,
                    step=step_item,
                    defaults=st_data)

                if st_created:
                    st_logtext = u"Created Statement by {} on {}"
                else:
                    st_logtext = u"Updated Statement by {} on {}"
                log.msg(st_logtext.format(
                    green(u'[{}]'.format(person_item.full_name)),
                    step_item.date), level=log.DEBUG)
def parse(self, response):
    """
    Parse a single inquiry page and create or update the Inquiry.

    The parliament id and (for inquiries outside the Bundesrat) the
    legislative period are derived from the response URL. Unless
    ``self.IGNORE_TIMESTAMP`` is set, pages for which ``self.has_changes``
    reports no change are skipped. The inquiry is also skipped, with a
    warning, when one of its senders or its receiver cannot be looked up.

    Returns a list of follow-up ``scrapy.Request`` objects (possibly empty)
    for the inquiry's response page, or None when the inquiry was skipped.
    """
    self.SCRAPED_COUNTER += 1

    source_link = response.url
    category = INQUIRY.CATEGORY.xt(response)
    parl_id = response.url.split('/')[-2]
    title = INQUIRY.TITLE.xt(response)
    description = INQUIRY.DESCRIPTION.xt(response)
    sender_objects = []
    callback_requests = []
    ts = GENERIC.TIMESTAMP.xt(response)

    # Inquiries from Bundesrat don't have an LLP => set None
    if "BR" in category:
        LLP = None
    else:
        LLP = LegislativePeriod.objects.get(
            roman_numeral=response.url.split('/')[-4])

    if not self.IGNORE_TIMESTAMP and not self.has_changes(
            parl_id, LLP, response.url, ts):
        self.logger.debug(
            green(u"[{} of {}] Skipping Inquiry, no changes: {}".format(
                self.SCRAPED_COUNTER, self.TOTAL_COUNTER, title)))
        return

    # Get or create Category object for the inquiry and log to screen if new
    # category is created.
    cat, created = Category.objects.get_or_create(title=category)
    if created:
        log.debug(u"Created category {}".format(
            green(u'[{}]'.format(category))))

    # An inquiry can have multiple senders, but only a single recipient.
    # Try/catch in case person does not exist in the database. The catch is
    # deliberately broad so that any failed lookup skips the inquiry.
    try:
        for sender_object in INQUIRY.SENDER.xt(response):
            sender_objects.append(Person.objects.get(
                parl_id=sender_object))
    except Exception:
        log.warning(red(u'Sender "{}" was not found in database, skipping Inquiry {} in LLP {}'.format(
            INQUIRY.SENDER.xt(response), parl_id, LLP)))
        return

    receiver_parl_id = INQUIRY.RECEIVER.xt(response)
    try:
        receiver_object = Person.objects.get(parl_id=receiver_parl_id)
    except Exception:
        log.warning(red(u'Receiver "{}" was not found in database, skipping Inquiry {} in LLP {}'.format(
            receiver_parl_id, parl_id, LLP)))
        return

    # Create or update Inquiry item
    inquiry_item, inquiry_created = Inquiry.objects.update_or_create(
        parl_id=parl_id,
        legislative_period=LLP,
        defaults={
            'title': title,
            'source_link': source_link,
            'description': description,
            'receiver': receiver_object,
            'ts': ts
        }
    )

    if inquiry_created:
        inquiry_item.status = 'offen'

    # Attach foreign keys
    inquiry_item.keywords = self.parse_keywords(response)
    inquiry_item.documents = self.parse_docs(response)
    inquiry_item.category = cat
    inquiry_item.sender = sender_objects

    response.meta['inquiry_item'] = inquiry_item

    # Dringliche / Urgent inquiries have a different structure for steps
    # and history. This case distinction accommodates these different
    # structures.
    is_urgent = any(
        "Dringliche" in '{}'.format(s) for s in inquiry_item.keywords.all())
    if is_urgent:
        if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'):
            self.parse_parliament_steps(response)
    else:
        response_link = self.parse_steps(response)
        if response_link:
            # Follow the link to the inquiry's response page and hand the
            # inquiry over to the response callback.
            post_req = scrapy.Request(
                "{}{}".format(BASE_HOST, response_link),
                callback=self.parse_inquiry_response,
                dont_filter=True)
            post_req.meta['inquiry_item'] = inquiry_item
            callback_requests.append(post_req)

    # Save Inquiry item and log to terminal if created or updated.
    inquiry_item.save()

    if inquiry_created:
        logtext = u"[{} of {}] Created Inquiry {} with ID {}, LLP {} @ {}"
    else:
        logtext = u"[{} of {}] Updated Inquiry {} with ID {}, LLP {} @ {}"

    logtext = logtext.format(
        self.SCRAPED_COUNTER,
        self.TOTAL_COUNTER,
        cyan(title),
        cyan(u"{}".format(parl_id)),
        green(u"{}".format(LLP)),
        blue(response.url)
    )
    # Every 1000th scraped inquiry is logged at INFO level, all others at
    # DEBUG.
    if self.SCRAPED_COUNTER % 1000 == 0:
        log_level = log.INFO
    else:
        log_level = log.DEBUG
    log.msg(logtext, level=log_level)

    return callback_requests