def print_answer(p):
    """Log a one-line, colorized summary of an answer packet.

    Reads ``flag``, ``opcode`` and ``opcode_error`` from the ``SAPMS`` layer
    of ``p``, translates them through the ``ms_*_values`` tables when a
    translation exists, and writes the result to ``logger.debug`` together
    with the hex-encoded packet key (or ``NULL`` when the key equals
    ``null_key``).

    :param p: packet exposing a ``SAPMS`` layer and a ``key`` attribute.
    """
    # Each lookup is best-effort: an unknown value falls back to a default
    # instead of aborting the summary.  ``except Exception`` (rather than a
    # bare ``except``) keeps KeyboardInterrupt/SystemExit propagating.
    try:
        flag = ms_flag_values[p[SAPMS].flag]
    except Exception:
        flag = "0"
    try:
        opcode = str(ms_opcode_values[p[SAPMS].opcode])
    except Exception:
        opcode = str(p[SAPMS].opcode)
    try:
        opcode_err = str(ms_opcode_error_values[p[SAPMS].opcode_error])
    except Exception:
        opcode_err = 'None'

    # Green for success, bold red for anything else.  The text is colored
    # exactly once here; it is not wrapped in a second color below.
    if opcode_err == 'MSOP_OK':
        opcode_err = green(opcode_err)
    else:
        opcode_err = red(opcode_err, bold=True)

    if p.key != null_key:
        mskey_parse_print(p.key)
        key = p.key.encode('hex')
    else:
        key = "NULL"

    logger.debug("flag: %s opcode:%s opcode_error: %s key: %s",
                 cyan(flag), cyan(opcode), opcode_err, key)
def parse_inquiry_response(self, response):
    """
    Callback function for parsing the inquiry responses.

    Creates or updates the InquiryResponse identified by the parl_id taken
    from the response URL and the legislative period of the Inquiry passed
    in ``response.meta['inquiry_item']``, attaches its documents and
    category, and links it to that Inquiry.  Returns nothing; when the
    sender cannot be looked up the response is skipped.
    """
    inquiry_item = response.meta['inquiry_item']
    source_link = response.url
    parl_id = response.url.split('/')[-2]
    title = INQUIRY.TITLE.xt(response)
    description = INQUIRY.RESPONSEDESCRIPTION.xt(response)
    LLP = inquiry_item.legislative_period
    category = INQUIRY.CATEGORY.xt(response)

    # Get or create Category object for the inquiry and log to screen if new
    # category is created.
    cat, created = Category.objects.get_or_create(title=category)
    if created:
        log.msg(u"Created category {}".format(
            green(u'[{}]'.format(category))))

    # The lookup stays best-effort, but no longer swallows
    # KeyboardInterrupt/SystemExit as the bare ``except`` did.
    try:
        sender_object = Person.objects.get(
            parl_id=INQUIRY.RESPONSESENDER.xt(response))
    except Exception:
        # It is the sender of the response that is missing here.
        log.msg(
            red(u'Sender was not found in database, skipping Inquiry {} in LLP {}'
                .format(parl_id, LLP)))
        return

    # Create or update InquiryResponse item
    inquiryresponse_item, inquiryresponse_created = InquiryResponse.objects.update_or_create(
        parl_id=parl_id,
        legislative_period=LLP,
        defaults={
            'title': title,
            'source_link': source_link,
            'description': description,
            'sender': sender_object
        })

    # Attach foreign keys
    inquiryresponse_item.documents = self.parse_response_docs(response)
    inquiryresponse_item.category = cat

    # Save InquiryResponse object
    inquiryresponse_item.save()

    if inquiryresponse_created:
        logtext = u"[{} of {}] Created InquiryResponse {} with ID {}, LLP {} @ {}"
    else:
        logtext = u"[{} of {}] Updated InquiryResponse {} with ID {}, LLP {} @ {}"

    logtext = logtext.format(
        self.SCRAPED_COUNTER,
        self.TOTAL_COUNTER,
        cyan(title),
        cyan(u"{}".format(parl_id)),
        # unicode formatting instead of str(): safe for non-ASCII text
        green(u"{}".format(LLP)),
        blue(response.url))
    log.msg(logtext, level=log.INFO)

    inquiry_item.response = inquiryresponse_item
    inquiry_item.save()
def handle_answer(s, p):
    """Print a colorized summary of an answer packet and react to it.

    Reads ``flag``, ``opcode`` and ``opcode_error`` from the ``SAPMS`` layer
    of ``p``, translating them through the ``ms_*_values`` tables when a
    translation exists.  A key different from ``null_key`` is reported as
    out-of-order traffic and the packet is dumped with ``p.show()``.

    :param s: connection object; only ``s.send`` is used.
    :param p: packet exposing a ``SAPMS`` layer and a ``key`` attribute.
    """
    # Best-effort lookups: unknown values fall back to a default.
    # ``except Exception`` keeps KeyboardInterrupt/SystemExit propagating,
    # which the former bare ``except`` did not.
    try:
        flag = ms_flag_values[p[SAPMS].flag]
    except Exception:
        flag = "0"
    try:
        opcode = str(ms_opcode_values[p[SAPMS].opcode])
    except Exception:
        opcode = str(p[SAPMS].opcode)
    try:
        opcode_err = str(ms_opcode_error_values[p[SAPMS].opcode_error])
    except Exception:
        opcode_err = 'None'

    # Color the error text exactly once: green for success, bold red otherwise.
    if opcode_err == 'MSOP_OK':
        opcode_err = green(opcode_err)
    else:
        opcode_err = red(opcode_err, bold=True)

    if p.key != null_key:
        p.show()
        key = " key: " + yellow('NOT NULL', bold=True)
        print("[!] Out of order packets, reload this script.")
        #s.close()
        #exit(0)
    else:
        key = ""

    print("flag: " + cyan(flag) + " opcode:" + cyan(opcode) +
          " opcode_error: " + opcode_err + key)

    # identify request from the server?
    if key != "" and flag == 'MS_REQUEST' and opcode == '0':
        s.send(ms_adm_nilist(p, 1))
def handle_answer(s, p):
    """Log a colorized summary of an answer packet.

    Reads ``flag``, ``opcode`` and ``opcode_error`` from the ``SAPMS`` layer
    of ``p``, translating them through the ``ms_*_values`` tables when a
    translation exists, and writes the summary to ``logger.info``.  A key
    different from ``null_key`` is additionally reported through
    ``logger.error`` as out-of-order traffic.

    :param s: connection object; not used by this function.
    :param p: packet exposing a ``SAPMS`` layer and a ``key`` attribute.
    """
    # Best-effort lookups: unknown values fall back to a default.
    # ``except Exception`` keeps KeyboardInterrupt/SystemExit propagating,
    # which the former bare ``except`` did not.
    try:
        flag = ms_flag_values[p[SAPMS].flag]
    except Exception:
        flag = "0"
    try:
        opcode = str(ms_opcode_values[p[SAPMS].opcode])
    except Exception:
        opcode = str(p[SAPMS].opcode)
    try:
        opcode_err = str(ms_opcode_error_values[p[SAPMS].opcode_error])
    except Exception:
        opcode_err = 'None'

    # Color the error text exactly once: green for success, bold red otherwise.
    if opcode_err == 'MSOP_OK':
        opcode_err = green(opcode_err)
    else:
        opcode_err = red(opcode_err, bold=True)

    if p.key != null_key:
        key = " key: " + yellow('NOT NULL', bold=True)
        logger.error("[!] Out of order packets, reload this script.")
        #s.close()
        #exit(0)
    else:
        key = ""

    logger.info("flag: %s opcode:%s opcode_error: %s%s",
                cyan(flag), cyan(opcode), opcode_err, key)
def format_field_as_txt(field_name: str, field_doc: FieldDoc,
                        second_column: int, field_prefix: str = '') -> str:
    """Render one documented field as a two-column text entry.

    The first column holds the cyan-colored field name (with ``field_prefix``
    prepended and ``FIELD_SUFFIX`` appended, padded by ``INDENT`` spaces on
    both sides); the description is wrapped to 78 columns and aligned at
    ``second_column``.  When ``field_doc`` has an ``'examples'`` entry it is
    printed on its own line below the description.

    :param field_name: name of the field.
    :param field_doc: mapping with a ``'description'`` and optionally
        ``'examples'``.
    :param second_column: column at which the description starts.
    :param field_prefix: text put in front of ``field_name``.
    :return: the formatted entry, terminated by two blank lines.
    """
    plain_name = field_prefix + field_name
    # Visible width of the name cell; measured on the uncolored text so the
    # color escape codes do not count.
    name_width = INDENT + len(plain_name + FIELD_SUFFIX) + INDENT
    name_cell = (' ' * INDENT + ansicolor.cyan(plain_name) + FIELD_SUFFIX +
                 ' ' * INDENT)

    description_indent = ' ' * second_column
    # The first wrapped line is produced with a blank placeholder that the
    # name cell replaces.  The placeholder must be at least as wide as the
    # name cell, otherwise the slice below would cut into the description
    # text for names longer than ``second_column``.
    first_line_indent = ' ' * max(second_column, name_width)
    wrapped = textwrap.fill(
        field_doc['description'],
        width=78,
        initial_indent=first_line_indent,
        subsequent_indent=description_indent,
    )

    output = name_cell + wrapped[name_width:] + '\n'
    if 'examples' in field_doc:
        output += (description_indent + ansicolor.yellow('Examples:') + ' ' +
                   str(field_doc['examples']) + '\n')
    output += '\n\n'
    return output
def parse_steps(self, response):
    """
    Store the steps of a Pre-Law.

    Every step extracted from the page is saved against the law passed in
    ``response.meta['law_item']`` and the phase titled 'default'.
    """
    law = response.meta['law_item']

    # Make sure the shared default phase exists
    default_phase, phase_is_new = Phase.objects.get_or_create(title='default')
    if phase_is_new:
        phase_label = green(u'[{}]'.format(default_phase.title))
        log.msg(u"Created Phase {}".format(phase_label))

    extracted = PRELAW.STEPS.xt(response)
    if extracted:
        count_label = cyan(u'[{}]'.format(len(extracted)))
        log.msg(u"Creating {} steps".format(count_label))

    # One Step row per extracted entry
    for entry in extracted:
        lookup = dict(
            title=entry['title'],
            sortkey=entry['sortkey'],
            date=entry['date'],
            protocol_url=entry['protocol_url'],
            law=law,
            phase=default_phase,
            source_link=response.url)
        step_row, _ = Step.objects.update_or_create(**lookup)
        step_row.save()
def parse_steps(self, response):
    """
    Persist the steps listed on a Pre-Law page.

    The law comes from ``response.meta['law_item']``; all steps are filed
    under the phase titled 'default', which is created on first use.
    """
    parent_law = response.meta['law_item']
    page_url = response.url

    # Create phase if we don't have it yet
    phase, phase_created = Phase.objects.get_or_create(title='default')
    if phase_created:
        log.msg(u"Created Phase {}".format(
            green(u'[{}]'.format(phase.title))))

    step_dicts = PRELAW.STEPS.xt(response)
    if step_dicts:
        log.msg(u"Creating {} steps".format(
            cyan(u'[{}]'.format(len(step_dicts)))))

    # Create steps
    for data in step_dicts:
        stored_step, _created = Step.objects.update_or_create(
            title=data['title'],
            sortkey=data['sortkey'],
            date=data['date'],
            protocol_url=data['protocol_url'],
            law=parent_law,
            phase=phase,
            source_link=page_url,
        )
        stored_step.save()
def parse(self, response):
    """
    Scrape one law page.

    Stores the Law (keyed by parl_id, legislative period and source link)
    with its category, keywords and documents, logs the outcome, and then
    parses whichever procedure tabs are present on the page.
    """
    page_url = response.url

    # Plain fields
    law_title = LAW.TITLE.xt(response)
    law_parl_id = LAW.PARL_ID.xt(response)
    law_status = LAW.STATUS.xt(response)
    # The period's roman numeral is the fourth URL segment from the end
    LLP = LegislativePeriod.objects.get(
        roman_numeral=page_url.split('/')[-4])

    # Fields that end up as foreign keys / defaults
    category_title = LAW.CATEGORY.xt(response)
    law_description = LAW.DESCRIPTION.xt(response)

    # Category is created on first sight
    category_item, category_is_new = Category.objects.get_or_create(
        title=category_title)
    if category_is_new:
        log.msg(u"Created category {}".format(
            green(u'[{}]'.format(category_title))))

    # Create or refresh the Law itself
    law_item, law_created = Law.objects.update_or_create(
        parl_id=law_parl_id,
        legislative_period=LLP,
        source_link=page_url,
        defaults={
            'title': law_title,
            'status': law_status,
            'description': law_description,
        })

    # Attach related objects, then persist
    law_item.keywords = self.parse_keywords(response)
    law_item.category = category_item
    law_item.documents = self.parse_docs(response)
    law_item.save()

    # Report what happened
    template = (u"Created {} with id {}, LLP {} @ {}" if law_created
                else u"Updated {} with id {}, LLP {} @ {}")
    log.msg(
        template.format(
            red(law_title),
            cyan(u"[{}]".format(law_parl_id)),
            green(str(LLP)),
            blue(page_url)),
        level=log.INFO)

    # The step parsers pick the law up from the response meta
    response.meta['law_item'] = law_item

    # Only parse the tabs that are present on this page
    if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'):
        self.parse_parliament_steps(response)
    if response.xpath('//h2[@id="tab-VorparlamentarischesVerfahren"]'):
        self.parse_pre_parliament_steps(response)
def inform(msg, minor=False, major=False):
    """Write ``msg`` through ``ansicolor.write_out`` with a level marker.

    ``major`` wins over ``minor``: major messages are yellow and prefixed
    with ``>>>``, minor ones cyan with ``->``, everything else green with
    ``>``.  A newline is appended in every case.
    """
    if major:
        paint, template = ansicolor.yellow, '>>> %s\n'
    elif minor:
        paint, template = ansicolor.cyan, '-> %s\n'
    else:
        paint, template = ansicolor.green, '> %s\n'
    ansicolor.write_out(paint(template % msg))
def write_progress(self, rate=None, prestart=None, wait=None, complete=False, error=None):
    """Draw one status line for this download on the error stream.

    The line consists of the right-aligned action, a colored status column,
    the URL (partly shown in reverse video as a progress bar while the total
    size is known) and the size.  The first truthy one of ``error``,
    ``prestart``, ``wait`` and ``complete`` selects the status text;
    otherwise ``rate`` is shown as a transfer rate.  Errors and completion
    are also passed to ``self.log_url``.
    """
    action_col = self.action.rjust(self.actionwidth)

    # status text for the rate column
    if error:
        status = error
    elif prestart:
        status = "starting"
    elif wait:
        status = ("%s" % self.retry_wait) + "s..."
    elif complete:
        status = "done"
    else:
        status = "%s/s" % self.format_size(rate)
    status = status.ljust(self.ratewidth)

    url_col = self.url_fmt

    # size column: total size if known, else what has been downloaded so far
    known_size = self.totalsize or self.download_size
    if known_size:
        size_text = self.format_size(known_size)
    else:
        size_text = "????? B"
    size_col = (" %s" % size_text).ljust(self.sizewidth)

    # color of the status column
    if error:
        paint = ansicolor.red
    elif prestart or wait:
        paint = ansicolor.cyan
    elif complete:
        paint = ansicolor.green
    else:
        paint = ansicolor.yellow
    status = paint(status)

    # progress bar: reverse the part of the URL matching the fraction done
    in_transfer = not error and not prestart and not complete
    if in_transfer and self.totalsize:
        filled = int(self.urlwidth * self.download_size / self.totalsize)
        url_col = ansicolor.wrap_string(self.url_fmt, filled, None, reverse=True)

    # an unknown total size is highlighted
    if not self.totalsize:
        size_col = ansicolor.yellow(size_col)

    prefix = "%s :: %s " % (action_col, status)

    # final states (and DEBUG_FETCH mode) end the line; otherwise the
    # carriage return lets the next call overwrite it
    if error or complete or os.environ.get("DEBUG_FETCH"):
        terminator = "\n"
    else:
        terminator = "\r"
    ioutils.write_err("%s%s%s%s" % (prefix, url_col, size_col, terminator))

    # log download
    if error:
        self.log_url(error, error=True)
    elif complete:
        self.log_url("done")
def zoom_print_facets(result):
    """Print every facet of ``result`` with its entries.

    ``result`` maps a facet name to a list of entries, each having a
    ``'count'`` and a ``'name'``.  The facet name is printed in bold green,
    followed by one ``count name`` line per entry and an empty line.
    Nothing is printed for an empty ``result``.
    """
    facet_names = result.keys()
    if not facet_names:
        return

    for facet in facet_names:
        print("- %s" % green(facet, bold=True))
        for entry in result[facet]:
            count_text = str(entry['count'])
            label = entry['name']
            # integer names are turned into text before being colored
            if isinstance(label, int):
                label = str(label)
            print(count_text.ljust(9) + cyan(label).ljust(20))
        print("")
def print_debug(self):
    """
    Collects and prints a structured debug message

    The message shows the spider's title (in red), the legislative periods
    being scraped (``self.LLP``, or "Not applicable" when that is falsy)
    and ``self.BASE_URL``, framed by two cyan bars of '#' characters.
    """
    # NOTE(review): the line breaks inside this template were reconstructed;
    # confirm the exact whitespace against the intended output layout.
    message = """
    {bar}
    {title}
    Scraping LLPs: {llps}
    Base URL: {url}
    {bar}
    """.format(bar=cyan(
        '############################################################'),
        title=red(self.title),
        llps=self.LLP or "Not applicable",
        url=self.BASE_URL)
    print message
def print_debug(self):
    """
    Collects and prints a structured debug message

    The message shows the spider's title (in red), the legislative periods
    being scraped (``self.LLP``, or "Not applicable" when that is falsy)
    and ``self.BASE_URL``, framed by two cyan bars of '#' characters.
    """
    # NOTE(review): the line breaks inside this template were reconstructed;
    # confirm the exact whitespace against the intended output layout.
    message = """
    {bar}
    {title}
    Scraping LLPs: {llps}
    Base URL: {url}
    {bar}
    """.format(
        bar=cyan(
            '############################################################'),
        title=red(self.title),
        llps=self.LLP or "Not applicable",
        url=self.BASE_URL
    )
    print message
def print_debug(self):
    """
    Collects and prints a structured debug message

    The message shows the spider's title (in red), the legislative periods
    being scraped (``self.LLP``, or "Not applicable" when that is falsy),
    the value of ``self.IGNORE_TIMESTAMP`` and ``self.BASE_URL``, framed by
    two cyan bars of '#' characters.
    """
    # NOTE(review): the line breaks inside this template were reconstructed;
    # confirm the exact whitespace against the intended output layout.
    message = """
    {bar}
    {title}
    Scraping LLPs: {llps}
    Ignoring Timestamps: {IGNORE_TIMESTAMP}
    Base URL: {url}
    {bar}
    """.format(
        bar=cyan(
            '############################################################'
        ),
        title=red(self.title),
        llps=self.LLP or "Not applicable",
        url=self.BASE_URL,
        IGNORE_TIMESTAMP=self.IGNORE_TIMESTAMP,
    )
    print message
def parse(self, response):
    """
    Parse a person list page.

    Creates or updates a Person for every entry of the list, attaches the
    mandates found for the legislative period (``GP`` URL option) and
    function (``NRBR`` URL option) of the list, and refreshes the person's
    latest mandate when a mandate was added.

    Returns a list of scrapy Requests for the detail pages of all persons
    that were not yet recorded in ``self.persons_scraped``.
    """
    persons = PERSON.LIST.xt(response)
    callback_requests = []

    # which llp are we in?
    urloptions = response.url.split('?')[1]
    option_pairs = [opt.split('=') for opt in urloptions.split('&')]
    llp_roman = [pair[1] for pair in option_pairs if pair[0] == 'GP']
    llp_item = LegislativePeriod.objects.get(roman_numeral=llp_roman[0])

    # function string
    function = [pair[1] for pair in option_pairs if pair[0] == 'NRBR']
    function_str = self.RSS_TO_FUNCTION[function[0]]
    function_item, f_created = Function.objects.get_or_create(
        title=function_str)

    self.logger.info(u"Scraping {} persons for LLP {}".format(
        len(persons), llp_roman))

    # Iterate all persons
    for p in persons:
        # Extract basic data
        parl_id = p['source_link'].split('/')[-2]
        p['source_link'] = "{}{}".format(BASE_HOST, p['source_link'])
        changed = False

        # Create or update simple person's item
        try:
            person_data = {
                'reversed_name': p['reversed_name'],
                'source_link': p['source_link']
            }
            person_item, created_person = Person.objects.update_or_create(
                parl_id=parl_id, defaults=person_data)
        except Exception as e:
            # unicode template: a byte-string format would fail on
            # non-ASCII names
            self.logger.warning(u"Error saving Person {}: {}".format(
                green(u'[{}]'.format(p['reversed_name'])), e))
            continue

        if created_person:
            self.logger.info(u"Created Person {}".format(
                green(u'[{}]'.format(p['reversed_name']))))
        else:
            self.logger.info(u"Updated Person {}".format(
                green(u"[{}]".format(p['reversed_name']))))

        for mandate in p['mandates']:
            party_item = self.get_party_item(mandate)
            state_item = self.get_state_item(p['electoral_state'])
            # Create and append mandate
            try:
                mandate_item, m_created = Mandate.objects.update_or_create(
                    function=function_item,
                    legislative_period=llp_item,
                    party=party_item,
                    state=state_item)
            except Exception:
                # Skip this mandate: without it there is nothing to attach.
                # (Previously a debugger breakpoint was opened here and the
                # code went on to use an unbound ``mandate_item``.)
                self.logger.warning(
                    red(u"Error saving Mandate {} ({})".format(
                        function_item, party_item)))
                continue

            if mandate_item not in person_item.mandates.all():
                changed = True
                person_item.mandates.add(mandate_item)

        if changed:
            # In case we added/modified a mandate now,
            latest_mandate_item = person_item.get_latest_mandate()
            person_item.latest_mandate = latest_mandate_item
            self.logger.info(
                cyan(u"Latest mandate for {} is now {}".format(
                    person_item, latest_mandate_item)))
            person_item.save()

        # First time we encounter a person, we scan her detail page too
        if parl_id not in self.persons_scraped:
            # Create Detail Page request
            req = scrapy.Request(p['source_link'],
                                 callback=self.parse_person_detail)
            req.meta['person'] = {
                'reversed_name': p['reversed_name'],
                'source_link': p['source_link'],
                'parl_id': parl_id
            }
            callback_requests.append(req)
            self.persons_scraped.append(parl_id)

    return callback_requests
def parse(self, response):
    """
    Parse a single inquiry page.

    Skips the page when ``self.has_changes`` reports no change for its
    timestamp.  Otherwise creates or updates the Inquiry with its category,
    senders, receiver, keywords and documents, and parses its steps.

    Returns a list of scrapy Requests for the inquiry's response page (empty
    when there is none), or None when the inquiry was skipped because it is
    unchanged or a sender/receiver could not be looked up.
    """
    source_link = response.url
    category = INQUIRY.CATEGORY.xt(response)
    parl_id = response.url.split('/')[-2]
    title = INQUIRY.TITLE.xt(response)
    description = INQUIRY.DESCRIPTION.xt(response)
    callback_requests = []
    ts = GENERIC.TIMESTAMP.xt(response)

    # Inquiries from Bundesrat don't have an LLP => set None
    if "BR" in category:
        LLP = None
    else:
        LLP = LegislativePeriod.objects.get(
            roman_numeral=response.url.split('/')[-4])

    if not self.has_changes(parl_id, LLP, response.url, ts):
        self.logger.info(
            green(u"Skipping Inquiry, no changes: {}".format(title)))
        return

    # Get or create Category object for the inquiry and log to screen if new
    # category is created.
    cat, created = Category.objects.get_or_create(title=category)
    if created:
        log.msg(u"Created category {}".format(
            green(u'[{}]'.format(category))))

    # An inquiry can have multiple senders, but only a single recipient.
    # The lookups stay best-effort, but ``except Exception`` no longer
    # swallows KeyboardInterrupt/SystemExit as the bare ``except`` did.
    try:
        sender_objects = [
            Person.objects.get(parl_id=sender_id)
            for sender_id in INQUIRY.SENDER.xt(response)
        ]
    except Exception:
        log.msg(red(u'Sender was not found in database, skipping Inquiry {} in LLP {}'.format(
            parl_id, LLP)))
        return
    try:
        receiver_object = Person.objects.get(
            parl_id=INQUIRY.RECEIVER.xt(response))
    except Exception:
        log.msg(red(u'Receiver was not found in database, skipping Inquiry {} in LLP {}'.format(
            parl_id, LLP)))
        return

    # Create or update Inquiry item
    inquiry_item, inquiry_created = Inquiry.objects.update_or_create(
        parl_id=parl_id,
        legislative_period=LLP,
        defaults={
            'title': title,
            'source_link': source_link,
            'description': description,
            'receiver': receiver_object,
            'ts': ts
        }
    )

    # Attach foreign keys
    inquiry_item.keywords = self.parse_keywords(response)
    inquiry_item.documents = self.parse_docs(response)
    inquiry_item.category = cat
    inquiry_item.sender = sender_objects

    # The step parsers pick the inquiry up from the response meta
    response.meta['inquiry_item'] = inquiry_item

    # Dringliche / Urgent inquiries have a different structure for steps
    # and history. This case distinction accommodates these different
    # structures.  The keyword is formatted as unicode so non-ASCII
    # keyword titles cannot break the check.
    if any("Dringliche" in u'{}'.format(s) for s in inquiry_item.keywords.all()):
        if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'):
            self.parse_parliament_steps(response)
    else:
        response_link = self.parse_steps(response)
        if response_link:
            post_req = scrapy.Request(
                "{}{}".format(BASE_HOST, response_link),
                callback=self.parse_inquiry_response,
                dont_filter=True)
            post_req.meta['inquiry_item'] = inquiry_item
            callback_requests.append(post_req)

    # Save Inquiry item and log to terminal if created or updated.
    inquiry_item.save()

    if inquiry_created:
        logtext = u"Created Inquiry {} with ID {}, LLP {} @ {}"
    else:
        logtext = u"Updated Inquiry {} with ID {}, LLP {} @ {}"

    logtext = logtext.format(
        cyan(title),
        cyan(u"{}".format(parl_id)),
        # unicode formatting instead of str(): safe for non-ASCII text
        green(u"{}".format(LLP)),
        blue(response.url)
    )
    log.msg(logtext, level=log.INFO)
    log.msg(green("Open Callback requests: {}".format(
        len(callback_requests))), level=log.INFO)

    return callback_requests
def parse_inquiry_response(self, response):
    """
    Callback function for parsing the inquiry responses.

    Creates or updates the InquiryResponse identified by the parl_id taken
    from the response URL and the legislative period of the Inquiry passed
    in ``response.meta['inquiry_item']``, attaches its documents and
    category, and links it to that Inquiry.  Returns nothing; when the
    sender cannot be looked up the response is skipped.
    """
    inquiry_item = response.meta['inquiry_item']
    source_link = response.url
    parl_id = response.url.split('/')[-2]
    title = INQUIRY.TITLE.xt(response)
    description = INQUIRY.RESPONSEDESCRIPTION.xt(response)
    LLP = inquiry_item.legislative_period
    category = INQUIRY.CATEGORY.xt(response)

    # Get or create Category object for the inquiry and log to screen if new
    # category is created.
    cat, created = Category.objects.get_or_create(title=category)
    if created:
        log.msg(u"Created category {}".format(
            green(u'[{}]'.format(category))))

    # The lookup stays best-effort, but no longer swallows
    # KeyboardInterrupt/SystemExit as the bare ``except`` did.
    try:
        sender_object = Person.objects.get(
            parl_id=INQUIRY.RESPONSESENDER.xt(response))
    except Exception:
        # It is the sender of the response that is missing here.
        log.msg(red(u'Sender was not found in database, skipping Inquiry {} in LLP {}'.format(
            parl_id, LLP)))
        return

    # Create or update InquiryResponse item
    inquiryresponse_item, inquiryresponse_created = InquiryResponse.objects.update_or_create(
        parl_id=parl_id,
        legislative_period=LLP,
        defaults={
            'title': title,
            'source_link': source_link,
            'description': description,
            'sender': sender_object
        }
    )

    # Attach foreign keys
    inquiryresponse_item.documents = self.parse_docs(response)
    inquiryresponse_item.category = cat

    # Save InquiryResponse object
    inquiryresponse_item.save()

    if inquiryresponse_created:
        logtext = u"Created InquiryResponse {} with ID {}, LLP {} @ {}"
    else:
        logtext = u"Updated InquiryResponse {} with ID {}, LLP {} @ {}"

    logtext = logtext.format(
        cyan(title),
        cyan(u"{}".format(parl_id)),
        # unicode formatting instead of str(): safe for non-ASCII text
        green(u"{}".format(LLP)),
        blue(response.url)
    )
    log.msg(logtext, level=log.INFO)

    inquiry_item.response = inquiryresponse_item
    inquiry_item.save()
def parse(self, response):
    """
    Parse a person list page.

    Creates or updates a Person for every entry of the list, attaches the
    mandates found for the legislative period (``GP`` URL option) and
    function (``NRBR`` URL option) of the list, and refreshes the person's
    latest mandate when the entry lists any mandates.

    Returns a list of scrapy Requests for the detail pages of all persons
    that were not yet recorded in ``self.persons_scraped``.
    """
    persons = PERSON.LIST.xt(response)
    callback_requests = []

    # which llp are we in?
    urloptions = response.url.split('?')[1]
    option_pairs = [opt.split('=') for opt in urloptions.split('&')]
    llp_roman = [pair[1] for pair in option_pairs if pair[0] == 'GP']
    llp_item = LegislativePeriod.objects.get(roman_numeral=llp_roman[0])

    # function string
    function = [pair[1] for pair in option_pairs if pair[0] == 'NRBR']
    function_str = self.RSS_TO_FUNCTION[function[0]]
    function_item, f_created = Function.objects.get_or_create(
        title=function_str)

    self.logger.info(
        "Scraping {} persons for LLP {}".format(len(persons), llp_roman))

    # Iterate all persons
    for p in persons:
        # Extract basic data
        parl_id = p['source_link'].split('/')[-2]
        p['source_link'] = "{}{}".format(BASE_HOST, p['source_link'])

        # Create or update simple person's item
        person_data = {
            'reversed_name': p['reversed_name']
        }
        person_item, created_person = Person.objects.update_or_create(
            source_link=p['source_link'],
            parl_id=parl_id,
            defaults=person_data
        )
        if created_person:
            self.logger.info(u"Created Person {}".format(
                green(u'[{}]'.format(p['reversed_name']))))
        else:
            self.logger.info(u"Updated Person {}".format(
                green(u"[{}]".format(p['reversed_name']))))

        for mandate in p['mandates']:
            party_item = self.get_party_item(mandate)
            state_item = self.get_state_item(p['electoral_state'])
            # Create and append mandate
            try:
                mandate_item, m_created = Mandate.objects.update_or_create(
                    function=function_item,
                    legislative_period=llp_item,
                    party=party_item,
                    state=state_item)
            except Exception:
                # Skip this mandate: without it there is nothing to attach.
                # (Previously a debugger breakpoint was opened here and the
                # code went on to use an unbound ``mandate_item``.)
                # unicode template: safe for non-ASCII model text
                self.logger.warning(
                    red(u"Error saving Mandate {} ({})".format(
                        function_item, party_item)))
                continue
            person_item.mandates.add(mandate_item)

        # Do a save to update the db models
        person_item.save()

        # In case we added/modified a mandate now,
        if p['mandates']:
            latest_mandate_item = person_item.get_latest_mandate()
            person_item.latest_mandate = latest_mandate_item
            self.logger.info(
                cyan(u"Latest mandate for {} is now {}".format(
                    person_item, latest_mandate_item)))
            person_item.save()

        # First time we encounter a person, we scan her detail page too
        if parl_id not in self.persons_scraped:
            # Create Detail Page request
            req = scrapy.Request(p['source_link'],
                                 callback=self.parse_person_detail)
            req.meta['person'] = {
                'reversed_name': p['reversed_name'],
                'source_link': p['source_link'],
                'parl_id': parl_id
            }
            callback_requests.append(req)
            self.persons_scraped.append(parl_id)

    return callback_requests
class InquiriesSpider(BaseSpider):
    """
    Spider that scrapes inquiries and their responses.

    The inquiry URLs are taken from the RSS view of ``BASE_URL`` for every
    legislative period in ``self.LLP`` (for both 'NR' and 'BR'), or from a
    single URL passed as the ``url`` keyword argument.
    """
    BASE_URL = "{}/{}".format(BASE_HOST, "PAKT/JMAB/filter.psp")

    # Query options of the RSS list; 'GP' and 'NRBR' are set per request
    URLOPTIONS = {
        'view': 'RSS',
        'jsMode': 'RSS',
        'xdocumentUri': '/PAKT/JMAB/index.shtml',
        'NRBR': 'NR',
        'anwenden': 'Anwenden',
        'JMAB': 'J_JPR_M',
        'VHG2': 'ALLE',
        'SUCH': '',
        'listeId': '105',
        'FBEZ': 'FP_005'
    }

    name = "inquiries"
    inquiries_scraped = []

    def __init__(self, **kw):
        """
        Accepts the optional keyword arguments ``llp`` (number of a single
        legislative period to scrape) and ``url`` (a single inquiry URL to
        scrape instead of the RSS lists).
        """
        super(InquiriesSpider, self).__init__(**kw)

        if 'llp' in kw:
            try:
                self.LLP = [int(kw['llp'])]
            except (TypeError, ValueError):
                # Not a number: leave self.LLP untouched
                pass

        self.cookies_seen = set()
        self.idlist = {}
        self.url_override = kw.get('url', None)

    def start_requests(self):
        """
        Yields one request per inquiry URL to scrape
        """
        # This predefined list of URLs is chosen to include all types of
        # inquiries possible in the Austrian parliament in order to provide a
        # suitable testing surface for new functions.
        # urls = ["https://www.parlament.gv.at/PAKT/VHG/XXV/JPR/JPR_00019/index.shtml",
        #         "https://www.parlament.gv.at/PAKT/VHG/XXV/JPR/JPR_00016/index.shtml",
        #         "https://www.parlament.gv.at/PAKT/VHG/XXV/J/J_06954/index.shtml",
        #         "https://www.parlament.gv.at/PAKT/VHG/XXV/M/M_00178/index.shtml",
        #         "https://www.parlament.gv.at/PAKT/VHG/XXV/JEU/JEU_00003/index.shtml",
        #         "https://www.parlament.gv.at/PAKT/VHG/XXV/J/J_06758/index.shtml",
        #         "https://www.parlament.gv.at/PAKT/VHG/BR/J-BR/J-BR_03089/index.shtml",
        #         # "https://www.parlament.gv.at/PAKT/VHG/BR/J-BR/J-BR_03091/index.shtml",
        #         "http://www.parlament.gv.at/PAKT/VHG/BR/J-BR/J-BR_01155/index.shtml",
        #         "http://www.parlament.gv.at/PAKT/VHG/XX/J/J_06110/index.shtml",
        #         "http://www.parlament.gv.at/PAKT/VHG/XX/J/J_06651/index.shtml",
        #         "http://www.parlament.gv.at/PAKT/VHG/XX/J/J_04024/index.shtml",
        #         "http://www.parlament.gv.at/PAKT/VHG/XX/J/J_04025/index.shtml",
        #         "https://www.parlament.gv.at/PAKT/VHG/XX/M/M_00178/index.shtml"]
        urls = [] if not self.url_override else [self.url_override]

        if self.LLP and not self.url_override:
            for i in self.LLP:
                roman_numeral = roman.toRoman(i)
                for nrbr in ['NR', 'BR']:
                    options = self.URLOPTIONS.copy()
                    options['GP'] = roman_numeral
                    options['NRBR'] = nrbr
                    url_options = urlencode(options)
                    url_llp = "{}?{}".format(self.BASE_URL, url_options)
                    rss = feedparser.parse(url_llp)

                    self.logger.info("GP {}: {} inquiries from {}".format(
                        roman_numeral, len(rss['entries']), nrbr)
                    )
                    urls.extend(entry['link'] for entry in rss['entries'])

        self.TOTAL_COUNTER = len(urls)
        for url in urls:
            yield self.make_requests_from_url(url)

    def parse(self, response):
        """
        Parse a single inquiry page.

        Unless ``self.IGNORE_TIMESTAMP`` is set, the page is skipped when
        ``has_changes`` reports no change.  Otherwise the Inquiry is created
        or updated with its category, senders, receiver, keywords and
        documents, and its steps are parsed.

        Returns a list of scrapy Requests for the inquiry's response page
        (empty when there is none), or None when the inquiry was skipped.
        """
        self.SCRAPED_COUNTER += 1

        source_link = response.url
        category = INQUIRY.CATEGORY.xt(response)
        parl_id = response.url.split('/')[-2]
        title = INQUIRY.TITLE.xt(response)
        description = INQUIRY.DESCRIPTION.xt(response)
        callback_requests = []
        ts = GENERIC.TIMESTAMP.xt(response)

        # Inquiries from Bundesrat don't have an LLP => set None
        if "BR" in category:
            LLP = None
        else:
            LLP = LegislativePeriod.objects.get(
                roman_numeral=response.url.split('/')[-4])

        if not self.IGNORE_TIMESTAMP and not self.has_changes(parl_id, LLP, response.url, ts):
            self.logger.debug(
                green(u"[{} of {}] Skipping Inquiry, no changes: {}".format(
                    self.SCRAPED_COUNTER, self.TOTAL_COUNTER, title)))
            return

        # Get or create Category object for the inquiry and log to screen if new
        # category is created.
        cat, created = Category.objects.get_or_create(title=category)
        if created:
            self.logger.debug(u"Created category {}".format(
                green(u'[{}]'.format(category))))

        # An inquiry can have multiple senders, but only a single recipient.
        # The ids are extracted before the try so that only the database
        # lookups are treated as "person not found".
        sender_ids = INQUIRY.SENDER.xt(response)
        try:
            sender_objects = [
                Person.objects.get(parl_id=sender_id)
                for sender_id in sender_ids
            ]
        except Exception:
            self.logger.warning(red(u'Sender "{}" was not found in database, skipping Inquiry {} in LLP {}'.format(
                sender_ids, parl_id, LLP)))
            return

        receiver_id = INQUIRY.RECEIVER.xt(response)
        try:
            receiver_object = Person.objects.get(parl_id=receiver_id)
        except Exception:
            self.logger.warning(red(u'Receiver "{}" was not found in database, skipping Inquiry {} in LLP {}'.format(
                receiver_id, parl_id, LLP)))
            return

        # Create or update Inquiry item
        inquiry_item, inquiry_created = Inquiry.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=LLP,
            defaults={
                'title': title,
                'source_link': source_link,
                'description': description,
                'receiver': receiver_object,
                'ts': ts
            }
        )
        if inquiry_created:
            inquiry_item.status = 'offen'

        # Attach foreign keys
        inquiry_item.keywords = self.parse_keywords(response)
        inquiry_item.documents = self.parse_docs(response)
        inquiry_item.category = cat
        inquiry_item.sender = sender_objects

        # The step parsers pick the inquiry up from the response meta
        response.meta['inquiry_item'] = inquiry_item

        # Dringliche / Urgent inquiries have a different structure for steps
        # and history. This case distinction accommodates these different
        # structures.  The keyword is formatted as unicode so non-ASCII
        # keyword titles cannot break the check.
        if any("Dringliche" in u'{}'.format(s) for s in inquiry_item.keywords.all()):
            if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'):
                self.parse_parliament_steps(response)
        else:
            response_link = self.parse_steps(response)
            if response_link:
                post_req = scrapy.Request(
                    "{}{}".format(BASE_HOST, response_link),
                    callback=self.parse_inquiry_response,
                    dont_filter=True)
                post_req.meta['inquiry_item'] = inquiry_item
                callback_requests.append(post_req)

        # Save Inquiry item and log to terminal if created or updated.
        inquiry_item.save()

        if inquiry_created:
            logtext = u"[{} of {}] Created Inquiry {} with ID {}, LLP {} @ {}"
        else:
            logtext = u"[{} of {}] Updated Inquiry {} with ID {}, LLP {} @ {}"

        logtext = logtext.format(
            self.SCRAPED_COUNTER,
            self.TOTAL_COUNTER,
            cyan(title),
            cyan(u"{}".format(parl_id)),
            green(u"{}".format(LLP)),
            blue(response.url)
        )
        # Every 1000th inquiry is logged at INFO level, all others at DEBUG
        log.msg(logtext, level=log.DEBUG if not self.SCRAPED_COUNTER % 1000 == 0 else log.INFO)

        return callback_requests

    def has_changes(self, parl_id, legislative_period, source_link, ts):
        """
        Return True when no Inquiry with the given parl_id, legislative
        period and source link is stored yet, or when the stored timestamp
        differs from ``ts`` (interpreted as UTC); False otherwise.
        """
        stored = Inquiry.objects.filter(
            parl_id=parl_id,
            legislative_period=legislative_period,
            source_link=source_link
        )
        if not stored.exists():
            return True

        ts = ts.replace(tzinfo=pytz.utc)
        return stored.get().ts != ts

    def parse_keywords(self, response):
        """
        Return the Keyword objects for the keywords of the page, creating
        those that are not in the database yet.
        """
        keywords = INQUIRY.KEYWORDS.xt(response)

        # Create all keywords we don't yet have in the DB
        keyword_items = []
        for keyword in keywords:
            kw, created = Keyword.objects.get_or_create(title=keyword)
            if created:
                log.msg(u"Created keyword {}".format(
                    green(u'[{}]'.format(keyword))), level=log.DEBUG)

            keyword_items.append(kw)

        return keyword_items

    def _get_or_create_documents(self, docs):
        """
        Return a Document object for every extracted document dict (keys
        'title', 'html_url', 'pdf_url'), creating the missing ones.
        """
        doc_items = []
        for document in docs:
            doc, created = Document.objects.get_or_create(
                title=document['title'],
                html_link=document['html_url'],
                pdf_link=document['pdf_url'],
                stripped_html=None
            )
            doc_items.append(doc)
        return doc_items

    def parse_docs(self, response):
        """
        Return the Document objects for the documents of an inquiry page.
        """
        return self._get_or_create_documents(INQUIRY.DOCS.xt(response))

    def parse_response_docs(self, response):
        """
        Return the Document objects for the documents of a response page.
        """
        return self._get_or_create_documents(INQUIRY.RESPONSEDOCS.xt(response))

    def parse_steps(self, response):
        """
        Callback function to parse the single-page history for normal inquiries

        Stores every step of the inquiry from ``response.meta``.  Returns the
        link to the written response when one of the steps announces it,
        otherwise None.
        """
        inquiry_item = response.meta['inquiry_item']

        # Get or create a default-phase for inquiries, because there are no
        # phases in simple inquiries.
        phase_item, created = Phase.objects.get_or_create(
            title='default_inqu')
        if created:
            log.msg(u"Created Phase {}".format(
                green(u'[{}]'.format(phase_item.title))), level=log.DEBUG)

        steps = INQUIRY.STEPS.xt(response)

        # The response link is only extracted when a step mentions a
        # written response
        response_link = []
        if any("Schriftliche Beantwortung" in step["title"] for step in steps):
            response_link = INQUIRY.RESPONSE_LINK.xt(response)

        for step in steps:
            step_item, created = Step.objects.update_or_create(
                title=step['title'],
                sortkey=step['sortkey'],
                date=step['date'],
                protocol_url=step['protocol_url'],
                law=inquiry_item,
                phase=phase_item,
                source_link=response.url
            )
            step_item.save()

        return response_link or None

    def parse_parliament_steps(self, response):
        """
        Callback function to parse the additional 'Parlamentarisches Verfahren'
        page.

        Stores all phases with their steps, and the statements of a step
        whenever the speaking person can be identified unambiguously.
        """
        inquiry_item = response.meta['inquiry_item']

        phases = INQUIRY.PHASES.xt(response)

        for phase in phases:
            # Create phase if we don't have it yet
            phase_item, created = Phase.objects.get_or_create(
                title=phase['title'])
            if created:
                log.msg(u"Created Phase {}".format(
                    green(u'[{}]'.format(phase_item.title))), level=log.DEBUG)

            # Create steps
            for step in phase['steps']:
                step_item, created = Step.objects.update_or_create(
                    title=step['title']['text'],
                    sortkey=step['sortkey'],
                    date=step['date'],
                    protocol_url=step['protocol_url'],
                    law=inquiry_item,
                    phase=phase_item,
                    source_link=response.url
                )
                step_item.save()
                if created:
                    log.msg(u"Created Step {}".format(
                        green(u'[{}]'.format(step_item.title))), level=log.DEBUG)

                # Save statements for this step, if applicable
                if 'statements' not in step['title']:
                    continue

                for stmnt in step['title']['statements']:
                    # Find the person
                    pq = Person.objects.filter(
                        source_link__endswith=stmnt['person_source_link'])
                    match_count = pq.count()

                    if match_count == 1:
                        person_item = pq.first()
                        st_data = {
                            'speech_type': stmnt['statement_type'],
                            'protocol_url': stmnt['protocol_link']
                        }
                        st_item, st_created = Statement.objects.update_or_create(
                            index=stmnt['index'],
                            person=person_item,
                            step=step_item,
                            defaults=st_data)
                        if st_created:
                            log.msg(u"Created Statement by {} on {}".format(
                                green(
                                    u'[{}]'.format(person_item.full_name)),
                                step_item.date), level=log.DEBUG)
                        else:
                            log.msg(u"Updated Statement by {} on {}".format(
                                green(
                                    u'[{}]'.format(person_item.full_name)),
                                step_item.date), level=log.DEBUG)
                    else:
                        # We can't save statements if we can't find the
                        # Person.  Report the number of matches only when
                        # the link is ambiguous (this condition used to be
                        # inverted).
                        if match_count > 1:
                            ambiguity = u", but {} persons matching found!".format(
                                match_count)
                        else:
                            ambiguity = u""
                        self.logger.warning(
                            red(u"Skipping Statement by {}: Person with source_link {} does{} exist{}").format(
                                green(
                                    u'[{}]'.format(stmnt['person_name'])),
                                blue(
                                    u"[{}]".format(stmnt['person_source_link'])),
                                red(u"{}").format(
                                    u"" if match_count else u" not"),
                                ambiguity
                            ))
                        continue

    def parse_inquiry_response(self, response):
        """
        Callback function for parsing the inquiry responses

        Creates or updates the InquiryResponse, attaches its documents and
        category, links it to the Inquiry from ``response.meta`` and marks
        that Inquiry as 'response_received'.  Without an Inquiry in the meta
        data the parsed values are only printed.
        """
        # allow testing single urls for parsing errors
        inquiry_item = response.meta.get('inquiry_item', None)
        source_link = response.url
        parl_id = response.url.split('/')[-2]
        title = INQUIRY.TITLE.xt(response)
        description = INQUIRY.RESPONSEDESCRIPTION.xt(response)
        LLP = inquiry_item.legislative_period if inquiry_item else None
        category = INQUIRY.CATEGORY.xt(response)

        # Get or create Category object for the inquiry and log to screen if new
        # category is created.
        cat, created = Category.objects.get_or_create(title=category)
        if created:
            log.msg(u"Created category {}".format(
                green(u'[{}]'.format(category))), level=log.DEBUG)

        sender_id = INQUIRY.RESPONSESENDER.xt(response)
        try:
            sender_object = Person.objects.get(parl_id=sender_id)
        except Exception:
            self.logger.warning(red(u'Sender "{}" was not found in database, skipping Inquiry {} in LLP {}'.format(
                sender_id, parl_id, LLP)))
            return

        # allow testing single urls for parsing errors
        if not inquiry_item:
            print(locals())
            return

        # Create or update InquiryResponse item
        inquiryresponse_item, inquiryresponse_created = InquiryResponse.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=LLP,
            defaults={
                'title': title,
                'source_link': source_link,
                'description': description,
                'sender': sender_object
            }
        )

        # Attach foreign keys
        inquiryresponse_item.documents = self.parse_response_docs(response)
        inquiryresponse_item.category = cat

        # Save InquiryResponse object
        inquiryresponse_item.save()

        if inquiryresponse_created:
            logtext = u"[{} of {}] Created InquiryResponse {} with ID {}, LLP {} @ {}"
        else:
            logtext = u"[{} of {}] Updated InquiryResponse {} with ID {}, LLP {} @ {}"

        logtext = logtext.format(
            self.SCRAPED_COUNTER,
            self.TOTAL_COUNTER,
            cyan(title),
            cyan(u"{}".format(parl_id)),
            green(u"{}".format(LLP)),
            blue(response.url)
        )
        log.msg(logtext, level=log.DEBUG if self.SCRAPED_COUNTER != 0 else log.INFO)

        inquiry_item.response = inquiryresponse_item
        inquiry_item.status = 'response_received'
        inquiry_item.save()
from ansicolor import cyan
from ansicolor import green
from ansicolor import red
from ansicolor import white

# Demo script: each step colours a word first, then prints the sentence
# that embeds it.

# Two plain colours in a single sentence.
red_word = red("red")
green_word = green("green")
print("Let's try two colors: %s and %s!" % (red_word, green_word))

# Bold text.
bold_word = red("bold", bold=True)
print("It's also easy to produce text in %s," % bold_word)

# Reverse video.
reverse_word = green("reverse", reverse=True)
print("...%s," % reverse_word)

# Both attributes combined.
bold_reverse_words = cyan("bold and reverse", bold=True, reverse=True)
print("...and %s." % bold_reverse_words)
def parse_list(self, response):
    """
    Parse a person list page and schedule detail-page requests.

    For every person extracted from the page this creates or updates the
    Person (keyed by ``parl_id``), makes sure a mandate exists for the
    function / legislative period named in the URL query string (``NRBR``
    and ``GP``), and refreshes the person's latest mandate when a mandate
    was created.

    Returns the list of detail-page requests, one per person whose
    ``parl_id`` was not yet in ``self.persons_scraped``.
    """
    persons = PERSON.LIST.xt(response)
    logger.info(u"parsing list: {}, {} persons".format(
        green(u'[{}]'.format(response.url)), len(persons)))
    callback_requests = []

    # which llp are we in?
    urloptions = response.url.split('?')[1]
    opts = dict(urlparse.parse_qsl(urloptions))
    llp_roman = opts['GP']
    llp_item = LegislativePeriod.objects.get(roman_numeral=llp_roman)

    # function string
    function = opts['NRBR']
    function_str = self.RSS_TO_FUNCTION[function]
    function_item, _ = Function.objects.get_or_create(
        title=function_str)

    logger.info(
        u"Scraping {} persons for LLP {}, {}".format(
            len(persons), llp_roman, function))

    # The chamber filter depends only on the function of this list page, so
    # build it once instead of once per mandate.
    if 'Nationalrat' in function_item.title:
        chamber_filter = Q(function__title__contains='Nationalrat')
    else:
        chamber_filter = Q(function__title__contains='Bundesrat')

    # Iterate all persons
    for p in persons:
        # Extract basic data
        parl_id = p['source_link'].split('/')[-2]
        p['source_link'] = "{}{}".format(BASE_HOST, p['source_link'])

        # Set once a mandate is created so the latest mandate is refreshed.
        changed = False

        # Create or update simple person's item
        try:
            person_data = {
                'reversed_name': p['reversed_name'],
                'source_link': p['source_link']
            }
            person_item, created_person = Person.objects.update_or_create(
                parl_id=parl_id, defaults=person_data)
        except Exception as e:
            # Unicode template: a byte-string template cannot embed a
            # non-ASCII name under Python 2.
            logger.warning(u"Error saving Person {}: {}".format(
                green(u'[{}]'.format(p['reversed_name'])),
                e
            ))
            continue

        if created_person:
            logger.debug(u"Created Person {}".format(
                green(u'[{}]'.format(p['reversed_name']))))
        else:
            logger.debug(u"Updated Person {}".format(
                green(u"[{}]".format(p['reversed_name']))
            ))

        for mandate in p['mandates']:
            party_item = self.get_party_item(mandate)
            state_item = self.get_state_item(p['electoral_state'])

            # Create and append mandate
            try:
                mandate_items = person_item.mandate_set.filter(
                    chamber_filter
                ).filter(
                    legislative_period=llp_item,
                    party=party_item
                )
                if not mandate_items:
                    person_item.mandate_set.create(
                        function=function_item,
                        legislative_period=llp_item,
                        party=party_item,
                        state=state_item
                    )
                    changed = True
            except Exception as e:
                # Log the failure and carry on with the next mandate; an
                # interactive debugger must not block an unattended crawl.
                logger.warning(
                    red(u"Error saving Mandate {} ({}) / Person {}: {}".format(
                        function_item, party_item, person_item.pk, e)))

        if changed:
            # In case we added/modified a mandate now,
            latest_mandate_item = person_item.get_latest_mandate()
            person_item.latest_mandate = latest_mandate_item
            logger.debug(
                cyan(u"Latest mandate for {} is now {}".format(
                    person_item, latest_mandate_item)))
            person_item.save()

        # First time we encounter a person, we scan her detail page too
        if parl_id not in self.persons_scraped:
            # Create Detail Page request
            req = scrapy.Request(p['source_link'],
                                 callback=self.parse_person_detail)
            req.meta['person'] = {
                'reversed_name': p['reversed_name'],
                'source_link': p['source_link'],
                'parl_id': parl_id
            }
            callback_requests.append(req)
            self.persons_scraped.append(parl_id)

    # Hand the collected detail-page requests back so they get scheduled.
    return callback_requests
def parse(self, response):
    """
    Parse one inquiry page and create or update its Inquiry record.

    The inquiry is skipped (returning None) when its timestamp shows no
    changes, or when a sender or the receiver cannot be looked up as a
    Person. Otherwise the Inquiry is written together with its keywords,
    documents, category and senders, and its steps are parsed.

    Returns a list of follow-up requests: it holds one request for the
    inquiry response page when ``parse_steps`` yields a link, and is empty
    otherwise.
    """
    self.SCRAPED_COUNTER += 1

    source_link = response.url
    category = INQUIRY.CATEGORY.xt(response)
    parl_id = response.url.split('/')[-2]
    title = INQUIRY.TITLE.xt(response)
    description = INQUIRY.DESCRIPTION.xt(response)
    callback_requests = []
    ts = GENERIC.TIMESTAMP.xt(response)

    # Inquiries from Bundesrat don't have an LLP => set None
    if "BR" in category:
        LLP = None
    else:
        LLP = LegislativePeriod.objects.get(
            roman_numeral=response.url.split('/')[-4])

    if not self.IGNORE_TIMESTAMP and not self.has_changes(parl_id, LLP, response.url, ts):
        self.logger.debug(
            green(u"[{} of {}] Skipping Inquiry, no changes: {}".format(
                self.SCRAPED_COUNTER,
                self.TOTAL_COUNTER,
                title)))
        return

    # Get or create Category object for the inquiry and log to screen if new
    # category is created.
    cat, created = Category.objects.get_or_create(title=category)
    if created:
        log.debug(u"Created category {}".format(
            green(u'[{}]'.format(category))))

    # An inquiry can have multiple senders, but only a single recipient.
    # The ids are extracted once, outside the handlers, so the warning
    # reports exactly what was looked up. Lookup failures skip the inquiry.
    sender_ids = INQUIRY.SENDER.xt(response)
    try:
        sender_objects = [
            Person.objects.get(parl_id=sender_id) for sender_id in sender_ids
        ]
    except Exception:
        log.warning(red(u'Sender "{}" was not found in database, skipping Inquiry {} in LLP {}'.format(
            sender_ids, parl_id, LLP)))
        return

    receiver_id = INQUIRY.RECEIVER.xt(response)
    try:
        receiver_object = Person.objects.get(parl_id=receiver_id)
    except Exception:
        log.warning(red(u'Receiver "{}" was not found in database, skipping Inquiry {} in LLP {}'.format(
            receiver_id, parl_id, LLP)))
        return

    # Create or update Inquiry item
    inquiry_item, inquiry_created = Inquiry.objects.update_or_create(
        parl_id=parl_id,
        legislative_period=LLP,
        defaults={
            'title': title,
            'source_link': source_link,
            'description': description,
            'receiver': receiver_object,
            'ts': ts
        }
    )

    if inquiry_created:
        inquiry_item.status = 'offen'

    # Attach foreign keys
    inquiry_item.keywords = self.parse_keywords(response)
    inquiry_item.documents = self.parse_docs(response)
    inquiry_item.category = cat
    inquiry_item.sender = sender_objects

    response.meta['inquiry_item'] = inquiry_item

    # Dringliche / Urgent inquiries have a different structure for steps
    # and history. This case distinction accommodates these different
    # structures.
    if any("Dringliche" in '{}'.format(s) for s in inquiry_item.keywords.all()):
        if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'):
            self.parse_parliament_steps(response)
    else:
        response_link = self.parse_steps(response)
        if response_link:
            post_req = scrapy.Request("{}{}".format(BASE_HOST, response_link),
                                      callback=self.parse_inquiry_response,
                                      dont_filter=True)
            post_req.meta['inquiry_item'] = inquiry_item

            callback_requests.append(post_req)

    # Save Inquiry item and log to terminal if created or updated.
    inquiry_item.save()

    if inquiry_created:
        logtext = u"[{} of {}] Created Inquiry {} with ID {}, LLP {} @ {}"
    else:
        logtext = u"[{} of {}] Updated Inquiry {} with ID {}, LLP {} @ {}"

    # Exactly one argument per placeholder in the template.
    logtext = logtext.format(
        self.SCRAPED_COUNTER,
        self.TOTAL_COUNTER,
        cyan(title),
        cyan(u"{}".format(parl_id)),
        green(unicode(LLP)),
        blue(response.url)
    )
    # Every 1000th item is logged at INFO, all others at DEBUG.
    log.msg(logtext,
            level=log.INFO if self.SCRAPED_COUNTER % 1000 == 0 else log.DEBUG)

    return callback_requests
def parse(self, response):
    """
    Parse a person list page and schedule detail-page requests.

    The legislative period and function come from the ``GP`` and ``NRBR``
    options of the URL query string. Each listed person is created or
    updated (keyed by ``parl_id``), the matching mandate is attached, and
    the latest mandate is refreshed when a mandate was newly attached.

    Returns the list of detail-page requests, one per person whose
    ``parl_id`` was not yet in ``self.persons_scraped``.
    """
    persons = PERSON.LIST.xt(response)
    callback_requests = []

    # which llp are we in?
    urloptions = response.url.split("?")[1]
    option_pairs = [opt.split("=") for opt in urloptions.split("&")]
    llp_roman = [pair[1] for pair in option_pairs if pair[0] == "GP"][0]
    llp_item = LegislativePeriod.objects.get(roman_numeral=llp_roman)

    # function string
    function = [pair[1] for pair in option_pairs if pair[0] == "NRBR"][0]
    function_str = self.RSS_TO_FUNCTION[function]
    function_item, _ = Function.objects.get_or_create(title=function_str)

    # Log the numeral itself rather than the list it was picked from.
    self.logger.info(u"Scraping {} persons for LLP {}".format(len(persons), llp_roman))

    # Iterate all persons
    for p in persons:
        # Extract basic data
        parl_id = p["source_link"].split("/")[-2]
        p["source_link"] = "{}{}".format(BASE_HOST, p["source_link"])

        # Set when a mandate is newly attached to the person.
        changed = False

        # Create or update simple person's item. Only parl_id identifies
        # the person; the other fields go into ``defaults`` so that a
        # changed name or link updates the row instead of adding a new one.
        person_item, created_person = Person.objects.update_or_create(
            parl_id=parl_id,
            defaults={
                "source_link": p["source_link"],
                "reversed_name": p["reversed_name"],
            },
        )
        if created_person:
            self.logger.info(u"Created Person {}".format(green(u"[{}]".format(p["reversed_name"]))))
        else:
            self.logger.info(u"Updated Person {}".format(green(u"[{}]".format(p["reversed_name"]))))

        for mandate in p["mandates"]:
            party_item = self.get_party_item(mandate)
            state_item = self.get_state_item(p["electoral_state"])

            # Create and append mandate
            try:
                mandate_item, m_created = Mandate.objects.update_or_create(
                    function=function_item, legislative_period=llp_item, party=party_item, state=state_item
                )
            except Exception as e:
                # Log and move on to the next mandate: there is no mandate
                # to attach, and a debugger prompt would stall the crawl.
                self.logger.warning(
                    red(u"Error saving Mandate {} ({}): {}".format(function_item, party_item, e)))
                continue

            if mandate_item not in person_item.mandates.all():
                changed = True
                person_item.mandates.add(mandate_item)

        if changed:
            # In case we added/modified a mandate now,
            latest_mandate_item = person_item.get_latest_mandate()
            person_item.latest_mandate = latest_mandate_item
            self.logger.info(cyan(u"Latest mandate for {} is now {}".format(person_item, latest_mandate_item)))
            person_item.save()

        # First time we encounter a person, we scan her detail page too
        if parl_id not in self.persons_scraped:
            # Create Detail Page request
            req = scrapy.Request(p["source_link"], callback=self.parse_person_detail)
            req.meta["person"] = {
                "reversed_name": p["reversed_name"],
                "source_link": p["source_link"],
                "parl_id": parl_id,
            }
            callback_requests.append(req)
            self.persons_scraped.append(parl_id)

    return callback_requests
def parse(self, response):
    """
    Scrape one law page: create or update the Law and parse its steps.

    Nothing is written when timestamps are honoured and ``has_changes``
    reports no change. Otherwise the Law is stored with its keywords,
    category and documents, the result is logged, and the step parsers are
    run for whichever procedure tabs the page contains.

    Always returns None; results of the step parsers are not passed on.
    """
    self.SCRAPED_COUNTER += 1

    roman_numeral = response.url.split('/')[-4]
    legislative_period = LegislativePeriod.objects.get(
        roman_numeral=roman_numeral)

    # Extract fields
    timestamp = GENERIC.TIMESTAMP.xt(response)
    law_title = LAW.TITLE.xt(response)
    law_parl_id = LAW.PARL_ID.xt(response)
    law_status = LAW.STATUS.xt(response)

    # Guard: leave early when timestamps count and nothing has changed.
    if not (self.IGNORE_TIMESTAMP
            or self.has_changes(law_parl_id, legislative_period,
                                response.url, timestamp)):
        skip_text = u"[{} of {}] Skipping Law, no changes: {}".format(
            self.SCRAPED_COUNTER,
            self.TOTAL_COUNTER,
            law_title)
        self.logger.info(green(skip_text))
        return

    # Extract foreign keys
    category_title = LAW.CATEGORY.xt(response)
    law_description = LAW.DESCRIPTION.xt(response)

    # Create category if we don't have it yet
    category_item, category_created = Category.objects.get_or_create(
        title=category_title)
    if category_created:
        log.msg(u"Created category {}".format(
            green(u'[{}]'.format(category_title))))

    # Create and save Law
    law_item, law_created = Law.objects.update_or_create(
        parl_id=law_parl_id,
        legislative_period=legislative_period,
        defaults={
            'title': law_title,
            'status': law_status,
            'description': law_description,
            'ts': timestamp,
            'source_link': response.url,
        })

    # Attach foreign keys
    law_item.keywords = self.parse_keywords(response)
    law_item.category = category_item
    law_item.documents = self.parse_docs(response)

    law_item.save()

    # Log our progress
    template = (u"[{} of {}] Created {} with id {}, LLP {} @ {}"
                if law_created
                else u"[{} of {}] Updated {} with id {}, LLP {} @ {}")
    progress_text = template.format(
        self.SCRAPED_COUNTER,
        self.TOTAL_COUNTER,
        red(law_title),
        cyan(u"[{}]".format(law_parl_id)),
        green(unicode(legislative_period)),
        blue(response.url)
    )
    log.msg(progress_text, level=log.INFO)

    response.meta['law_item'] = law_item

    # is the tab 'Parlamentarisches Verfahren available?'
    parliament_tab = response.xpath(
        '//h2[@id="tab-ParlamentarischesVerfahren"]')
    if parliament_tab:
        self.parse_parliament_steps(response)

    pre_parliament_tab = response.xpath(
        '//h2[@id="tab-VorparlamentarischesVerfahren"]')
    if pre_parliament_tab:
        self.parse_pre_parliament_steps(response)
def parse(self, response):
    """
    Callback for a law page.

    Looks up the legislative period from the URL, extracts the law's
    fields, and returns early when timestamps are honoured and
    ``has_changes`` reports nothing new. Otherwise it writes the Law with
    its category, keywords and documents, logs the outcome, and runs the
    step parsers for the procedure tabs present on the page.

    Returns None on every path.
    """
    self.SCRAPED_COUNTER += 1

    url = response.url
    llp = LegislativePeriod.objects.get(roman_numeral=url.split('/')[-4])

    # Extract fields
    changed_at = GENERIC.TIMESTAMP.xt(response)
    heading = LAW.TITLE.xt(response)
    ident = LAW.PARL_ID.xt(response)
    state = LAW.STATUS.xt(response)

    if not self.IGNORE_TIMESTAMP:
        if not self.has_changes(ident, llp, response.url, changed_at):
            # Nothing new on this page.
            self.logger.info(
                green(u"[{} of {}] Skipping Law, no changes: {}".format(
                    self.SCRAPED_COUNTER, self.TOTAL_COUNTER, heading)))
            return

    # Extract foreign keys
    category_name = LAW.CATEGORY.xt(response)
    summary = LAW.DESCRIPTION.xt(response)

    # Create category if we don't have it yet
    category_lookup = Category.objects.get_or_create(title=category_name)
    category_obj, is_new_category = category_lookup
    if is_new_category:
        category_label = green(u'[{}]'.format(category_name))
        log.msg(u"Created category {}".format(category_label))

    # Create and save Law
    field_values = {}
    field_values['title'] = heading
    field_values['status'] = state
    field_values['description'] = summary
    field_values['ts'] = changed_at
    field_values['source_link'] = response.url
    law_item, law_created = Law.objects.update_or_create(
        parl_id=ident, legislative_period=llp, defaults=field_values)

    # Attach foreign keys
    law_item.keywords = self.parse_keywords(response)
    law_item.category = category_obj
    law_item.documents = self.parse_docs(response)
    law_item.save()

    # Log our progress
    if law_created:
        message = u"[{} of {}] Created {} with id {}, LLP {} @ {}"
    else:
        message = u"[{} of {}] Updated {} with id {}, LLP {} @ {}"
    message = message.format(
        self.SCRAPED_COUNTER, self.TOTAL_COUNTER,
        red(heading), cyan(u"[{}]".format(ident)),
        green(unicode(llp)), blue(response.url))
    log.msg(message, level=log.INFO)

    response.meta['law_item'] = law_item

    # is the tab 'Parlamentarisches Verfahren available?'
    if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'):
        self.parse_parliament_steps(response)

    # Same check for the pre-parliamentary procedure tab.
    if response.xpath('//h2[@id="tab-VorparlamentarischesVerfahren"]'):
        self.parse_pre_parliament_steps(response)