def log_context_attributes(ctx: Context) -> None:
    """Log the context attributes stored on the context.

    Args:
        ctx: The behave context
    """
    LOGGER.debug("Current attributes saved to context:")
    if ctx.config.logging_level == 10:  # 10 == logging.DEBUG
        # For each layer of the context, log the layer and print a table of its attributes
        print("\n")
        for layer in ctx._stack:
            print(ansicolor.blue(
                "-------------------------------------------------------------------------"))
            print(ansicolor.blue(f"CONTEXT LAYER: {layer['@layer']}"))
            print(ansicolor.blue(
                "-------------------------------------------------------------------------"))
            # Pretty-print each key/value pair as a table row
            for key in layer:
                # Don't log standard attributes that are None
                if layer[key] is not None:
                    print("%s %s| %r" % (key, " " * (30 - len(key)), layer[key]))
            print(ansicolor.blue(
                "-------------------------------------------------------------------------"))
            print("\n")
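A minimal usage sketch for the function above. The hook wiring is an assumption for illustration (behave looks for hooks like this in environment.py); it is not part of the original snippet:

# Hypothetical environment.py hook: dump all context layers before each
# scenario when behave runs at logging level DEBUG (10).
def before_scenario(context, scenario):
    log_context_attributes(context)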
def restoreDump(args):
    # Setup raw YakDB connection
    conn = YakDB.Connection()
    conn.connect(args.req_endpoint)
    # Filenames to restore from
    filenames = __getDumpFilenames(args)
    # NOTE: Partial & incremental restore is supported.
    # Restore every table if the corresponding file exists
    if not args.no_documents:
        if not os.path.isfile(filenames[0]):
            print(red("Can't find document table file " + filenames[0], bold=True))
        else:  # It's a regular file
            print(blue("Restoring document table from " + filenames[0], bold=True))
            importYDFDump(conn, filenames[0], 1)
    if not args.no_entities:
        if not os.path.isfile(filenames[1]):
            print(red("Can't find entity table file " + filenames[1], bold=True))
        else:  # It's a regular file
            print(blue("Restoring entity table from " + filenames[1], bold=True))
            importYDFDump(conn, filenames[1], 2)
    if not args.no_document_idx:
        if not os.path.isfile(filenames[2]):
            print(red("Can't find document index table file " + filenames[2], bold=True))
        else:  # It's a regular file
            print(blue("Restoring document index table from " + filenames[2], bold=True))
            importYDFDump(conn, filenames[2], 3)
    if not args.no_entity_idx:
        if not os.path.isfile(filenames[3]):
            print(red("Can't find entity index table file " + filenames[3], bold=True))
        else:  # It's a regular file
            print(blue("Restoring entity index table from " + filenames[3], bold=True))
            importYDFDump(conn, filenames[3], 4)
def truncate(args):
    "Delete data from one or more tables"
    # Check if the user is sure
    if not args.yes_i_know_what_i_am_doing:
        print(red("This will delete all your Translatron data. If you are sure, please use --yes-i-know-what-i-am-doing", bold=True))
        return
    # Setup raw YakDB connection
    conn = YakDB.Connection()
    conn.connect(args.req_endpoint)
    # Truncate every table unless it was excluded on the command line
    if not args.no_documents:
        print(blue("Truncating document table... ", bold=True))
        if args.hard:
            conn.truncateTable(1)
        else:
            conn.deleteRange(1, None, None, None)
    if not args.no_entities:
        print(blue("Truncating entity table... ", bold=True))
        if args.hard:
            conn.truncateTable(2)
        else:
            conn.deleteRange(2, None, None, None)
    if not args.no_document_idx:
        print(blue("Truncating document index table... ", bold=True))
        if args.hard:
            conn.truncateTable(3)
        else:
            conn.deleteRange(3, None, None, None)
    if not args.no_entity_idx:
        print(blue("Truncating entity index table... ", bold=True))
        if args.hard:
            conn.truncateTable(4)
        else:
            conn.deleteRange(4, None, None, None)
def checkConnection(args):
    import YakDB
    # Check request/reply connection
    print(blue("Checking request/reply connection...", bold=True))
    conn = YakDB.Connection()
    conn.connect(args.req_endpoint)
    # Request server info
    print((conn.serverInfo()).decode("utf-8"))
    print(green("REQ/REP connection attempt successful"))
    # Check push/pull connection
    print(blue("Checking push/pull connection...", bold=True))
    conn = YakDB.Connection()
    conn.usePushMode()
    conn.connect(args.push_endpoint)
    print(green("PUSH/PULL connection attempt successful"))
def startWebsocketServer():
    print(blue("Websocket server starting up..."))
    try:
        import asyncio
    except ImportError:
        # Trollius >= 0.3 was renamed
        import trollius as asyncio
    # asyncio only sets up an event loop in the main thread; elsewhere we need to create one
    if threading.current_thread().name != 'MainThread':
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    factory = WebSocketServerFactory("ws://0.0.0.0:9000", debug=False)
    factory.protocol = TranslatronProtocol

    loop = asyncio.get_event_loop()
    server = loop.create_server(factory, '0.0.0.0', 9000)
    server = loop.run_until_complete(server)

    try:
        loop.run_forever()
    except KeyboardInterrupt:
        pass
    finally:
        server.close()
        loop.close()
def parse(self, response):
    # Extract fields
    title = LAW.TITLE.xt(response)
    parl_id = LAW.PARL_ID.xt(response)
    status = LAW.STATUS.xt(response)
    LLP = LegislativePeriod.objects.get(
        roman_numeral=response.url.split('/')[-4])

    # Extract foreign keys
    category = LAW.CATEGORY.xt(response)
    description = LAW.DESCRIPTION.xt(response)

    # Create category if we don't have it yet
    cat, created = Category.objects.get_or_create(title=category)
    if created:
        log.msg(u"Created category {}".format(
            green(u'[{}]'.format(category))))

    # Create and save Law
    law_data = {
        'title': title,
        'status': status,
        'description': description
    }
    law_item, law_created = Law.objects.update_or_create(
        parl_id=parl_id,
        legislative_period=LLP,
        source_link=response.url,
        defaults=law_data)

    # Attach foreign keys
    law_item.keywords = self.parse_keywords(response)
    law_item.category = cat
    law_item.documents = self.parse_docs(response)

    law_item.save()

    # Log our progress
    if law_created:
        logtext = u"Created {} with id {}, LLP {} @ {}"
    else:
        logtext = u"Updated {} with id {}, LLP {} @ {}"
    logtext = logtext.format(
        red(title),
        cyan(u"[{}]".format(parl_id)),
        green(str(LLP)),
        blue(response.url)
    )
    log.msg(logtext, level=log.INFO)

    response.meta['law_item'] = law_item

    # Is the tab 'Parlamentarisches Verfahren' available?
    if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'):
        self.parse_parliament_steps(response)

    if response.xpath('//h2[@id="tab-VorparlamentarischesVerfahren"]'):
        self.parse_pre_parliament_steps(response)
def parse_inquiry_response(self, response):
    """
    Callback function for parsing the inquiry responses
    """
    inquiry_item = response.meta['inquiry_item']
    source_link = response.url
    parl_id = response.url.split('/')[-2]
    title = INQUIRY.TITLE.xt(response)
    description = INQUIRY.RESPONSEDESCRIPTION.xt(response)
    LLP = inquiry_item.legislative_period
    category = INQUIRY.CATEGORY.xt(response)

    # Get or create Category object for the inquiry and log to screen if a
    # new category is created.
    cat, created = Category.objects.get_or_create(title=category)
    if created:
        log.msg(u"Created category {}".format(
            green(u'[{}]'.format(category))))

    try:
        sender_object = Person.objects.get(
            parl_id=INQUIRY.RESPONSESENDER.xt(response))
    except:
        log.msg(
            red(u'Sender was not found in database, skipping Inquiry {} in LLP {}'
                .format(parl_id, LLP)))
        return

    # Create or update InquiryResponse item
    inquiryresponse_item, inquiryresponse_created = InquiryResponse.objects.update_or_create(
        parl_id=parl_id,
        legislative_period=LLP,
        defaults={
            'title': title,
            'source_link': source_link,
            'description': description,
            'sender': sender_object
        })

    # Attach foreign keys
    inquiryresponse_item.documents = self.parse_response_docs(response)
    inquiryresponse_item.category = cat

    # Save InquiryResponse object
    inquiryresponse_item.save()

    if inquiryresponse_created:
        logtext = u"[{} of {}] Created InquiryResponse {} with ID {}, LLP {} @ {}"
    else:
        logtext = u"[{} of {}] Updated InquiryResponse {} with ID {}, LLP {} @ {}"
    logtext = logtext.format(self.SCRAPED_COUNTER,
                             self.TOTAL_COUNTER,
                             cyan(title),
                             cyan(u"{}".format(parl_id)),
                             green(str(LLP)),
                             blue(response.url))
    log.msg(logtext, level=log.INFO)

    inquiry_item.response = inquiryresponse_item
    inquiry_item.save()
    return
def importEntities(args):
    for infile in args.infile:
        basename = os.path.basename(infile)
        if re.match(r"uniprot_[a-z]+\.dat\.gz", basename):
            print(blue("Importing UniProt file..."))
            from Translatron.Entities.UniProtImporter import importUniprot
            importUniprot(args, infile)
        elif re.match(r"d\d{4}\.bin", basename):
            print(blue("Importing MeSH file..."))
            from Translatron.Entities.MeSHImporter import importMeSH
            importMeSH(args, infile)
        elif re.match(r"[a-z][a-z]wiki.+titles.+\.gz", basename):
            print(blue("Importing Wikipedia page title file..."))
            from Translatron.Entities.WikipediaImporter import importWikimediaPagelist
            importWikimediaPagelist(args, infile)
        else:
            print(red("Can't interpret entity input file %s (expected e.g. uniprot_sprot.dat.gz for UniProt)" % basename))
def print_section(text: str, color: bool = False) -> None:
    if color:
        text = ansicolor.blue(text.ljust(60, ' '), reverse=True)
        line = ''
    else:
        line = '\n' + ''.ljust(60, '-')
    print('\n' + text + line)
def test_colordiff():
    x, y = colordiff("hi bob", "hi there",
                     color_x=Colors.Red,
                     color_y=Colors.Blue)

    fx = lambda s: red(s, reverse=True)
    fy = lambda s: blue(s, reverse=True)

    assert x == "hi " + fx("b") + fx("o") + fx("b")
    assert y == "hi " + fy("t") + fy("h") + fy("e") + fy("r") + fy("e")
def compact(args):
    "Compact one or more tables"
    # Setup raw YakDB connection
    conn = YakDB.Connection()
    conn.connect(args.req_endpoint)
    # Compact every table unless it was excluded on the command line
    if not args.no_documents:
        print(blue("Compacting document table... ", bold=True))
        conn.compactRange(1)
    if not args.no_entities:
        print(blue("Compacting entity table... ", bold=True))
        conn.compactRange(2)
    if not args.no_document_idx:
        print(blue("Compacting document index table... ", bold=True))
        conn.compactRange(3)
    if not args.no_entity_idx:
        print(blue("Compacting entity index table... ", bold=True))
        conn.compactRange(4)
def exportDump(args):
    # Setup raw YakDB connection
    conn = YakDB.Connection()
    conn.connect(args.req_endpoint)
    # Filenames to dump to
    filenames = __getDumpFilenames(args)
    # Dump every table
    if not args.no_documents:
        print(blue("Dumping document table to " + filenames[0], bold=True))
        dumpYDF(conn, filenames[0], 1)
    if not args.no_entities:
        print(blue("Dumping entity table to " + filenames[1], bold=True))
        dumpYDF(conn, filenames[1], 2)
    if not args.no_document_idx:
        print(blue("Dumping document index table to " + filenames[2], bold=True))
        dumpYDF(conn, filenames[2], 3)
    if not args.no_entity_idx:
        print(blue("Dumping entity index table to " + filenames[3], bold=True))
        dumpYDF(conn, filenames[3], 4)
def startHTTPServer(http_port=8080):
    print(blue("HTTP server starting up, listening on port %d..." % http_port))
    # Fixes six issue: http://goo.gl/ebeWDN
    cherrypy.config.update({'engine.autoreload.on': False})
    conf = {
        'global': {
            'server.socket_host': '0.0.0.0',
            'server.socket_port': http_port
        },
        '/': {
            'tools.staticdir.on': True,
            'tools.staticdir.dir': os.path.join(os.path.abspath(os.getcwd()), "static"),
            'tools.staticdir.index': 'index.html',
        },
    }
    print("Static: " + os.path.join(os.path.abspath(os.getcwd()), "static"))
    cherrypy.quickstart(TranslatronServer(), "/", conf)
def f_check():
    global timeout, base_url
    timeout = args.timeout
    # Capture the SSL warnings with the standard logging module
    logging.captureWarnings(True)
    if args.ssl:
        base_url = "https://{}:{}/{}".format(args.host, args.port, args.url)
    else:
        base_url = "http://{}:{}/{}".format(args.host, args.port, args.url)
    f_verbose("[*] Program will check out WebLogic for CVE-2017-3506 & CVE-2017-10271 vulnerability.")
    if f_run():
        print red("[x]") + " Your system is potentially vulnerable to XML Serialization attack!"
    else:
        print green("[*]") + " Your system is " + blue("safe!")
def invoke(cwd, args):
    logger.debug("Invoking: [%s] '%s'" % (cwd, ' '.join(args)))
    popen = subprocess.Popen(args, cwd=cwd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
    (out, err) = popen.communicate()
    out = str(out).strip()
    err = str(err).strip()
    ret = popen.returncode
    if out:
        lines = out.split('\n')
        for line in lines:
            logger.debug("> %s" % ansicolor.blue(line))
    if ret:
        if not err:
            err = ret
        logger.warn("Returned %s" % (err))
    return ret, out, err
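A usage sketch for invoke() above; the working directory and git command are illustrative assumptions, not taken from the original snippet:

# Run a command in a working copy and log its stdout line-by-line in blue.
ret, out, err = invoke("/tmp/myrepo", ["git", "status", "--short"])
if ret:
    logger.warn("command failed: %s" % err)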
def readAndProcessXLIFF(lang, filename, fileid, indexer, autotranslator,
                        upload=False, approve=False, autotranslate=True,
                        preindex=False, overwrite=False,
                        fullauto_account=False, postproc=identity):
    soup = parse_xliff_file(filename)
    autotranslated_count = process_xliff_soup(filename, soup, autotranslator, indexer,
                                              autotranslate=autotranslate,
                                              preindex=preindex,
                                              overwrite=overwrite,
                                              postproc=postproc)
    # If we are not autotranslating, stop here; no need to export
    if not autotranslate:
        return 0
    # Export XLIFF
    outdir = "output-{}".format(lang)
    outfilename = filename.replace("cache/{}".format(lang), outdir)
    # Create directories & export
    if autotranslated_count > 0:
        os.makedirs(os.path.dirname(outfilename), exist_ok=True)
        # print(black("Exporting to {}".format(outfilename), bold=True))
        export_xliff_file(soup, outfilename)
    # Upload if enabled
    if upload and autotranslated_count > 0:
        basename = os.path.basename(filename)
        print(blue("Uploading {} (approve={})...".format(basename, approve)))
        upload_file(outfilename, fileid, auto_approve=approve, lang=lang,
                    fullauto_account=fullauto_account, duplicates=overwrite)
        print(green("Uploaded {}".format(basename)))
    return autotranslated_count
def f_verbose(value):
    if ("[X]" in value) or ("[+]" in value):
        f_save(value + '\n')
    # Color everything between backticks blue
    col_cred = value.split('`')
    neutrino = ''
    for index, item in enumerate(col_cred):
        if index & 1:
            neutrino = neutrino + blue(item)
        else:
            neutrino += item
    if "[X]" in neutrino:
        print neutrino.replace("[X]", red("[X]"))
    elif "[+]" in neutrino:
        print neutrino.replace("[+]", yellow("[+]"))
    elif args.verbose:
        print neutrino.replace("[*]", green("[*]")).replace("[!]", magenta("[!]"))
    return
def parse_check_format(intv, verbose=False):
    """
    Checks if a string could contain an interval.

    This function just exists to shorten read_timeline() to something that
    Python won't throw warnings about. There's no need to call this outside
    that function.

    :param intv: is the string to check.
    :param verbose: is an optional flag. When true, extra parsing information
        is printed to the console. Defaults to false.
    """
    # If the string is shorter than 5 characters, it can't be [#] [#] [#],
    # since that's 3 numbers + 2 spaces. If it is less than 5, it's not an
    # interval.
    if len(intv) < 5:
        if verbose:
            print(black("\tinvalid length > ", bold=True), end='')
            print(add_quotes(intv))
        return False

    # Haven't ruled it out yet
    if verbose:
        print(blue("\tpossible interval > ", bold=True), end='')
        print(add_quotes(intv))
    return True
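A quick illustration of the length check above; these direct calls are for demonstration only, since the function is normally used only inside read_timeline():

parse_check_format("1 2 3")  # 5 characters, could be "[#] [#] [#]" -> True
parse_check_format("1 2")    # 3 characters, too short for an interval -> False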
def parse_parliament_steps(self, response):
    """
    Callback function to parse the additional
    'Parlamentarisches Verfahren' page
    """
    law_item = response.meta['law_item']

    phases = LAW.PHASES.xt(response)

    for phase in phases:
        # Create phase if we don't have it yet
        phase_item, created = Phase.objects.get_or_create(
            title=phase['title'])
        if created:
            log.msg(u"Created Phase {}".format(
                green(u'[{}]'.format(phase_item.title))))

        # Create steps
        for step in phase['steps']:
            step_item, created = Step.objects.update_or_create(
                title=step['title']['text'],
                sortkey=step['sortkey'],
                date=step['date'],
                protocol_url=step['protocol_url'] if step['protocol_url'] else u'',
                law=law_item,
                phase=phase_item,
                source_link=response.url
            )
            step_item.save()

            # Save statements for this step, if applicable
            if 'statements' in step['title']:
                for stmnt in step['title']['statements']:
                    # Find the person
                    pq = Person.objects.filter(
                        source_link__endswith=stmnt['person_source_link'])
                    if pq.exists() and pq.count() == 1:
                        person_item = pq.first()
                        st_data = {
                            'speech_type': stmnt['statement_type'],
                            'protocol_url': stmnt['protocol_link'][0]
                            if stmnt['protocol_link'] else None
                        }
                        st_item, st_created = Statement.objects.update_or_create(
                            index=stmnt['index'],
                            person=person_item,
                            step=step_item,
                            defaults=st_data)
                        # if st_created:
                        #     log.msg(u"Created Statement by {} on {}".format(
                        #         green(u'[{}]'.format(person_item.full_name)),
                        #         step_item.date))
                        # else:
                        #     log.msg(u"Updated Statement by {} on {}".format(
                        #         green(u'[{}]'.format(person_item.full_name)),
                        #         step_item.date))
                    else:
                        # We can't save statements if we can't find the
                        # Person
                        log.msg(
                            red(u"Skipping Statement by {}: Person with source_link {} does{} exist{}").format(
                                green(u'[{}]'.format(stmnt['person_name'])),
                                blue("[{}]".format(stmnt['person_source_link'])),
                                red("{}").format("" if pq.exists() else " not"),
                                "" if pq.count() > 1 else ", but {} persons matching found!".format(pq.count())
                            ))
                        continue
class InquiriesSpider(BaseSpider):
    BASE_URL = "{}/{}".format(BASE_HOST, "PAKT/JMAB/filter.psp")
    URLOPTIONS = {
        'view': 'RSS',
        'jsMode': 'RSS',
        'xdocumentUri': '/PAKT/JMAB/index.shtml',
        'NRBR': 'NR',
        'anwenden': 'Anwenden',
        'JMAB': 'J_JPR_M',
        'VHG2': 'ALLE',
        'SUCH': '',
        'listeId': '105',
        'FBEZ': 'FP_005'
    }

    name = "inquiries"
    inquiries_scraped = []

    def __init__(self, **kw):
        super(InquiriesSpider, self).__init__(**kw)

        if 'llp' in kw:
            try:
                self.LLP = [int(kw['llp'])]
            except:
                pass

        self.cookies_seen = set()
        self.idlist = {}
        self.url_override = kw.get('url', None)

    def start_requests(self):
        """
        Returns a list of URLs to scrape
        """
        # This predefined list of URLs is chosen to include all types of
        # inquiries possible in the Austrian parliament in order to provide a
        # suitable testing surface for new functions.
        #
        urls = ["https://www.parlament.gv.at/PAKT/VHG/XXV/JPR/JPR_00019/index.shtml",
                "https://www.parlament.gv.at/PAKT/VHG/XXV/JPR/JPR_00016/index.shtml",
                "https://www.parlament.gv.at/PAKT/VHG/XXV/J/J_06954/index.shtml",
                "https://www.parlament.gv.at/PAKT/VHG/XXV/M/M_00178/index.shtml",
                "https://www.parlament.gv.at/PAKT/VHG/XXV/JEU/JEU_00003/index.shtml",
                "https://www.parlament.gv.at/PAKT/VHG/XXV/J/J_06758/index.shtml",
                "https://www.parlament.gv.at/PAKT/VHG/BR/J-BR/J-BR_03089/index.shtml",
                # "https://www.parlament.gv.at/PAKT/VHG/BR/J-BR/J-BR_03091/index.shtml",
                "http://www.parlament.gv.at/PAKT/VHG/BR/J-BR/J-BR_01155/index.shtml",
                "http://www.parlament.gv.at/PAKT/VHG/XX/J/J_06110/index.shtml",
                "http://www.parlament.gv.at/PAKT/VHG/XX/J/J_06651/index.shtml",
                "http://www.parlament.gv.at/PAKT/VHG/XX/J/J_04024/index.shtml",
                "http://www.parlament.gv.at/PAKT/VHG/XX/J/J_04025/index.shtml",
                "https://www.parlament.gv.at/PAKT/VHG/XX/M/M_00178/index.shtml"]

        urls = [] if not self.url_override else [self.url_override]

        if self.LLP and not self.url_override:
            for i in self.LLP:
                for nrbr in ['NR', 'BR']:
                    roman_numeral = roman.toRoman(i)
                    options = self.URLOPTIONS.copy()
                    options['GP'] = roman_numeral
                    options['NRBR'] = nrbr
                    url_options = urlencode(options)
                    url_llp = "{}?{}".format(self.BASE_URL, url_options)
                    rss = feedparser.parse(url_llp)

                    self.logger.info("GP {}: {} inquiries from {}".format(
                        roman_numeral, len(rss['entries']), nrbr))
                    urls = urls + [entry['link'] for entry in rss['entries']]

        self.TOTAL_COUNTER = len(urls)
        for url in urls:
            yield self.make_requests_from_url(url)

    def parse(self, response):
        self.SCRAPED_COUNTER += 1

        source_link = response.url
        category = INQUIRY.CATEGORY.xt(response)
        parl_id = response.url.split('/')[-2]
        title = INQUIRY.TITLE.xt(response)
        description = INQUIRY.DESCRIPTION.xt(response)
        sender_objects = []
        callback_requests = []
        ts = GENERIC.TIMESTAMP.xt(response)

        # Inquiries from Bundesrat don't have an LLP => set None
        if "BR" in category:
            LLP = None
        else:
            LLP = LegislativePeriod.objects.get(
                roman_numeral=response.url.split('/')[-4])

        if not self.IGNORE_TIMESTAMP and not self.has_changes(parl_id, LLP, response.url, ts):
            self.logger.debug(
                green(u"[{} of {}] Skipping Inquiry, no changes: {}".format(
                    self.SCRAPED_COUNTER, self.TOTAL_COUNTER, title)))
            return

        # Get or create Category object for the inquiry and log to screen if
        # a new category is created.
        cat, created = Category.objects.get_or_create(title=category)
        if created:
            log.debug(u"Created category {}".format(
                green(u'[{}]'.format(category))))

        # An inquiry can have multiple senders, but only a single recipient.
        # Try/catch in case person does not exist in the database.
        try:
            for sender_object in INQUIRY.SENDER.xt(response):
                sender_objects.append(Person.objects.get(
                    parl_id=sender_object))
        except:
            log.warning(red(u'Sender "{}" was not found in database, skipping Inquiry {} in LLP {}'.format(
                INQUIRY.SENDER.xt(response), parl_id, LLP)))
            return

        try:
            receiver_object = Person.objects.get(
                parl_id=INQUIRY.RECEIVER.xt(response))
        except:
            log.warning(red(u'Receiver "{}" was not found in database, skipping Inquiry {} in LLP {}'.format(
                INQUIRY.RECEIVER.xt(response), parl_id, LLP)))
            return

        # Create or update Inquiry item
        inquiry_item, inquiry_created = Inquiry.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=LLP,
            defaults={
                'title': title,
                'source_link': source_link,
                'description': description,
                'receiver': receiver_object,
                'ts': ts
            }
        )
        if inquiry_created:
            inquiry_item.status = 'offen'

        # Attach foreign keys
        inquiry_item.keywords = self.parse_keywords(response)
        inquiry_item.documents = self.parse_docs(response)
        inquiry_item.category = cat
        inquiry_item.sender = sender_objects

        response.meta['inquiry_item'] = inquiry_item

        # Dringliche / urgent inquiries have a different structure for steps
        # and history. This case distinction accommodates these different
        # structures.
        if any("Dringliche" in '{}'.format(s) for s in inquiry_item.keywords.all()):
            if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'):
                self.parse_parliament_steps(response)
        else:
            response_link = self.parse_steps(response)
            if response_link:
                post_req = scrapy.Request("{}{}".format(BASE_HOST, response_link),
                                          callback=self.parse_inquiry_response,
                                          dont_filter=True)
                post_req.meta['inquiry_item'] = inquiry_item
                callback_requests.append(post_req)

        # Save Inquiry item and log to terminal if created or updated.
        inquiry_item.save()

        if inquiry_created:
            logtext = u"[{} of {}] Created Inquiry {} with ID {}, LLP {} @ {}"
        else:
            logtext = u"[{} of {}] Updated Inquiry {} with ID {}, LLP {} @ {}"
        logtext = logtext.format(
            self.SCRAPED_COUNTER,
            self.TOTAL_COUNTER,
            cyan(title),
            cyan(u"{}".format(parl_id)),
            green(unicode(LLP)),
            blue(response.url),
            green(u"{}".format(inquiry_item.keywords))
        )
        log.msg(logtext, level=log.DEBUG if not self.SCRAPED_COUNTER % 1000 == 0 else log.INFO)
        # log.msg(green("Open Callback requests: {}".format(
        #     len(callback_requests))), level=log.INFO)

        return callback_requests

    def has_changes(self, parl_id, legislative_period, source_link, ts):
        if not Inquiry.objects.filter(
            parl_id=parl_id,
            legislative_period=legislative_period,
            source_link=source_link
        ).exists():
            return True

        ts = ts.replace(tzinfo=pytz.utc)
        if Inquiry.objects.get(
                parl_id=parl_id,
                legislative_period=legislative_period,
                source_link=source_link).ts != ts:
            return True
        return False

    def parse_keywords(self, response):
        keywords = INQUIRY.KEYWORDS.xt(response)

        # Create all keywords we don't yet have in the DB
        keyword_items = []
        for keyword in keywords:
            kw, created = Keyword.objects.get_or_create(title=keyword)
            if created:
                log.msg(u"Created keyword {}".format(
                    green(u'[{}]'.format(keyword))), level=log.DEBUG)
            keyword_items.append(kw)
        return keyword_items

    def parse_docs(self, response):
        docs = INQUIRY.DOCS.xt(response)

        # Create all docs we don't yet have in the DB
        doc_items = []
        for document in docs:
            doc, created = Document.objects.get_or_create(
                title=document['title'],
                html_link=document['html_url'],
                pdf_link=document['pdf_url'],
                stripped_html=None
            )
            doc_items.append(doc)
        return doc_items

    def parse_response_docs(self, response):
        docs = INQUIRY.RESPONSEDOCS.xt(response)

        # Create all docs we don't yet have in the DB
        doc_items = []
        for document in docs:
            doc, created = Document.objects.get_or_create(
                title=document['title'],
                html_link=document['html_url'],
                pdf_link=document['pdf_url'],
                stripped_html=None
            )
            doc_items.append(doc)
        return doc_items

    def parse_steps(self, response):
        """
        Callback function to parse the single-page history for normal
        inquiries
        """
        response_link = []
        inquiry_item = response.meta['inquiry_item']

        # Get or create a default phase for inquiries, because simple
        # inquiries have no phases.
        phase_item, created = Phase.objects.get_or_create(
            title='default_inqu')
        if created:
            log.msg(u"Created Phase {}".format(
                green(u'[{}]'.format(phase_item.title))), level=log.DEBUG)

        steps = INQUIRY.STEPS.xt(response)
        for step in steps:
            if "Schriftliche Beantwortung" in step["title"]:
                response_link = INQUIRY.RESPONSE_LINK.xt(response)
        for step in steps:
            step_item, created = Step.objects.update_or_create(
                title=step['title'],
                sortkey=step['sortkey'],
                date=step['date'],
                protocol_url=step['protocol_url'],
                law=inquiry_item,
                phase=phase_item,
                source_link=response.url
            )
            step_item.save()
        if response_link:
            return response_link
        else:
            return

    def parse_parliament_steps(self, response):
        """
        Callback function to parse the additional
        'Parlamentarisches Verfahren' page.
        """
        inquiry_item = response.meta['inquiry_item']

        phases = INQUIRY.PHASES.xt(response)

        for phase in phases:
            # Create phase if we don't have it yet
            phase_item, created = Phase.objects.get_or_create(
                title=phase['title'])
            if created:
                log.msg(u"Created Phase {}".format(
                    green(u'[{}]'.format(phase_item.title))), level=log.DEBUG)

            # Create steps
            for step in phase['steps']:
                step_item, created = Step.objects.update_or_create(
                    title=step['title']['text'],
                    sortkey=step['sortkey'],
                    date=step['date'],
                    protocol_url=step['protocol_url'],
                    law=inquiry_item,
                    phase=phase_item,
                    source_link=response.url
                )
                step_item.save()
                if created:
                    log.msg(u"Created Step {}".format(
                        green(u'[{}]'.format(step_item.title))), level=log.DEBUG)

                # Save statements for this step, if applicable
                if 'statements' in step['title']:
                    for stmnt in step['title']['statements']:
                        # Find the person
                        pq = Person.objects.filter(
                            source_link__endswith=stmnt['person_source_link'])
                        if pq.exists() and pq.count() == 1:
                            person_item = pq.first()
                            st_data = {
                                'speech_type': stmnt['statement_type'],
                                'protocol_url': stmnt['protocol_link']
                            }
                            st_item, st_created = Statement.objects.update_or_create(
                                index=stmnt['index'],
                                person=person_item,
                                step=step_item,
                                defaults=st_data)
                            if st_created:
                                log.msg(u"Created Statement by {} on {}".format(
                                    green(u'[{}]'.format(person_item.full_name)),
                                    step_item.date), level=log.DEBUG)
                            else:
                                log.msg(u"Updated Statement by {} on {}".format(
                                    green(u'[{}]'.format(person_item.full_name)),
                                    step_item.date), level=log.DEBUG)
                        else:
                            # We can't save statements if we can't find the
                            # Person
                            log.warning(
                                red(u"Skipping Statement by {}: Person with source_link {} does{} exist{}").format(
                                    green(u'[{}]'.format(stmnt['person_name'])),
                                    blue("[{}]".format(stmnt['person_source_link'])),
                                    red("{}").format("" if pq.exists() else " not"),
                                    "" if pq.count() > 1 else ", but {} persons matching found!".format(pq.count())
                                ))
                            continue

    def parse_inquiry_response(self, response):
        """
        Callback function for parsing the inquiry responses
        """
        # Allow testing single urls for parsing errors
        inquiry_item = response.meta.get('inquiry_item', None)
        source_link = response.url
        parl_id = response.url.split('/')[-2]
        title = INQUIRY.TITLE.xt(response)
        description = INQUIRY.RESPONSEDESCRIPTION.xt(response)
        LLP = inquiry_item.legislative_period if inquiry_item else None
        category = INQUIRY.CATEGORY.xt(response)

        # Get or create Category object for the inquiry and log to screen if
        # a new category is created.
        cat, created = Category.objects.get_or_create(title=category)
        if created:
            log.msg(u"Created category {}".format(
                green(u'[{}]'.format(category))), level=log.DEBUG)

        try:
            sender_object = Person.objects.get(
                parl_id=INQUIRY.RESPONSESENDER.xt(response))
        except Exception, e:
            log.warning(red(u'Sender "{}" was not found in database, skipping Inquiry {} in LLP {}'.format(
                INQUIRY.RESPONSESENDER.xt(response), parl_id, LLP)))
            return

        if not inquiry_item:
            # Allow testing single urls for parsing errors
            print locals()
            return

        # Create or update InquiryResponse item
        inquiryresponse_item, inquiryresponse_created = InquiryResponse.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=LLP,
            defaults={
                'title': title,
                'source_link': source_link,
                'description': description,
                'sender': sender_object
            }
        )

        # Attach foreign keys
        inquiryresponse_item.documents = self.parse_response_docs(response)
        inquiryresponse_item.category = cat

        # Save InquiryResponse object
        inquiryresponse_item.save()

        if inquiryresponse_created:
            logtext = u"[{} of {}] Created InquiryResponse {} with ID {}, LLP {} @ {}"
        else:
            logtext = u"[{} of {}] Updated InquiryResponse {} with ID {}, LLP {} @ {}"
        logtext = logtext.format(
            self.SCRAPED_COUNTER,
            self.TOTAL_COUNTER,
            cyan(title),
            cyan(u"{}".format(parl_id)),
            green(unicode(LLP)),
            blue(response.url)
        )
        log.msg(logtext, level=log.DEBUG if self.SCRAPED_COUNTER != 0 else log.INFO)

        inquiry_item.response = inquiryresponse_item
        inquiry_item.status = 'response_received'
        inquiry_item.save()
        return
def parse(self, response):
    # Extract fields
    title = LAW.TITLE.xt(response)
    parl_id = LAW.PARL_ID.xt(response)
    ts = GENERIC.TIMESTAMP.xt(response)

    if not (u'BI' in parl_id or u'PET' in parl_id):
        # VBG have their parl_id only in the url
        parl_id = response.url.split('/')[-2]

    status = LAW.STATUS.xt(response)

    raw_llp = response.url.split('/')[-4]
    if raw_llp != u'BR':
        LLP = LegislativePeriod.objects.get(roman_numeral=raw_llp)
    else:
        LLP = None

    if not self.IGNORE_TIMESTAMP and not self.has_changes(
            parl_id, LLP, response.url, ts):
        self.logger.info(
            green(u"Skipping Petition, no changes: {}".format(title)))
        return

    # Save ids and stuff for internals
    if LLP not in self.idlist:
        self.idlist[LLP] = {}
    self.idlist[LLP][response.url] = [parl_id, LLP]

    # Extract foreign keys
    category = self.parse_category(response)
    description = LAW.DESCRIPTION.xt(response)
    signing_url, signable = PETITION.SIGNING.xt(response)
    signature_count = PETITION.SIGNATURE_COUNT.xt(response)

    # Parse reference
    reference = self.parse_reference(response)

    # Log our progress
    logtext = u"Scraping {} with id {}, LLP {} @ {}".format(
        red(title),
        magenta(u"[{}]".format(parl_id)),
        green(str(LLP)),
        blue(response.url))
    log.msg(logtext, level=log.INFO)

    # Create and save Petition
    petition_item, petition_item_created = Petition.objects.update_or_create(
        parl_id=parl_id,
        legislative_period=LLP,
        defaults={
            'title': title,
            'status': status,
            'source_link': response.url,
            'description': description,
            'signable': signable,
            'signing_url': signing_url,
            'signature_count': signature_count,
            'reference': reference,
            'ts': ts
        })

    if not petition_item_created:
        petition_item.save()

    # Attach foreign keys
    petition_item.keywords = self.parse_keywords(response)
    petition_item.category = category
    petition_item.documents = self.parse_docs(response)

    petition_item.save()

    # Parse creators
    petition_creators = self.parse_creators(response)
    for petition_creator in petition_creators:
        petition_creator.created_petitions.add(petition_item)

    callback_requests = []

    # Is the tab 'Parlamentarisches Verfahren' available?
    if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'):
        response.meta['petition_item'] = petition_item
        self.parse_parliament_steps(response)

    # Parse opinions
    opinions = PETITION.OPINIONS.xt(response)
    if opinions:
        for op in opinions:
            if Opinion.objects.filter(parl_id=op['parl_id']).exists():
                continue
            post_req = scrapy.Request("{}/{}".format(BASE_HOST, op['url']),
                                      callback=self.parse_opinion,
                                      dont_filter=True)
            post_req.meta['petition_item'] = petition_item
            post_req.meta['op_data'] = op
            callback_requests.append(post_req)

    # Only BI or PET (but not PET-BR) have online signatures
    if u'BI' in parl_id or u'PET' in parl_id and not u'PET-BR' in parl_id:
        signatures_base_url = '{}/PAKT/VHG/{}/{}/{}/filter.psp?xdocumentUri=/PAKT/VHG/{}/{}/{}/'\
            'index.shtml&GP_CODE={}&ITYP={}&INR={}&FBEZ=BI_001&R_1000=ALLE&STEP=&pageNumber='
        raw_parl_id = petition_item.parl_id[1:-1].split('/')
        petition_type = raw_parl_id[1]
        petition_number = int(raw_parl_id[0])
        url_parl_id = '{}_{}'.format(petition_type, petition_number)

        signatures_url = signatures_base_url.format(
            BASE_HOST, LLP.roman_numeral, petition_type, url_parl_id,
            LLP.roman_numeral, petition_type, url_parl_id,
            LLP.roman_numeral, petition_type, petition_number)

        post_req = scrapy.Request(signatures_url,
                                  callback=self.parse_signatures,
                                  dont_filter=True)
        post_req.meta['petition_item'] = petition_item
        callback_requests.append(post_req)

    log.msg(green("Open Callback requests: {}".format(
        len(callback_requests))), level=log.INFO)

    return callback_requests
def parse(self, response):
    source_link = response.url
    category = INQUIRY.CATEGORY.xt(response)
    parl_id = response.url.split('/')[-2]
    title = INQUIRY.TITLE.xt(response)
    description = INQUIRY.DESCRIPTION.xt(response)
    sender_objects = []
    callback_requests = []
    ts = GENERIC.TIMESTAMP.xt(response)

    # Inquiries from Bundesrat don't have an LLP => set None
    if "BR" in category:
        LLP = None
    else:
        LLP = LegislativePeriod.objects.get(
            roman_numeral=response.url.split('/')[-4])

    if not self.has_changes(parl_id, LLP, response.url, ts):
        self.logger.info(
            green(u"Skipping Inquiry, no changes: {}".format(title)))
        return

    # Get or create Category object for the inquiry and log to screen if
    # a new category is created.
    cat, created = Category.objects.get_or_create(title=category)
    if created:
        log.msg(u"Created category {}".format(
            green(u'[{}]'.format(category))))

    # An inquiry can have multiple senders, but only a single recipient.
    # Try/catch in case person does not exist in the database.
    try:
        for sender_object in INQUIRY.SENDER.xt(response):
            sender_objects.append(Person.objects.get(
                parl_id=sender_object))
    except:
        log.msg(red(u'Sender was not found in database, skipping Inquiry {} in LLP {}'.format(
            parl_id, LLP)))
        return

    try:
        receiver_object = Person.objects.get(
            parl_id=INQUIRY.RECEIVER.xt(response))
    except:
        log.msg(red(u'Receiver was not found in database, skipping Inquiry {} in LLP {}'.format(
            parl_id, LLP)))
        return

    # Create or update Inquiry item
    inquiry_item, inquiry_created = Inquiry.objects.update_or_create(
        parl_id=parl_id,
        legislative_period=LLP,
        defaults={
            'title': title,
            'source_link': source_link,
            'description': description,
            'receiver': receiver_object,
            'ts': ts
        }
    )

    # Attach foreign keys
    inquiry_item.keywords = self.parse_keywords(response)
    inquiry_item.documents = self.parse_docs(response)
    inquiry_item.category = cat
    inquiry_item.sender = sender_objects

    response.meta['inquiry_item'] = inquiry_item

    # Dringliche / urgent inquiries have a different structure for steps
    # and history. This case distinction accommodates these different
    # structures.
    if any("Dringliche" in '{}'.format(s) for s in inquiry_item.keywords.all()):
        if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'):
            self.parse_parliament_steps(response)
    else:
        response_link = self.parse_steps(response)
        if response_link:
            post_req = scrapy.Request("{}{}".format(BASE_HOST, response_link),
                                      callback=self.parse_inquiry_response,
                                      dont_filter=True)
            post_req.meta['inquiry_item'] = inquiry_item
            callback_requests.append(post_req)

    # Save Inquiry item and log to terminal if created or updated.
    inquiry_item.save()

    if inquiry_created:
        logtext = u"Created Inquiry {} with ID {}, LLP {} @ {}"
    else:
        logtext = u"Updated Inquiry {} with ID {}, LLP {} @ {}"
    logtext = logtext.format(
        cyan(title),
        cyan(u"{}".format(parl_id)),
        green(str(LLP)),
        blue(response.url),
        green(u"{}".format(inquiry_item.keywords))
    )
    log.msg(logtext, level=log.INFO)

    log.msg(green("Open Callback requests: {}".format(
        len(callback_requests))), level=log.INFO)

    return callback_requests
def f_verbose(value):
    if args.verbose:
        print value.replace("[X]", red("[X]")).replace("[*]", green("[*]")).replace("[!]", magenta("[!]"))\
            .replace("safe", blue("safe"))
    return
def handle_request(name_o, pass_o, verbose):
    headers_['Cookie'] = ""
    global count

    # 1st request
    request_1 = send_request(base_url, headers_, False)
    if 'Set-Cookie' in request_1.headers:
        headers_['Cookie'] = str(request_1.headers['Set-Cookie'])[
            :str(request_1.headers['Set-Cookie']).index(";") + 1]
    if 'Location' in request_1.headers:  # check redirect to old auth version
        url_2 = request_1.headers['Location']

        # 2nd request
        request_2 = send_request(url_2, headers_, False)
        soup = BeautifulSoup(request_2.text, "lxml")
        items = ["_FORM", "SubmitButton", "FORM_MAC_LIST"]
        url_3 = (str(soup.form['action']))
        url_3 = base_url[:base_url.index("/OA_HTML")] + url_3
        forms = [str(soup.find(id=obj)) for obj in items]
        for index, item in enumerate(forms):  # stripping values
            if "value=" in item:
                forms[index] = item[
                    item.index("value=") + len("value=") + 1:
                    item.index("\"", item.index("value=") + len("value=") + 1)]
            elif "_FORM_SUBMIT_BUTTON" in item:
                forms[index] = item[
                    item.index("_FORM_SUBMIT_BUTTON") + len("_FORM_SUBMIT_BUTTON':") + 1:
                    item.rindex("'});")]
            else:
                forms[index] = ""
        dictionary = dict(zip(items, forms))
        dictionary['usernameField'] = name_o
        dictionary['passwordField'] = pass_o
        dictionary['_FORM_SUBMIT_BUTTON'] = dictionary.pop('SubmitButton')
        headers_['Cookie'] = headers_['Cookie'] + request_2.headers['Set-Cookie']

        # 3rd request
        request_3 = send_request(url_3, headers_, False, dictionary)
        check = request_3.headers['Location'] if request_3.headers['Location'] else ""
        if "errCode=FND_APPL_LOGIN_FAIL" in check:
            if verbose:
                print green('[*]') + "\tStatus: " + green('not found') + " : " + blue(name_o) + " : " + blue(pass_o)
        else:
            print red('[x]') + "\tStatus: " + red('found') + " : " + blue(name_o) + " : " + blue(pass_o)
            count += 1
    else:
        request_2 = send_request(base_url, headers_, False, data={
            'username': name_o,
            'password': pass_o,
            '_lAccessibility': 'N',
            'langCode': 'US'
        })
        soup = BeautifulSoup(request_2.text, "lxml")
        result = str(soup.p)
        result = result[result.index("status:") + len("status:"):
                        result.index(",", result.index("status:"))].strip(' ').strip('\'')
        if result == 'success':
            print red('[x]') + "\tStatus:" + red('found') + " : " + blue(name_o) + " : " + blue(pass_o)
            count += 1
        elif verbose:
            print green('[*]') + "\tStatus: " + green('not found') + " : " + blue(name_o) + " : " + blue(pass_o)
    return
def parse(self, response):
    # Parse
    parl_id = COMITTEE.url_to_parlid(response.url)[1]
    ts = GENERIC.TIMESTAMP.xt(response)
    LLP = COMITTEE.LLP.xt(response)
    name = COMITTEE.NAME.xt(response)

    if LLP is not None:
        nrbr = 'Nationalrat'
        legislative_period = LegislativePeriod.objects.get(
            roman_numeral=LLP)
        # NR comittees are always "active", only BR comittees are either
        # active or inactive
        active = True
    else:
        nrbr = 'Bundesrat'
        legislative_period = None
        # BR comittees are active if they are not "aufgelöst"
        active = COMITTEE.ACTIVE.xt(response)

    # Main-comittee parl_ids start with the number 1; sub-comittee parl_ids
    # start with the number 2
    if not parl_id.startswith(u'(1/'):
        try:
            parent_parl_id = u'(1/{}'.format(parl_id.split('/')[1])
            parent_comitee = Comittee.objects.get(
                parl_id=parent_parl_id,
                legislative_period=legislative_period)
        except Comittee.DoesNotExist:
            parent_comitee = None
    else:
        parent_comitee = None

    if not self.IGNORE_TIMESTAMP and not self.has_changes(parl_id, legislative_period, nrbr, response.url, ts):
        self.logger.info(
            green(u"Skipping Comittee, no changes: {}".format(name)))
        return

    # Log our progress
    logtext = u"Scraping {} with id {}, LLP {} @ {}".format(
        red(name),
        magenta(u"[{}]".format(parl_id)),
        green(unicode(LLP)),
        blue(response.url)
    )
    log.msg(logtext, level=log.INFO)

    description = COMITTEE.DESCRIPTION.xt(response)

    comittee_data = {
        'description': description,
        'name': name,
        'source_link': response.url,
        'parent_comittee': parent_comitee,
        'ts': ts
    }

    try:
        comittee_item, created_comittee = Comittee.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=legislative_period,
            nrbr=nrbr,
            active=active,
            defaults=comittee_data
        )
    except:
        log.msg(
            u"Could not update/create Comittee {}".format(name),
            level=log.ERROR)
        return

    meetings = COMITTEE.MEETINGS.xt(response)
    comittee_laws = []
    for meeting in meetings:
        agenda_data = meeting['agenda']
        if agenda_data is not None:
            agenda_item, agenda_created = Document.objects.get_or_create(
                **agenda_data)
        else:
            agenda_item = None
        meeting_data = {
            'agenda': agenda_item
        }

        # Log our progress
        logtext = u"Scraping meeting no. {} of {} on {}".format(
            red(meeting['number']),
            magenta(name),
            green(str(meeting['date'].date())),
        )
        log.msg(logtext, level=log.INFO)

        meeting_item, meeting_created = ComitteeMeeting.objects.update_or_create(
            number=meeting['number'],
            date=meeting['date'],
            comittee=comittee_item,
            defaults=meeting_data
        )

        for topic in meeting['topics']:
            if topic['law'] is not None:
                law = topic['law']
                law_item = self.parse_law(law)
                if law_item is not None:
                    comittee_laws.append(law_item)
            else:
                law_item = None
            agenda_topic_data = {
                'comment': topic['comment'],
                'law': law_item,
            }
            agenda_topic_item, agenda_topic_created = ComitteeAgendaTopic.objects.update_or_create(
                number=topic['number'],
                meeting=meeting_item,
                text=topic['text'],
                defaults=agenda_topic_data,
            )

    # Parse Verhandlungsgegenstaende and Veroeffentlichungen
    laws_and_reports = COMITTEE.LAWS.xt(response)
    for law in laws_and_reports:
        # Log our progress
        logtext = u"Adding law with id {}, LLP {} to {}".format(
            magenta(u"[{}]".format(law['parl_id'])),
            green(law['llp']),
            blue(name)
        )
        log.msg(logtext, level=log.INFO)
        law_item = self.parse_law(law)
        if law_item is not None:
            comittee_laws.append(law_item)

    comittee_item.laws.add(*comittee_laws)
    comittee_item.save()
def parse(self, response):
    # Extract fields
    ts = GENERIC.TIMESTAMP.xt(response)
    title = LAW.TITLE.xt(response)
    parl_id = LAW.PARL_ID.xt(response)
    LLP = LegislativePeriod.objects.get(
        roman_numeral=response.url.split('/')[-4])

    if not self.IGNORE_TIMESTAMP and not self.has_changes(
            parl_id, LLP, response.url, ts):
        self.logger.info(
            green(u"Skipping Law, no changes: {}".format(title)))
        return

    # Save ids and stuff for internals
    if LLP not in self.idlist:
        self.idlist[LLP] = {}
    self.idlist[LLP][response.url] = [parl_id, LLP]

    # Extract foreign keys
    category = self.parse_category(response)
    description = PRELAW.DESCRIPTION.xt(response)

    # Log our progress
    logtext = u"Scraping {} with id {}, LLP {} @ {}".format(
        red(title),
        magenta(u"[{}]".format(parl_id)),
        green(unicode(LLP)),
        blue(response.url))
    log.msg(logtext, level=log.INFO)

    # Create and save Law
    pre_law_data = {
        'title': title,
        'description': description,
        'source_link': response.url,
        'ts': ts
    }
    law_item, created = Law.objects.get_or_create(
        parl_id=parl_id,
        legislative_period=LLP,
        defaults=pre_law_data)
    if not created:
        law_item.save()

    # Attach foreign keys
    law_item.keywords = self.parse_keywords(response)
    law_item.category = category
    law_item.documents = self.parse_docs(response)

    law_item.save()

    # Parse opinions
    opinions = PRELAW.OPINIONS.xt(response)
    callback_requests = []

    if opinions:
        skipped_ops = 0
        for op in opinions:
            if Opinion.objects.filter(parl_id=op['parl_id']).exists():
                skipped_ops += 1
                continue
            post_req = scrapy.Request("{}/{}".format(BASE_HOST, op['url']),
                                      callback=self.parse_opinion,
                                      dont_filter=True)
            post_req.meta['law_item'] = law_item
            post_req.meta['op_data'] = op
            callback_requests.append(post_req)
        log.msg(green("Open/Skipped Callback requests: {}/{}".format(
            len(callback_requests), skipped_ops)), level=log.INFO)

    return callback_requests
def parse(self, response):
    self.SCRAPED_COUNTER += 1

    LLP = LegislativePeriod.objects.get(
        roman_numeral=response.url.split('/')[-4])

    # Extract fields
    ts = GENERIC.TIMESTAMP.xt(response)
    title = LAW.TITLE.xt(response)
    parl_id = LAW.PARL_ID.xt(response)
    status = LAW.STATUS.xt(response)

    if not self.IGNORE_TIMESTAMP and not self.has_changes(
            parl_id, LLP, response.url, ts):
        self.logger.info(
            green(u"[{} of {}] Skipping Law, no changes: {}".format(
                self.SCRAPED_COUNTER, self.TOTAL_COUNTER, title)))
        return

    # Extract foreign keys
    category = LAW.CATEGORY.xt(response)
    description = LAW.DESCRIPTION.xt(response)

    # Create category if we don't have it yet
    cat, created = Category.objects.get_or_create(title=category)
    if created:
        log.msg(u"Created category {}".format(
            green(u'[{}]'.format(category))))

    # Create and save Law
    law_data = {
        'title': title,
        'status': status,
        'description': description,
        'ts': ts,
        'source_link': response.url,
    }
    law_item, law_created = Law.objects.update_or_create(
        parl_id=parl_id,
        legislative_period=LLP,
        defaults=law_data)

    # Attach foreign keys
    law_item.keywords = self.parse_keywords(response)
    law_item.category = cat
    law_item.documents = self.parse_docs(response)

    law_item.save()

    # Log our progress
    if law_created:
        logtext = u"[{} of {}] Created {} with id {}, LLP {} @ {}"
    else:
        logtext = u"[{} of {}] Updated {} with id {}, LLP {} @ {}"
    logtext = logtext.format(self.SCRAPED_COUNTER,
                             self.TOTAL_COUNTER,
                             red(title),
                             cyan(u"[{}]".format(parl_id)),
                             green(unicode(LLP)),
                             blue(response.url))
    log.msg(logtext, level=log.INFO)

    response.meta['law_item'] = law_item

    # Is the tab 'Parlamentarisches Verfahren' available?
    if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'):
        self.parse_parliament_steps(response)

    if response.xpath('//h2[@id="tab-VorparlamentarischesVerfahren"]'):
        self.parse_pre_parliament_steps(response)
def parse(self, response):
    # Extract fields
    ts = GENERIC.TIMESTAMP.xt(response)
    title = LAW.TITLE.xt(response)
    parl_id = LAW.PARL_ID.xt(response)
    LLP = LegislativePeriod.objects.get(
        roman_numeral=response.url.split('/')[-4])

    if not self.IGNORE_TIMESTAMP and not self.has_changes(parl_id, LLP, response.url, ts):
        self.logger.info(
            green(u"Skipping Law, no changes: {}".format(title)))
        return

    # Save ids and stuff for internals
    if LLP not in self.idlist:
        self.idlist[LLP] = {}
    self.idlist[LLP][response.url] = [parl_id, LLP]

    # Extract foreign keys
    category = self.parse_category(response)
    description = PRELAW.DESCRIPTION.xt(response)

    # Log our progress
    logtext = u"Scraping {} with id {}, LLP {} @ {}".format(
        red(title),
        magenta(u"[{}]".format(parl_id)),
        green(str(LLP)),
        blue(response.url)
    )
    log.msg(logtext, level=log.INFO)

    # Create and save Law
    pre_law_data = {
        'title': title,
        'description': description,
        'ts': ts
    }
    law_item, created = Law.objects.get_or_create(
        parl_id=parl_id,
        source_link=response.url,
        legislative_period=LLP,
        defaults=pre_law_data)
    if not created:
        law_item.save()

    # Attach foreign keys
    law_item.keywords = self.parse_keywords(response)
    law_item.category = category
    law_item.documents = self.parse_docs(response)

    law_item.save()

    # Parse opinions
    opinions = PRELAW.OPINIONS.xt(response)
    callback_requests = []

    if opinions:
        skipped_ops = 0
        for op in opinions:
            if Opinion.objects.filter(parl_id=op['parl_id']).exists():
                skipped_ops += 1
                continue
            post_req = scrapy.Request("{}/{}".format(BASE_HOST, op['url']),
                                      callback=self.parse_opinion,
                                      dont_filter=True)
            post_req.meta['law_item'] = law_item
            post_req.meta['op_data'] = op
            callback_requests.append(post_req)
        log.msg(green("Open/Skipped Callback requests: {}/{}".format(
            len(callback_requests), skipped_ops)), level=log.INFO)

    return callback_requests
def parse(self, response):
    # Parse basic fields
    parl_id = COMITTEE.url_to_parlid(response.url)[1]
    ts = GENERIC.TIMESTAMP.xt(response)
    llp = COMITTEE.LLP.xt(response)
    name = COMITTEE.NAME.xt(response)

    if llp is not None:
        nrbr = 'Nationalrat'
        legislative_period = LegislativePeriod.objects.get(
            roman_numeral=llp)
        # NR committees are "active" if they are in the current LLP
        active = (
            legislative_period == LegislativePeriod.objects.get_current())
    else:
        nrbr = 'Bundesrat'
        legislative_period = None
        # BR committees are active if they are not "aufgelöst" (dissolved)
        active = COMITTEE.ACTIVE.xt(response)

    # Main-committee parl_ids start with "(1/";
    # sub-committee parl_ids start with "(2/"
    if not parl_id.startswith(u'(1/'):
        try:
            parent_parl_id = u'(1/{}'.format(parl_id.split('/')[1])
            parent_comitee = Comittee.objects.get(
                parl_id=parent_parl_id,
                legislative_period=legislative_period)
        except Comittee.DoesNotExist:
            parent_comitee = None
    else:
        parent_comitee = None

    if not self.IGNORE_TIMESTAMP and not self.has_changes(
            parl_id, legislative_period, nrbr, response.url, ts):
        self.logger.info(
            green(u"Skipping Comittee, no changes: {}".format(name)))
        return

    # Log our progress
    logtext = u"Scraping {} with id {}, LLP {} @ {}".format(
        red(name),
        magenta(u"[{}]".format(parl_id)),
        green(unicode(llp)),
        blue(response.url))
    log.msg(logtext, level=log.INFO)

    description = COMITTEE.DESCRIPTION.xt(response)

    comittee_data = {
        'description': description,
        'name': name,
        'source_link': response.url,
        'parent_comittee': parent_comitee,
        'active': active,
        'ts': ts
    }

    try:
        comittee_item, created_comittee = Comittee.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=legislative_period,
            nrbr=nrbr,
            defaults=comittee_data)
    except Exception:
        log.msg(
            u"Could not update/create Comittee {}".format(name),
            level=log.ERROR)
        return

    meetings = COMITTEE.MEETINGS.xt(response)
    comittee_laws = []
    for meeting in meetings:
        agenda_data = meeting['agenda']
        if agenda_data is not None:
            agenda_item, agenda_created = Document.objects.get_or_create(
                **agenda_data)
        else:
            agenda_item = None

        meeting_data = {
            'agenda': agenda_item
        }

        # Log our progress
        logtext = u"Scraping meeting no. {} of {} on {}".format(
            red(meeting['number']),
            magenta(name),
            green(str(meeting['date'].date())))
        log.msg(logtext, level=log.INFO)

        meeting_item, meeting_created = \
            ComitteeMeeting.objects.update_or_create(
                number=meeting['number'],
                date=meeting['date'],
                comittee=comittee_item,
                defaults=meeting_data)

        for topic in meeting['topics']:
            if topic['law'] is not None:
                law = topic['law']
                law_item = self.parse_law(law)
                if law_item is not None:
                    comittee_laws.append(law_item)
            else:
                law_item = None

            agenda_topic_data = {
                'comment': topic['comment'],
                'law': law_item,
            }
            agenda_topic_item, agenda_topic_created = \
                ComitteeAgendaTopic.objects.update_or_create(
                    number=topic['number'],
                    meeting=meeting_item,
                    text=topic['text'],
                    defaults=agenda_topic_data)

    # Parse Verhandlungsgegenstaende (items under negotiation) and
    # Veroeffentlichungen (publications)
    laws_and_reports = COMITTEE.LAWS.xt(response)
    for law in laws_and_reports:
        # Log our progress
        logtext = u"Adding law with id {}, LLP {} to {}".format(
            magenta(u"[{}]".format(law['parl_id'])),
            green(law['llp']),
            blue(name))
        log.msg(logtext, level=log.INFO)

        law_item = self.parse_law(law)
        if law_item is not None:
            comittee_laws.append(law_item)

    comittee_item.laws.add(*comittee_laws)
    comittee_item.save()
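# --- Illustration only: the parent-committee derivation above assumes ids
# of the form "(1/..." for main committees and "(2/..." for sub-committees.
# The concrete id used here is an invented example, not taken from the
# source data.
parl_id = u'(2/A-XYZ)'
parent_parl_id = u'(1/{}'.format(parl_id.split('/')[1])
assert parent_parl_id == u'(1/A-XYZ)'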
def download_all(files):
    from .Download import download_file
    # NOTE: the repeated host ("/http//") is not a typo; Natural Earth's
    # download links really are structured this way.
    urlprefix = ("http://www.naturalearthdata.com/http//"
                 "www.naturalearthdata.com/download/10m/cultural/")
    print(blue("Downloading Natural Earth files...", bold=True))
    for filename in files:
        download_file(filename, urlprefix + filename)
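# --- Hedged sketch of a download_file() helper compatible with the call
# above, written with the requests library; the project's actual .Download
# module may be implemented differently.
import requests

def download_file(filename, url):
    response = requests.get(url, stream=True)
    response.raise_for_status()
    with open(filename, 'wb') as outfile:
        # Stream the body in chunks so large archives don't sit in memory
        for chunk in response.iter_content(chunk_size=65536):
            outfile.write(chunk)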
def parse_inquiry_response(self, response):
    """Callback function for parsing the inquiry responses"""
    inquiry_item = response.meta['inquiry_item']
    source_link = response.url
    parl_id = response.url.split('/')[-2]
    title = INQUIRY.TITLE.xt(response)
    description = INQUIRY.RESPONSEDESCRIPTION.xt(response)
    LLP = inquiry_item.legislative_period
    category = INQUIRY.CATEGORY.xt(response)

    # Get or create Category object for the inquiry and log to screen if a
    # new category is created
    cat, created = Category.objects.get_or_create(title=category)
    if created:
        log.msg(u"Created category {}".format(
            green(u'[{}]'.format(category))))

    try:
        sender_object = Person.objects.get(
            parl_id=INQUIRY.RESPONSESENDER.xt(response))
    except Person.DoesNotExist:
        log.msg(red(u'Sender was not found in database, skipping '
                    u'Inquiry {} in LLP {}'.format(parl_id, LLP)))
        return

    # Create or update InquiryResponse item
    inquiryresponse_item, inquiryresponse_created = \
        InquiryResponse.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=LLP,
            defaults={
                'title': title,
                'source_link': source_link,
                'description': description,
                'sender': sender_object
            })

    # Attach foreign keys
    inquiryresponse_item.documents = self.parse_docs(response)
    inquiryresponse_item.category = cat

    # Save InquiryResponse object
    inquiryresponse_item.save()

    if inquiryresponse_created:
        logtext = u"Created InquiryResponse {} with ID {}, LLP {} @ {}"
    else:
        logtext = u"Updated InquiryResponse {} with ID {}, LLP {} @ {}"
    logtext = logtext.format(
        cyan(title),
        cyan(u"{}".format(parl_id)),
        green(str(LLP)),
        blue(response.url))
    log.msg(logtext, level=log.INFO)

    inquiry_item.response = inquiryresponse_item
    inquiry_item.save()
    return
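# --- Hedged sketch: several methods above assign self.parse_docs(response)
# to a documents relation. A minimal helper consistent with that usage could
# look like this; the XPath expression and the Document field names are
# assumptions for illustration only.
def parse_docs(self, response):
    docs = []
    for link in response.xpath(
            '//a[contains(@href, ".pdf")]/@href').extract():
        doc, _ = Document.objects.get_or_create(
            pdf_link=link,  # hypothetical field name
            defaults={'title': link.split('/')[-1]})
        docs.append(doc)
    return docs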