def parse_other(self, response):
    """Parse a GovInfo granule-summary JSON response; yield an item when keywords match."""
    summary_full = response.json()
    # self.debug(summary_full)
    abstract = summary_full["summary"] if "summary" in summary_full else ""
    text_to_search = summary_full["title"] + " " + abstract
    if not self.search_keywords(text_to_search, self.keyword_dict, self.negative_keyword_dict):
        return
    item = ScrapyOfficialNewspapersItem()
    item['country'] = self.country
    item['state'] = self.state_name
    item['data_source'] = self.source
    item['law_class'] = summary_full['category']
    item['title'] = summary_full['title']
    item['reference'] = summary_full['granuleId']
    item['authorship'] = summary_full['agencies'][0]['name']
    item['summary'] = abstract
    item['publication_date'] = summary_full['dateIssued']
    item['url'] = summary_full['download']['txtLink'].replace('htm', 'summary?api_key=')
    # The text download needs the API key appended; the same URL doubles as the
    # basis for the stored document name.
    doc_url = summary_full['download']['txtLink'] + f'?api_key={self.API_key}'
    # self.debug(doc_url)
    item['file_urls'] = [doc_url]
    item['doc_name'] = self.HSA1_encoding(doc_url) + ".txt"
    yield item
def parse_other(self, response):
    """Populate an item from the record page tables; return it only when parse_table succeeds."""
    self.counter += 1
    item = ScrapyOfficialNewspapersItem()
    item['country'] = self.country
    item['geo_code'] = self.geo_code
    item['level'] = self.level
    item['data_source'] = self.source
    item['url'] = response.url
    item['doc_type'] = 'pdf'
    item['doc_class'] = ""
    record_table = response.xpath('//*[@id="menu1"]/table').get()
    item, good = self.parse_table(item, record_table)
    # The document link is embedded in the second HTML comment under #menu2.
    doc_url = response.xpath('//*[@id="menu2"]/comment()[2]').get().split("\"")[1]
    item['file_urls'] = [doc_url]
    item['doc_url'] = doc_url
    if good:
        return item
def parse_other(self, response):
    """Filter LeyChile norms by keywords and yield populated items."""
    item = ScrapyOfficialNewspapersItem()
    for norm in json.loads(response.text)[0]:
        text_to_search = self.clean_text(norm['TITULO_NORMA']) + self.clean_text(norm['DESCRIPCION'])
        if not self.search_keywords(text_to_search, self.keyword_dict, self.negative_keyword_dict):
            continue
        norm_id = norm['IDNORMA']
        norm_url = f'https://www.bcn.cl/leychile/navegar?idNorma={norm_id}'
        doc_name = f'CHL/policy_{norm_id}'
        doc_type = 'txt'
        # Publication dates arrive in Spanish; normalise to ISO format.
        pub_date_format = parse(norm['FECHA_PUBLICACION'], ['es']).strftime('%Y-%m-%d')
        doc_path = str(norm_id) + '.' + str(pub_date_format) + '.0.0%23'
        doc_url = f'https://nuevo.leychile.cl/servicios/Consulta/Exportar?radioExportar=Normas&exportar_formato={doc_type}&nombrearchivo={doc_name}&exportar_con_notas_bcn=False&exportar_con_notas_originales=False&exportar_con_notas_al_pie=False&hddResultadoExportar={doc_path}'
        item['country'] = self.country
        item['state'] = self.state_name
        item['data_source'] = self.source
        item["law_class"] = norm['NORMA']
        item['title'] = norm['TITULO_NORMA']
        item['reference'] = norm['TIPO']
        item['authorship'] = norm['ORGANISMO']
        item['summary'] = norm['DESCRIPCION']
        item['publication_date'] = pub_date_format
        item['url'] = norm_url
        item['doc_url'] = doc_url
        item['doc_name'] = self.HSA1_encoding(doc_url)
        # Normalise empty/None values to False across every populated field.
        for field in item:
            item[field] = item[field] or False
        yield item
def parse(self, response):
    """Yield one item per norm in the LeyChile JSON listing (no keyword filtering)."""
    item = ScrapyOfficialNewspapersItem()
    for norm in json.loads(response.text)[0]:
        norm_id = norm['IDNORMA']
        norm_url = f'https://www.bcn.cl/leychile/navegar?idNorma={norm_id}'
        doc_name = f'CHL/policy_{norm_id}'
        doc_type = 'pdf'
        publication_date = norm['FECHA_PUBLICACION']
        # Publication dates arrive in Spanish; normalise to ISO format.
        pub_date_format = parse(publication_date, ['es']).strftime('%Y-%m-%d')
        doc_path = str(norm_id) + '.' + str(pub_date_format) + '.0.0%23'
        doc_url = f'https://nuevo.leychile.cl/servicios/Consulta/Exportar?radioExportar=Normas&exportar_formato={doc_type}&nombrearchivo={doc_name}&exportar_con_notas_bcn=False&exportar_con_notas_originales=False&exportar_con_notas_al_pie=False&hddResultadoExportar={doc_path}'
        item['country'] = self.country
        item['geo_code'] = self.geo_code
        item['level'] = self.level
        item['source'] = self.source
        item['title'] = norm['TITULO_NORMA']
        item['authorship'] = norm['ORGANISMO']
        item['resume'] = norm['DESCRIPCION']
        item['publication_date'] = pub_date_format
        item['enforcement_date'] = norm['FECHA_PROMULGACION']
        # Fix: the original assigned norm_id to 'reference' and then immediately
        # overwrote it with None (dead store). The final value None is kept to
        # preserve downstream behaviour. NOTE(review): norm_id may have been the
        # intended reference — confirm against the pipeline before changing.
        item['reference'] = None
        item['url'] = norm_url
        item['doc_url'] = doc_url
        item['file_urls'] = [doc_url]
        item['doc_name'] = doc_name + '.' + doc_type
        item['doc_type'] = doc_type
        yield item
def parse_other(self, response):
    """Parse an El Peruano search-results JSON page and yield keyword-matching norms.

    Fix: the original built 'doc_name' from a bare name ``doc_url`` that was never
    defined (the URL was only stored on the item), so every matching record raised
    NameError inside the try block and was silently dropped by the broad except.
    The URL is now bound to a local once and reused.
    """
    item = ScrapyOfficialNewspapersItem()
    for norm in json.loads(response.text)['hits']:
        if 'subjectOrganizationCode' in norm['metadata']:
            ref = norm['metadata']['subjectOrganizationCode']
        else:
            ref = ""
        try:
            item['reference'] = ref
            doc_url = 'https://busquedas.elperuano.pe/download/url/' + str(norm['metadata']['slug'])
            item['doc_url'] = doc_url
            text_to_search = self.clean_text(norm['metadata']['description']) + " " + self.clean_text(
                norm['metadata']['slug']) + " " + self.clean_text(norm['highlightedText'])
            if self.search_keywords(text_to_search, self.keyword_dict, self.negative_keyword_dict):
                item['country'] = self.country
                item['state'] = self.state_name
                item["law_class"] = ""  # TODO: look at the right field when adjusted.
                item['data_source'] = self.source
                item['authorship'] = norm['metadata']['editionName']
                item['summary'] = self.clean_text(norm['metadata']['description'])
                item['title'] = self.clean_text(norm['metadata']['description'])
                item['publication_date'] = norm['metadata']['publicationDate']['formatted']
                item['enforcement_date'] = item['publication_date']
                item['url'] = 'https://busquedas.elperuano.pe' + str(norm['url_link'])
                item['doc_name'] = self.HSA1_encoding(doc_url)
                yield item
        except Exception as e:
            # Best-effort per-record handling: log and keep processing the rest.
            print(e)
def parse(self, response):
    """Parse a daily gazette index page; skip empty dates, yield keyword-matching entries."""
    if len(response.xpath("//*[contains(text(), 'No hay datos para la fecha')]")):
        print("No publication in this date")
        return
    # The date is encoded in the three query parameters of the request URL.
    page_url = response.url
    year = int(page_url.split("=")[1][:4])
    month = int(page_url.split("=")[2][:2])
    day = int(page_url.split("=")[3][:2])
    date = datetime.datetime(year=year, month=month, day=day)
    item = ScrapyOfficialNewspapersItem()
    rows = response.xpath('/html//td[@class = "subtitle_azul"]')[0].xpath('//tr').xpath('following-sibling::tr[1]')
    authorship = None
    for row in rows:
        # Rows alternate between authorship headers and entry links; carry the
        # most recent non-empty header forward.
        header_text = row.xpath('td[@class = "subtitle_azul"]/text()').get()
        link_text = row.xpath('td/a[@class = "enlaces"]/text()').get()
        link_href = row.xpath('td/a[@class = "enlaces"]/@href').get()
        if header_text is not None and authorship != header_text:
            authorship = header_text
        if link_text and link_text != "Ver más":
            resume = link_text.replace('\t', '').replace('\n', '')
            if self.search_keywords(resume, self.keyword_dict, self.negative_keyword_dict):
                doc_url = self.url + link_href + "&print=true"
                reference = doc_url.split("codigo=")[1][:7]
                item['country'] = self.country
                item['geo_code'] = self.geo_code
                item['level'] = self.level
                item['data_source'] = self.source
                item['title'] = resume
                item['reference'] = reference
                item['authorship'] = str(authorship)
                item['resume'] = resume
                item['publication_date'] = date
                item['enforcement_date'] = date
                item['url'] = self.url
                item['doc_url'] = doc_url
                # NOTE(review): the extension is appended without a dot
                # ('<ref>html'); looks like a missing '.', kept as-is to
                # preserve existing stored names — confirm before changing.
                item['doc_name'] = reference + 'html'
                item['doc_type'] = self.doc_type
                item['doc_class'] = ''
                item['file_urls'] = [doc_url]
                yield item
def parse(self, response):
    """Walk the morning notes of the DOF daily JSON; yield matching items plus detail requests."""
    # NOTE(review): only "NotasMatutinas" is processed; NotasVespertinas and
    # NotasExtraordinarias were disabled in a previous revision.
    for nota in json.loads(response.text)["NotasMatutinas"]:
        if 'titulo' not in nota:
            continue
        text_to_search = nota["titulo"]
        matched = self.search_keywords(text_to_search, self.keyword_dict, self.negative_keyword_dict)
        if matched and nota['nombreCodOrgaUno'] != self.authorship_to_exclude:
            item = ScrapyOfficialNewspapersItem()
            item['country'] = self.country
            item['state'] = self.state_name
            item['data_source'] = self.source
            item['law_class'] = nota['tipoNota'] if 'tipoNota' in nota else ''
            item['title'] = nota["titulo"]
            codigo_nota = nota['codNota']
            item['reference'] = codigo_nota
            if 'codOrgaDos' in nota:
                item['authorship'] = nota['nombreCodOrgaUno'] + "/" + nota['codOrgaDos']
            else:
                item['authorship'] = nota['nombreCodOrgaUno']
            item['summary'] = ""
            item['publication_date'] = nota['fecha']
            item['url'] = self.start_url
            doc_url = f'https://www.dof.gob.mx/nota_detalle.php?codigo={codigo_nota}&fecha={self.day_doc_url}&print=true'
            doc_name = self.HSA1_encoding(doc_url) + ".txt"
            item['doc_name'] = doc_name
            yield item
            # Follow the printable detail page so parse_other can fetch the body.
            yield scrapy.Request(doc_url,
                                 dont_filter=True,
                                 callback=self.parse_other,
                                 cb_kwargs=dict(document=doc_name, url=doc_url))
def parse_other(self, response):
    """Parse an El Peruano search-results JSON page and yield keyword-matching norms."""
    item = ScrapyOfficialNewspapersItem()
    for norm in json.loads(response.text)['hits']:
        meta = norm['metadata']
        ref = meta['subjectOrganizationCode'] if 'subjectOrganizationCode' in meta else ""
        try:
            item['reference'] = ref
            item['doc_url'] = 'https://busquedas.elperuano.pe/download/url/' + str(meta['slug'])
            text_to_search = (self.clean_text(meta['description']) + " "
                              + self.clean_text(meta['slug']) + " "
                              + self.clean_text(norm['highlightedText']))
            if self.search_keywords(text_to_search, self.keyword_dict, self.negative_keyword_dict):
                item['country'] = self.country
                item['geo_code'] = self.geo_code
                item['level'] = self.level
                item['data_source'] = self.source
                item['authorship'] = meta['editionName']
                item['resume'] = self.clean_text(meta['description'])
                item['title'] = self.clean_text(meta['description'])
                item['publication_date'] = meta['publicationDate']['formatted']
                item['enforcement_date'] = item['publication_date']
                item['url'] = 'https://busquedas.elperuano.pe' + str(norm['url_link'])
                item['doc_name'] = ('PER/policy_' + meta['name'])
                item['doc_type'] = 'pdf'
                item['doc_class'] = meta['industry']
                item['file_urls'] = [item['doc_url']]
                yield item
        except Exception as e:
            # Best-effort per-record handling: log and keep processing the rest.
            print(e)
def parse_other(self, response):
    """Populate an item from the record page tables; yield it only when parse_table succeeds."""
    self.counter += 1
    item = ScrapyOfficialNewspapersItem()
    item['country'] = self.country
    item['state'] = self.state_name
    item['data_source'] = self.source
    item['url'] = response.url
    record_table = response.xpath('//*[@id="menu1"]/table').get()
    item, good = self.parse_table(item, record_table)
    # The document link is embedded in the second HTML comment under #menu2.
    doc_url = response.xpath('//*[@id="menu2"]/comment()[2]').get().split("\"")[1]
    item['file_urls'] = [doc_url]
    item['doc_name'] = self.HSA1_encoding(doc_url)
    if good:
        yield item
def parse_month_bulletin(self, response):
    """Scan the first table of a monthly bulletin page and yield matching rows."""
    item = ScrapyOfficialNewspapersItem()
    # Only the first table is used; the page may contain a second one that is ignored.
    bulletin_table = response.css('table')[0]
    for row in bulletin_table.css('tbody').css('tr'):
        # Collapse internal whitespace in every cell, then drop empty cells.
        cells = [" ".join(cell.split()) for cell in row.css('td::text').getall()]
        cells = [cell for cell in cells if cell]
        doc_url = response.urljoin(row.css('a::attr(href)').extract_first())
        text_to_search = cells[3]
        if self.search_keywords(text_to_search, self.keyword_dict,
                                self.negative_keyword_dict) or self.scrapable == "True":
            item['country'] = self.country
            item['state'] = self.state_name
            item['data_source'] = self.source
            item['law_class'] = ''
            item['title'] = cells[3]
            item['reference'] = ''
            item['authorship'] = cells[1]
            item['summary'] = ''
            item['publication_date'] = parse(cells[2].split(' ')[0]).strftime('%Y-%m-%d')
            item['url'] = response.url
            item['doc_url'] = doc_url
            item['doc_name'] = self.HSA1_encoding(doc_url)
            yield item
def parse(self, response):
    """Yield one item per hit of the El Peruano search JSON (no keyword filtering)."""
    item = ScrapyOfficialNewspapersItem()
    for norm in json.loads(response.text)['hits']:
        meta = norm['metadata']
        item['country'] = self.country
        item['geo_code'] = self.geo_code
        item['level'] = self.level
        item['source'] = self.source
        item['title'] = meta['subjectCode']
        item['authorship'] = ''
        item['resume'] = meta['description']
        item['reference'] = meta['originalDocumentId']
        item['publication_date'] = meta['publicationDate']['formatted']
        item['enforcement_date'] = ''
        item['url'] = 'https://busquedas.elperuano.pe' + str(norm['url_link'])
        item['doc_url'] = 'https://busquedas.elperuano.pe/download/url/' + str(meta['slug'])
        item['doc_name'] = 'PER/policy_' + meta['name']
        item['doc_type'] = 'pdf'
        yield item
def parse_other(self, response):
    """Parse an India Code act-details page.

    Yields the act itself when it passes the keyword filters, plus any
    not-yet-seen subordinate documents listed in the per-type modal tables.
    """
    no_long_title = True
    self.state_name = "Federal"
    ministry = ""
    department = ""
    # Walk the details table and pick out the labelled fields.
    for row in response.css('#tb2 table tr'):
        if "Location" in row.css('td')[0].get():
            self.state_name = row.css('td::text')[1].get()
        if "Act ID" in row.css('td')[0].get():
            reference = row.css('td::text')[1].get()
        if "Enactment" in row.css('td::text')[0].get():
            publication_date = row.css('td::text')[1].get()
        if "Short Title" in row.css('td::text')[0].get():
            title = self.remove_html_tags(row.css('td')[1].get()).replace(" ", " ").strip()
        if "Long Title" in row.css('td::text')[0].get():
            summary = self.remove_html_tags(row.css('td')[1].get()).strip()
            no_long_title = False
        if "Ministry" in row.css('td')[0].get():
            ministry = row.css('td::text')[1].get()
        if "Department" in row.css('td')[0].get():
            department = row.css('td::text')[1].get()
    if no_long_title:
        summary = ''
    text_to_search = title + " " + summary
    if self.search_keywords(text_to_search, self.keyword_dict, self.negative_keyword_dict):
        self.debug(title)
        item = ScrapyOfficialNewspapersItem()
        item['country'] = self.country
        item['state'] = self.state_name
        item['data_source'] = self.source
        item['law_class'] = "Act"
        item['title'] = title
        item['reference'] = reference
        item['authorship'] = ministry + "/" + department
        item['summary'] = summary
        item['publication_date'] = publication_date
        item['url'] = self.details_url
        doc_url = response.urljoin(response.css('p#short_title').css('a::attr(href)').get())
        item['file_urls'] = [doc_url]
        # Append the last four characters of the fragment-free URL (presumably
        # the file extension — TODO confirm).
        item['doc_name'] = self.HSA1_encoding(doc_url) + doc_url.split('#')[0][-4:]
        yield item
    # Subordinate documents live in one modal per code type.
    for code_key in self.code_type:
        modal_id = "myModal" + str(code_key)
        for row in response.css(f'div#{modal_id}').css(f'table#myTable{self.code_type[code_key]} tr'):
            check_title = row.css('td::text')[1].get()
            if check_title not in self.done_dictionary:
                # Dedup: remember the title so the same document is emitted once.
                self.done_dictionary[check_title] = 0
                item = ScrapyOfficialNewspapersItem()
                item['country'] = self.country
                item['state'] = self.state_name
                item['data_source'] = self.source
                item['law_class'] = self.code_type_url[code_key]
                item['title'] = row.css('td::text')[1].get()
                item['reference'] = ""
                item['authorship'] = ""
                item['summary'] = ""
                item['publication_date'] = row.css('td::text')[0].get()
                item['url'] = self.details_url
                link = row.css('td')[2].css('a::attr(href)').get()
                doc_id = link.split("=")[1].split("/")[0]
                doc_file = link.split("=")[2].strip()
                doc_url = "https://upload.indiacode.nic.in/showfile?actid=" + doc_id + "&type=" + self.code_type_url[code_key].lower() + "&filename=" + doc_file
                item['file_urls'] = [doc_url]
                item['doc_name'] = self.HSA1_encoding(doc_url) + doc_url.split('#')[0][-4:]
                yield item