def parse_other(self, response):
    """Parse a GovInfo granule-summary JSON response; yield an item when keywords match."""
    summary_full = response.json()
    # self.debug(summary_full)
    abstract = summary_full["summary"] if "summary" in summary_full else ""
    text_to_search = summary_full["title"] + " " + abstract
    if not self.search_keywords(text_to_search, self.keyword_dict, self.negative_keyword_dict):
        return
    item = ScrapyOfficialNewspapersItem()
    item['country'] = self.country
    item['state'] = self.state_name
    item['data_source'] = self.source
    item['law_class'] = summary_full['category']
    item['title'] = summary_full['title']
    item['reference'] = summary_full['granuleId']
    item['authorship'] = summary_full['agencies'][0]['name']
    item['summary'] = abstract
    item['publication_date'] = summary_full['dateIssued']
    item['url'] = summary_full['download']['txtLink'].replace('htm', 'summary?api_key=')
    # The text download needs the API key appended; the same URL doubles as the
    # basis for the stored document name.
    doc_url = summary_full['download']['txtLink'] + f'?api_key={self.API_key}'
    # self.debug(doc_url)
    item['file_urls'] = [doc_url]
    item['doc_name'] = self.HSA1_encoding(doc_url) + ".txt"
    yield item
def parse_other(self, response):
    """Populate an item from the record page tables; return it only when parse_table succeeds."""
    self.counter += 1
    item = ScrapyOfficialNewspapersItem()
    item['country'] = self.country
    item['geo_code'] = self.geo_code
    item['level'] = self.level
    item['data_source'] = self.source
    item['url'] = response.url
    item['doc_type'] = 'pdf'
    item['doc_class'] = ""
    record_table = response.xpath('//*[@id="menu1"]/table').get()
    item, good = self.parse_table(item, record_table)
    # The document link is embedded in the second HTML comment under #menu2.
    doc_url = response.xpath('//*[@id="menu2"]/comment()[2]').get().split("\"")[1]
    item['file_urls'] = [doc_url]
    item['doc_url'] = doc_url
    if good:
        return item
def parse_other(self, response):
    """Filter LeyChile norms by keywords and yield populated items."""
    item = ScrapyOfficialNewspapersItem()
    for norm in json.loads(response.text)[0]:
        text_to_search = self.clean_text(norm['TITULO_NORMA']) + self.clean_text(norm['DESCRIPCION'])
        if not self.search_keywords(text_to_search, self.keyword_dict, self.negative_keyword_dict):
            continue
        norm_id = norm['IDNORMA']
        norm_url = f'https://www.bcn.cl/leychile/navegar?idNorma={norm_id}'
        doc_name = f'CHL/policy_{norm_id}'
        doc_type = 'txt'
        # Publication dates arrive in Spanish; normalise to ISO format.
        pub_date_format = parse(norm['FECHA_PUBLICACION'], ['es']).strftime('%Y-%m-%d')
        doc_path = str(norm_id) + '.' + str(pub_date_format) + '.0.0%23'
        doc_url = f'https://nuevo.leychile.cl/servicios/Consulta/Exportar?radioExportar=Normas&exportar_formato={doc_type}&nombrearchivo={doc_name}&exportar_con_notas_bcn=False&exportar_con_notas_originales=False&exportar_con_notas_al_pie=False&hddResultadoExportar={doc_path}'
        item['country'] = self.country
        item['state'] = self.state_name
        item['data_source'] = self.source
        item["law_class"] = norm['NORMA']
        item['title'] = norm['TITULO_NORMA']
        item['reference'] = norm['TIPO']
        item['authorship'] = norm['ORGANISMO']
        item['summary'] = norm['DESCRIPCION']
        item['publication_date'] = pub_date_format
        item['url'] = norm_url
        item['doc_url'] = doc_url
        item['doc_name'] = self.HSA1_encoding(doc_url)
        # Normalise empty/None values to False across every populated field.
        for field in item:
            item[field] = item[field] or False
        yield item
def parse(self, response):
    """Yield one item per norm in the LeyChile JSON listing (no keyword filtering)."""
    item = ScrapyOfficialNewspapersItem()
    for norm in json.loads(response.text)[0]:
        norm_id = norm['IDNORMA']
        norm_url = f'https://www.bcn.cl/leychile/navegar?idNorma={norm_id}'
        doc_name = f'CHL/policy_{norm_id}'
        doc_type = 'pdf'
        publication_date = norm['FECHA_PUBLICACION']
        # Publication dates arrive in Spanish; normalise to ISO format.
        pub_date_format = parse(publication_date, ['es']).strftime('%Y-%m-%d')
        doc_path = str(norm_id) + '.' + str(pub_date_format) + '.0.0%23'
        doc_url = f'https://nuevo.leychile.cl/servicios/Consulta/Exportar?radioExportar=Normas&exportar_formato={doc_type}&nombrearchivo={doc_name}&exportar_con_notas_bcn=False&exportar_con_notas_originales=False&exportar_con_notas_al_pie=False&hddResultadoExportar={doc_path}'
        item['country'] = self.country
        item['geo_code'] = self.geo_code
        item['level'] = self.level
        item['source'] = self.source
        item['title'] = norm['TITULO_NORMA']
        item['authorship'] = norm['ORGANISMO']
        item['resume'] = norm['DESCRIPCION']
        item['publication_date'] = pub_date_format
        item['enforcement_date'] = norm['FECHA_PROMULGACION']
        # Fix: the original assigned norm_id to 'reference' and then immediately
        # overwrote it with None (dead store). The final value None is kept to
        # preserve downstream behaviour. NOTE(review): norm_id may have been the
        # intended reference — confirm against the pipeline before changing.
        item['reference'] = None
        item['url'] = norm_url
        item['doc_url'] = doc_url
        item['file_urls'] = [doc_url]
        item['doc_name'] = doc_name + '.' + doc_type
        item['doc_type'] = doc_type
        yield item
def parse_other(self, response):
    """Parse an El Peruano search-results JSON page and yield keyword-matching norms.

    Fix: the original built 'doc_name' from a bare name ``doc_url`` that was never
    defined (the URL was only stored on the item), so every matching record raised
    NameError inside the try block and was silently dropped by the broad except.
    The URL is now bound to a local once and reused.
    """
    item = ScrapyOfficialNewspapersItem()
    for norm in json.loads(response.text)['hits']:
        if 'subjectOrganizationCode' in norm['metadata']:
            ref = norm['metadata']['subjectOrganizationCode']
        else:
            ref = ""
        try:
            item['reference'] = ref
            doc_url = 'https://busquedas.elperuano.pe/download/url/' + str(norm['metadata']['slug'])
            item['doc_url'] = doc_url
            text_to_search = self.clean_text(norm['metadata']['description']) + " " + self.clean_text(
                norm['metadata']['slug']) + " " + self.clean_text(norm['highlightedText'])
            if self.search_keywords(text_to_search, self.keyword_dict, self.negative_keyword_dict):
                item['country'] = self.country
                item['state'] = self.state_name
                item["law_class"] = ""  # TODO: look at the right field when adjusted.
                item['data_source'] = self.source
                item['authorship'] = norm['metadata']['editionName']
                item['summary'] = self.clean_text(norm['metadata']['description'])
                item['title'] = self.clean_text(norm['metadata']['description'])
                item['publication_date'] = norm['metadata']['publicationDate']['formatted']
                item['enforcement_date'] = item['publication_date']
                item['url'] = 'https://busquedas.elperuano.pe' + str(norm['url_link'])
                item['doc_name'] = self.HSA1_encoding(doc_url)
                yield item
        except Exception as e:
            # Best-effort per-record handling: log and keep processing the rest.
            print(e)
def parse(self, response):
    """Parse a daily gazette index page; skip empty dates, yield keyword-matching entries."""
    if len(response.xpath("//*[contains(text(), 'No hay datos para la fecha')]")):
        print("No publication in this date")
        return
    # The date is encoded in the three query parameters of the request URL.
    page_url = response.url
    year = int(page_url.split("=")[1][:4])
    month = int(page_url.split("=")[2][:2])
    day = int(page_url.split("=")[3][:2])
    date = datetime.datetime(year=year, month=month, day=day)
    item = ScrapyOfficialNewspapersItem()
    rows = response.xpath('/html//td[@class = "subtitle_azul"]')[0].xpath('//tr').xpath('following-sibling::tr[1]')
    authorship = None
    for row in rows:
        # Rows alternate between authorship headers and entry links; carry the
        # most recent non-empty header forward.
        header_text = row.xpath('td[@class = "subtitle_azul"]/text()').get()
        link_text = row.xpath('td/a[@class = "enlaces"]/text()').get()
        link_href = row.xpath('td/a[@class = "enlaces"]/@href').get()
        if header_text is not None and authorship != header_text:
            authorship = header_text
        if link_text and link_text != "Ver más":
            resume = link_text.replace('\t', '').replace('\n', '')
            if self.search_keywords(resume, self.keyword_dict, self.negative_keyword_dict):
                doc_url = self.url + link_href + "&print=true"
                reference = doc_url.split("codigo=")[1][:7]
                item['country'] = self.country
                item['geo_code'] = self.geo_code
                item['level'] = self.level
                item['data_source'] = self.source
                item['title'] = resume
                item['reference'] = reference
                item['authorship'] = str(authorship)
                item['resume'] = resume
                item['publication_date'] = date
                item['enforcement_date'] = date
                item['url'] = self.url
                item['doc_url'] = doc_url
                # NOTE(review): the extension is appended without a dot
                # ('<ref>html'); looks like a missing '.', kept as-is to
                # preserve existing stored names — confirm before changing.
                item['doc_name'] = reference + 'html'
                item['doc_type'] = self.doc_type
                item['doc_class'] = ''
                item['file_urls'] = [doc_url]
                yield item
def parse(self, response):
    """Walk the morning notes of the DOF daily JSON; yield matching items plus detail requests."""
    # NOTE(review): only "NotasMatutinas" is processed; NotasVespertinas and
    # NotasExtraordinarias were disabled in a previous revision.
    for nota in json.loads(response.text)["NotasMatutinas"]:
        if 'titulo' not in nota:
            continue
        text_to_search = nota["titulo"]
        matched = self.search_keywords(text_to_search, self.keyword_dict, self.negative_keyword_dict)
        if matched and nota['nombreCodOrgaUno'] != self.authorship_to_exclude:
            item = ScrapyOfficialNewspapersItem()
            item['country'] = self.country
            item['state'] = self.state_name
            item['data_source'] = self.source
            item['law_class'] = nota['tipoNota'] if 'tipoNota' in nota else ''
            item['title'] = nota["titulo"]
            codigo_nota = nota['codNota']
            item['reference'] = codigo_nota
            if 'codOrgaDos' in nota:
                item['authorship'] = nota['nombreCodOrgaUno'] + "/" + nota['codOrgaDos']
            else:
                item['authorship'] = nota['nombreCodOrgaUno']
            item['summary'] = ""
            item['publication_date'] = nota['fecha']
            item['url'] = self.start_url
            doc_url = f'https://www.dof.gob.mx/nota_detalle.php?codigo={codigo_nota}&fecha={self.day_doc_url}&print=true'
            doc_name = self.HSA1_encoding(doc_url) + ".txt"
            item['doc_name'] = doc_name
            yield item
            # Follow the printable detail page so parse_other can fetch the body.
            yield scrapy.Request(doc_url,
                                 dont_filter=True,
                                 callback=self.parse_other,
                                 cb_kwargs=dict(document=doc_name, url=doc_url))
def parse_other(self, response):
    """Parse an El Peruano search-results JSON page and yield keyword-matching norms."""
    item = ScrapyOfficialNewspapersItem()
    for norm in json.loads(response.text)['hits']:
        meta = norm['metadata']
        ref = meta['subjectOrganizationCode'] if 'subjectOrganizationCode' in meta else ""
        try:
            item['reference'] = ref
            item['doc_url'] = 'https://busquedas.elperuano.pe/download/url/' + str(meta['slug'])
            text_to_search = (self.clean_text(meta['description']) + " "
                              + self.clean_text(meta['slug']) + " "
                              + self.clean_text(norm['highlightedText']))
            if self.search_keywords(text_to_search, self.keyword_dict, self.negative_keyword_dict):
                item['country'] = self.country
                item['geo_code'] = self.geo_code
                item['level'] = self.level
                item['data_source'] = self.source
                item['authorship'] = meta['editionName']
                item['resume'] = self.clean_text(meta['description'])
                item['title'] = self.clean_text(meta['description'])
                item['publication_date'] = meta['publicationDate']['formatted']
                item['enforcement_date'] = item['publication_date']
                item['url'] = 'https://busquedas.elperuano.pe' + str(norm['url_link'])
                item['doc_name'] = ('PER/policy_' + meta['name'])
                item['doc_type'] = 'pdf'
                item['doc_class'] = meta['industry']
                item['file_urls'] = [item['doc_url']]
                yield item
        except Exception as e:
            # Best-effort per-record handling: log and keep processing the rest.
            print(e)
def parse_other(self, response):
    """Populate an item from the record page tables; yield it only when parse_table succeeds."""
    self.counter += 1
    item = ScrapyOfficialNewspapersItem()
    item['country'] = self.country
    item['state'] = self.state_name
    item['data_source'] = self.source
    item['url'] = response.url
    record_table = response.xpath('//*[@id="menu1"]/table').get()
    item, good = self.parse_table(item, record_table)
    # The document link is embedded in the second HTML comment under #menu2.
    doc_url = response.xpath('//*[@id="menu2"]/comment()[2]').get().split("\"")[1]
    item['file_urls'] = [doc_url]
    item['doc_name'] = self.HSA1_encoding(doc_url)
    if good:
        yield item
def parse_month_bulletin(self, response):
    """Scan the first table of a monthly bulletin page and yield matching rows."""
    item = ScrapyOfficialNewspapersItem()
    # Only the first table is used; the page may contain a second one that is ignored.
    bulletin_table = response.css('table')[0]
    for row in bulletin_table.css('tbody').css('tr'):
        # Collapse internal whitespace in every cell, then drop empty cells.
        cells = [" ".join(cell.split()) for cell in row.css('td::text').getall()]
        cells = [cell for cell in cells if cell]
        doc_url = response.urljoin(row.css('a::attr(href)').extract_first())
        text_to_search = cells[3]
        if self.search_keywords(text_to_search, self.keyword_dict,
                                self.negative_keyword_dict) or self.scrapable == "True":
            item['country'] = self.country
            item['state'] = self.state_name
            item['data_source'] = self.source
            item['law_class'] = ''
            item['title'] = cells[3]
            item['reference'] = ''
            item['authorship'] = cells[1]
            item['summary'] = ''
            item['publication_date'] = parse(cells[2].split(' ')[0]).strftime('%Y-%m-%d')
            item['url'] = response.url
            item['doc_url'] = doc_url
            item['doc_name'] = self.HSA1_encoding(doc_url)
            yield item
def parse(self, response):
    """Yield one item per hit of the El Peruano search JSON (no keyword filtering)."""
    item = ScrapyOfficialNewspapersItem()
    for norm in json.loads(response.text)['hits']:
        meta = norm['metadata']
        item['country'] = self.country
        item['geo_code'] = self.geo_code
        item['level'] = self.level
        item['source'] = self.source
        item['title'] = meta['subjectCode']
        item['authorship'] = ''
        item['resume'] = meta['description']
        item['reference'] = meta['originalDocumentId']
        item['publication_date'] = meta['publicationDate']['formatted']
        item['enforcement_date'] = ''
        item['url'] = 'https://busquedas.elperuano.pe' + str(norm['url_link'])
        item['doc_url'] = 'https://busquedas.elperuano.pe/download/url/' + str(meta['slug'])
        item['doc_name'] = 'PER/policy_' + meta['name']
        item['doc_type'] = 'pdf'
        yield item
def parse_other(self, response):
    """Parse an India Code act-details page.

    Yields the act itself when it passes the keyword filters, plus any
    not-yet-seen subordinate documents listed in the per-type modal tables.
    """
    no_long_title = True
    self.state_name = "Federal"
    ministry = ""
    department = ""
    # Walk the details table and pick out the labelled fields.
    for row in response.css('#tb2 table tr'):
        if "Location" in row.css('td')[0].get():
            self.state_name = row.css('td::text')[1].get()
        if "Act ID" in row.css('td')[0].get():
            reference = row.css('td::text')[1].get()
        if "Enactment" in row.css('td::text')[0].get():
            publication_date = row.css('td::text')[1].get()
        if "Short Title" in row.css('td::text')[0].get():
            title = self.remove_html_tags(row.css('td')[1].get()).replace(" ", " ").strip()
        if "Long Title" in row.css('td::text')[0].get():
            summary = self.remove_html_tags(row.css('td')[1].get()).strip()
            no_long_title = False
        if "Ministry" in row.css('td')[0].get():
            ministry = row.css('td::text')[1].get()
        if "Department" in row.css('td')[0].get():
            department = row.css('td::text')[1].get()
    if no_long_title:
        summary = ''
    text_to_search = title + " " + summary
    if self.search_keywords(text_to_search, self.keyword_dict, self.negative_keyword_dict):
        self.debug(title)
        item = ScrapyOfficialNewspapersItem()
        item['country'] = self.country
        item['state'] = self.state_name
        item['data_source'] = self.source
        item['law_class'] = "Act"
        item['title'] = title
        item['reference'] = reference
        item['authorship'] = ministry + "/" + department
        item['summary'] = summary
        item['publication_date'] = publication_date
        item['url'] = self.details_url
        doc_url = response.urljoin(response.css('p#short_title').css('a::attr(href)').get())
        item['file_urls'] = [doc_url]
        # Append the last four characters of the fragment-free URL (presumably
        # the file extension — TODO confirm).
        item['doc_name'] = self.HSA1_encoding(doc_url) + doc_url.split('#')[0][-4:]
        yield item
    # Subordinate documents live in one modal per code type.
    for code_key in self.code_type:
        modal_id = "myModal" + str(code_key)
        for row in response.css(f'div#{modal_id}').css(f'table#myTable{self.code_type[code_key]} tr'):
            check_title = row.css('td::text')[1].get()
            if check_title not in self.done_dictionary:
                # Dedup: remember the title so the same document is emitted once.
                self.done_dictionary[check_title] = 0
                item = ScrapyOfficialNewspapersItem()
                item['country'] = self.country
                item['state'] = self.state_name
                item['data_source'] = self.source
                item['law_class'] = self.code_type_url[code_key]
                item['title'] = row.css('td::text')[1].get()
                item['reference'] = ""
                item['authorship'] = ""
                item['summary'] = ""
                item['publication_date'] = row.css('td::text')[0].get()
                item['url'] = self.details_url
                link = row.css('td')[2].css('a::attr(href)').get()
                doc_id = link.split("=")[1].split("/")[0]
                doc_file = link.split("=")[2].strip()
                doc_url = "https://upload.indiacode.nic.in/showfile?actid=" + doc_id + "&type=" + self.code_type_url[code_key].lower() + "&filename=" + doc_file
                item['file_urls'] = [doc_url]
                item['doc_name'] = self.HSA1_encoding(doc_url) + doc_url.split('#')[0][-4:]
                yield item