def __init__(self):
        self.databaseController = ApiRequest()
        self.arrayNoticias = []
        self.indexData = 0
        self.name_crawl = 'LOG G1: '
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--ignore-certificate-errors')
        #self.driver = webdriver.Chrome(
        #ChromeDriverManager().install(), options=chrome_options)
        #self.driver2 = webdriver.Chrome(
        #ChromeDriverManager().install(), options=chrome_options)

        self.driver = webdriver.Remote("http://localhost:4444/wd/hub",
                                       options=chrome_options)
        self.driver2 = webdriver.Remote("http://localhost:4444/wd/hub",
                                        options=chrome_options)

        self.dates = [
            # {'dataInicial': str(ano) + '-01-01', 'dataFinal':str(ano) + '-01-31'},
            # {'dataInicial': str(ano) + '-02-01', 'dataFinal':str(ano) + '-02-29'},
            # {'dataInicial': str(ano) + '-03-01', 'dataFinal':str(ano) + '-03-31'},
            # {'dataInicial': str(ano) + '-04-01', 'dataFinal':str(ano) + '-04-30'},
            # {'dataInicial': str(ano) + '-05-01', 'dataFinal':str(ano) + '-05-31'},
            # {'dataInicial':  str(ano) + '-06-01', 'dataFinal': str(ano) + '-06-30'},
            # {'dataInicial': str(ano) + '-07-01',
            # 'dataFinal': str(ano) + '-07-31'},
            # {'dataInicial': str(ano) + '-08-01',
            # 'dataFinal': str(ano) + '-08-31'}
            #  {'dataInicial': str(ano) + '-09-01',
            # 'dataFinal': str(ano) + '-09-30'},
            #  {'dataInicial': str(ano) + '-10-29',
            #'dataFinal': str(ano) + '-10-31'}
            # {'dataInicial': '2020-12-17',
            # 'dataFinal': str(ano) + '-01-04'}
            {
                'dataInicial': str(data_inicial),
                'dataFinal': str(data_final)
            }
        ]
        start_at = 0

        print(self.name_crawl + 'DATA ' + self.dates[start_at]['dataInicial'] +
              ' ' + self.dates[start_at]['dataFinal'])
        self.parse('https://g1.globo.com/busca/?q=coronavirus&page=1',
                   start_at)
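
The dates list above references module-level data_inicial and data_final (and the commented-out entries reference ano), none of which appear in this excerpt. A minimal sketch of how they could be defined, assuming a rolling 30-day window (the real values are not shown in the source):

from datetime import date, timedelta

# Hypothetical module-level values referenced by the spider's date window;
# the actual project may hard-code these or read them from configuration.
ano = date.today().year
data_final = date.today()
data_inicial = data_final - timedelta(days=30)
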
Example #2
    def parse_page(self, response):

        databaseController = ApiRequest()
        item = response.meta["item"]

        data_request = {'link': item['link']}
        is_inDatabase = databaseController.make_request('check_exist_database', data_request)
        if is_inDatabase:
            return

        tools = CustonTools()
        wordList = tools.get_key_word_list()
        tags = []
        formattedData = tools.format_dia(item['dia'])
        item['dia'] = formattedData

        self.limit_time = tools.compare_dates(formattedData)

        if self.limit_time:
            return print(self.name_crawl + 'Finalizou busca dentro do periodo de 1 mes')

        html = response.css('div.item-page').get()
        for word in wordList:

            isWordInHtml = tools.check_word_in_html(word)(html)

            if isWordInHtml is None:
                pass
            else:
                tags.append(word)
                item['noticia'] = tools.cleanHTML(html)
                item['tags'] = ','.join(str(tag) for tag in tags)

        print(self.name_crawl + 'NOVA ' + item['titulo'])
        if not databaseController.make_request('inserir', item):
            print(self.name_crawl + 'Erro ao salvar no banco de dados')
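
The loop above calls tools.check_word_in_html(word)(html), so check_word_in_html apparently returns a matcher that is applied to the HTML and whose result is compared against None. CustonTools is not shown in these examples; a minimal sketch of such a helper, assuming a case-insensitive regex search, could look like this:

import re

class CustonTools:
    def check_word_in_html(self, word):
        # Returns a callable: callers do check_word_in_html(word)(html) and
        # test the result against None, so the matcher returns a re.Match
        # object when the keyword occurs and None otherwise.
        def matcher(html):
            if html is None:
                return None
            return re.search(re.escape(word), html, flags=re.IGNORECASE)
        return matcher
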
Example #3
 def __init__(self):
     self.databaseController = ApiRequest()
     self.limit_time = False
Example #4
class QuotesSpider(scrapy.Spider):
    name = "get_correiobraziliense"
    start_urls = [
        'https://www.correiobraziliense.com.br/busca/coronavirus?json=63c055b-c8a7-4010-92c6-01803d6e752e&offset=0',
    ]

    def __init__(self):
        self.databaseController = ApiRequest()
        self.limit_time = False

    def parse(self, response):
        if self.limit_time:
            return print('Finalizou busca dentro do periodo de 1 mes')
        request = None
        try:
            request = requests.get(url=response.url)
        except requests.exceptions.RequestException as e:
            # Without a successful response there is nothing to parse below.
            print(e)
            return

        # extracting data in json format
        data = request.json()
        try:
            nextPage = data['next']

            for item in get_data(data['news']):
                data_request = {'link': item['link']}
                if not self.databaseController.make_request(
                        'check_exist_database', data_request):
                    print('NOVA ' + item['titulo'])
                    yield scrapy.Request(item['link'],
                                         meta={"item": item},
                                         callback=self.extract_html)

            print(nextPage)
            yield scrapy.Request(nextPage, callback=self.parse)

        except:
            print('finalizou')

    def extract_html(self, response):
        tools = CustonTools()
        tags = []
        item = response.meta["item"]
        formatted_data = tools.format_dia(item['dia'])
        self.limit_time = tools.compare_dates(formatted_data)

        item['dia'] = formatted_data
        word_list = tools.get_key_word_list()
        try:
            html = response.xpath(
                '//div[@class="txt-serif js-article-box article-box article-box-capitalize mt-15"]'
            ).get()
        except:
            # Bail out: html would be undefined below if the selector failed.
            print('falhou ao obter html ' + item['link'])
            return
        for word in word_list:

            isWordInHtml = tools.check_word_in_html(word)(html)

            isWordInTitulo = tools.check_word_in_html(word)(item['titulo'])

            if isWordInHtml is None and isWordInTitulo is None:
                pass
            else:

                tags.append(word)
                # print('contem tag')
                item['tags'] = ','.join(str(tag) for tag in tags)
                # print('tags adicionadas ' + word)

        if item['tags'] is not None:
            # print('vai limpar o html')
            try:
                item['noticia'] = tools.cleanHTML(html)
                # print('limpou noticia')
                # print('vai salvar no banco')
                if not self.databaseController.make_request('inserir', item):
                    print('Erro ao salvar no banco de dados')
                # print('chegou ao final do extract html')
            except Exception as ex:
                print('erro na noticia: ' + item['link'])
                print(ex)
                pass
        else:
            # print('Noticia não possui tags ' + item['link'])
            pass
        return item
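
This spider iterates get_data(data['news']) to turn each JSON entry into the item dict consumed by extract_html. The helper is not included in the excerpt; a rough sketch, with hypothetical JSON field names ('title', 'url', 'date', 'summary'), might be:

def get_data(news):
    # Maps raw search-API entries to the item layout used by the spiders.
    # The key names on the right-hand side are assumptions; the actual JSON
    # returned by the correiobraziliense search endpoint is not shown here.
    for entry in news:
        yield {
            'fonte': 'https://www.correiobraziliense.com.br',
            'titulo': entry.get('title'),
            'descricao': entry.get('summary'),
            'dia': entry.get('date'),
            'link': entry.get('url'),
            'noticia': None,
            'tags': None,
        }
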
Example #5
 def __init__(self, **kwargs):
     self.limit_time = False
     self.name_crawl = 'LOG OGLOBO:'
     self.databaseController = ApiRequest()
     self.link = 'https://oglobo.globo.com/api/v1/vermais/24219742/conteudo.json?pagina=0&versao=v1' \
                 '&tiposDeConteudo=materia,coluna,infografico,listaFatos,materiaEmCapitulos,linkExterno '
Example #6
class QuotesSpider(scrapy.Spider):
    name = "get_oglobo2"
    start_urls = [
        'https://scrapy.org',
    ]

    def __init__(self, **kwargs):
        self.limit_time = False
        self.name_crawl = 'LOG OGLOBO:'
        self.databaseController = ApiRequest()
        self.link = 'https://oglobo.globo.com/api/v1/vermais/24219742/conteudo.json?pagina=0&versao=v1' \
                    '&tiposDeConteudo=materia,coluna,infografico,listaFatos,materiaEmCapitulos,linkExterno '

    def parse(self, response):

        is_enabled = True

        while is_enabled and not self.limit_time:

            r = requests.get(self.link)

            # extracting data in json format
            data = r.json()[0]

            if len(data['conteudos']) > 0:
                parsed = urlparse.urlparse(data['paginacao']['urlProxima'])
                self.link = 'https://oglobo.globo.com/api/v1/vermais/24219742/conteudo.json?' + parsed.query
                for item in get_data(data['conteudos']):

                    data_request = {'link': item['link']}

                    if not self.databaseController.make_request(
                            'check_exist_database', data_request):

                        yield scrapy.Request(item['link'],
                                             meta={"item": item},
                                             callback=self.extract_html)

                print(self.name_crawl + 'nextPage ' + self.link)

            else:
                is_enabled = False

        print(self.name_crawl + 'Finalizou busca dentro do periodo de 1 mes')

    def extract_html(self, response):
        tools = CustonTools()
        tags = []
        item = response.meta["item"]

        dia = response.xpath('//div[@class="article__date"]/text()').get(
        ).split(' ')[0].replace(' ', '').replace('\n', ' ').replace('\r', '')
        item['descricao'] = response.xpath(
            '//div[@class="article__subtitle"]/text()').get().replace(
                '\n', ' ').replace('\r', '')

        formattedData = tools.format_dia(dia)
        self.limit_time = tools.compare_dates(formattedData)
        item['dia'] = formattedData

        wordList = tools.get_key_word_list()

        html = response.xpath(
            '//div[@class="article__content-container protected-content"]'
        ).get()
        if not html:
            print('pegou main')
            html = response.css('main').get()

        for word in wordList:

            isWordInHtml = tools.check_word_in_html(word)(html)

            if isWordInHtml is None:
                pass
            else:

                tags.append(word)
                # print('contem tag')
                item['tags'] = ','.join(str(tag) for tag in tags)
                # print('tags adicionadas ' + word)

        if item['tags'] is not None:
            # print('vai limpar o html')
            try:
                html = tools.clean_html_class_oglobo(html)
                item['noticia'] = tools.cleanHTML(html)

                print(self.name_crawl + 'NOVA ' + item['titulo'])
                if not self.databaseController.make_request('inserir', item):
                    print(self.name_crawl + 'Erro ao salvar no banco de dados')
                # print('armazenou noticia ' + item['titulo'])
                # print('chegou ao final do extract html')
            except Exception as ex:
                print('erro na noticia: ' + item['link'])
                print(ex)
                pass

        else:
            # print(self.name_crawl + 'Noticia não possui tags ' + item['link'])
            pass

        yield item
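
The spiders stop once tools.compare_dates(...) returns True, i.e. once an article falls outside the one-month crawl window mentioned in the log messages. A sketch of that check, assuming format_dia produces ISO dates ('YYYY-MM-DD'):

from datetime import datetime, timedelta

def compare_dates(formatted_date):
    # Returns True when the article date is older than a 30-day window,
    # which the spiders use as their stop condition (self.limit_time).
    article_date = datetime.strptime(formatted_date, '%Y-%m-%d')
    return article_date < datetime.now() - timedelta(days=30)
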
Example #7
 def save_to_database(self, item):
     self.databaseController = ApiRequest()
     self.databaseController.insert_to_database(item)
Example #8
class QuotesSpider(scrapy.Spider):
    name = "get_gauchazh"
    start_urls = [
        'https://scrapy.org',

    ]

    def __init__(self):
        self.arrayNoticias = []
        self.indexData = 0
        chrome_options = Options()
        chrome_options.add_argument("user-data-dir=selenium")
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--ignore-certificate-errors')
        self.driver = webdriver.Chrome(
            ChromeDriverManager().install(), options=chrome_options)
        self.driver2 = webdriver.Chrome(
            ChromeDriverManager().install(), options=chrome_options)

    def parse(self, link):

        self.driver.get(
            'https://gauchazh.clicrbs.com.br/search/?q=coronavirus')
        clickButtonNextPage = True

        # Keep clicking the "show more" button until it stops appearing, then
        # collect every result that is not yet in the database.
        while clickButtonNextPage:
            try:
                nextPage_button = self.driver.find_element_by_xpath(
                    "//button[@class='btn-show-more']")
                self.driver.execute_script("arguments[0].click();", nextPage_button)
                time.sleep(2)
            except:
                clickButtonNextPage = False
                print('carregou todas as noticias')
                response = self.driver.find_element_by_xpath(
                    "//div[@class='search-results']").find_elements_by_tag_name('ul')[1].find_elements_by_tag_name('div')
                for item in self.get_data(response):
                    if not self.check_exist_database(item['link']):
                        print('NOVA ' + item['titulo'])
                        self.extract_html(item)

    def get_data(self, response):
        for li in response:
            if li.get_attribute('id'):
                yield {

                    'fonte': 'https://gauchazh.clicrbs.com.br',
                    'titulo': li.find_element_by_tag_name('a').find_element_by_class_name('m-headline').text,
                    'descricao': li.find_element_by_class_name('support-text').text,
                    'dia': li.find_element_by_tag_name('time').get_attribute('textContent').split()[0],
                    'link': li.find_element_by_tag_name('a').get_attribute('href'),
                    'noticia': None,
                    'tags': None
                }

    def extract_html(self, item):
        self.driver2.get(item['link'])
        tools = CustonTools()
        wordList = tools.get_key_word_list()
        tags = []
        formatedData = tools.format_dia(item['dia'])
        item['dia'] = formatedData
        try:
            print()
            print()
            print(item['link'])
            #html = self.driver2.find_element_by_xpath("//div[@class='article-content sa_incontent']").get_attribute('innerHTML')
            time.sleep(2)
            html = self.driver2.find_element_by_xpath("//div[@class='article-content sa_incontent']").get_attribute('innerHTML')
        except:
            print('falhou')
            return

        wordList = tools.get_key_word_list()
        tags = []
        for word in wordList:

            isWordInHtml = tools.check_word_in_html(word)(html)

            if isWordInHtml is None:
                pass
            else:

                tags.append(word)
                #print('contem tag')
                item['tags'] = ','.join(str(tag) for tag in tags)
                #print('tags adicionadas ' + word)

        if item['tags'] is not None:
            #print('vai limpar o html')
            try:
                item['noticia'] = tools.cleanHTML(html)
                #print(item)
                #print('limpou noticia')
                #print('vai salvar no banco')
                self.save_to_database(item)
                #print('chegou ao final do extract html')
            except:
                #print('erro na noticia: ' + item['link'])
                pass
        else:
            print('Noticia não possui tags ' + item['link'])
        return item

    def check_exist_database(self, titulo):
        self.databaseController = ApiRequest()
        return self.databaseController.check_exist_database(titulo)

    def save_to_database(self, item):
        self.databaseController = ApiRequest()
        self.databaseController.insert_to_database(item)
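
All of the spiders talk to the persistence layer through ApiRequest.make_request('check_exist_database' | 'inserir', data) and treat the return value as a boolean. The class itself is not part of this excerpt; a rough sketch, assuming a small HTTP backend (the base URL and endpoint paths are placeholders):

import requests

class ApiRequest:
    # Placeholder base URL; the real service address is not shown in the source.
    BASE_URL = 'http://localhost:3000'

    def make_request(self, action, data):
        # POSTs the payload to an endpoint named after the action; callers
        # treat the response as truthy ("exists" / "saved") or falsy.
        try:
            r = requests.post('{}/{}'.format(self.BASE_URL, action),
                              json=data, timeout=10)
            r.raise_for_status()
            return r.json()
        except requests.exceptions.RequestException:
            return False
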
Example #9
 def check_exist_database(self, titulo):
     self.databaseController = ApiRequest()
     return self.databaseController.check_exist_database(titulo)
Example #10
 def __init__(self):
     self.databaseController = ApiRequest()
     self.limit_time = False
     self.name_crawl = 'LOG FOLHA SP: '
Example #11
class QuotesSpider(scrapy.Spider):
    name = "get_folhasp"
    start_urls = [
        'http://search.folha.uol.com.br/search?q=coronavirus&site=todos',
        # 'http://search.folha.uol.com.br/search?q=coronavirus&site=todos&sr=301'
    ]

    def __init__(self):
        self.databaseController = ApiRequest()
        self.limit_time = False
        self.name_crawl = 'LOG FOLHA SP: '

    def parse(self, response):
        if self.limit_time:
            return print(self.name_crawl +
                         'Finalizou busca dentro do periodo de 1 mes')
        try:

            try:
                nextPage = response.xpath(
                    '//ul[@class="c-pagination__list"]').css(
                        'li.c-pagination__arrow')[1].xpath('a/@href').get()
            except:
                nextPage = response.xpath(
                    '//ul[@class="c-pagination__list"]').css(
                        'li.c-pagination__arrow')[0].xpath('a/@href').get()

            for item in get_data(response):
                try:
                    data_request = {'link': item['link']}

                    if not self.databaseController.make_request(
                            'check_exist_database', data_request):
                        yield scrapy.Request(item['link'],
                                             meta={"item": item},
                                             callback=self.extract_html)
                except:
                    pass

            print(nextPage)

            yield scrapy.Request(nextPage, callback=self.parse)
        except:
            print('finalizou')

    def extract_html(self, response):

        tools = CustonTools()
        tags = []
        item = response.meta["item"]

        formattedData = tools.format_data_folhasp(item['dia'])

        self.limit_time = tools.compare_dates(formattedData)

        item['dia'] = formattedData

        wordList = tools.get_key_word_list()

        try:
            html = response.xpath('//div[@class="c-news__body"]').get()
        except:
            html = response.xpath('//div[@class="c-news__content"]').get()

        for word in wordList:

            isWordInHtml = tools.check_word_in_html(word)(html)
            isWordInTitulo = tools.check_word_in_html(word)(item['titulo'])

            if isWordInHtml is None and isWordInTitulo is None:
                pass
            else:

                tags.append(word)
                # print('contem tag')
                item['tags'] = ','.join(str(tag) for tag in tags)
                # print('tags adicionadas ' + word)

        if item['tags']:
            # print('vai limpar o html')
            try:
                html = tools.clean_html_class_folhasp(html)
                item['noticia'] = tools.cleanHTML(html)
                # return print(item['noticia'])
                # print('limpou noticia')
                print(self.name_crawl + 'NOVA ' + item['titulo'])
                # print(item['noticia'])
                # print('salvando')
                # print(item)
                if not self.databaseController.make_request('inserir', item):
                    print('Erro ao salvar no banco de dados')

                # print('chegou ao final do extract html')
            except Exception as ex:

                # print('erro na noticia: ' + item['link'])
                # print(ex)
                pass

        else:
            print(self.name_crawl + 'Noticia não possui tags ' + item['link'])

        yield item
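
Each spider class is registered under its own name attribute (get_folhasp, get_correio24horas, ...), so they are normally launched with scrapy crawl <name>. If one needs to run from a plain script instead, something along these lines should work (the import location of the spider class is an assumption):

from scrapy.crawler import CrawlerProcess

# Run one of the spiders defined above outside the Scrapy CLI.
process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
process.crawl(QuotesSpider)  # e.g. the "get_folhasp" spider
process.start()              # blocks until the crawl finishes
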
Example #12
    def __init__(self):

        self.databaseController = ApiRequest()
        self.limit_time = False
        self.name_crawl = 'LOG CORREIO 24 HORAS:'
Example #13
class QuotesSpider(scrapy.Spider):
    name = "get_correio24horas"
    start_urls = [
        'https://www.correio24horas.com.br/resultado-de-pesquisa/pagina/1/busca/coronavirus/',
    ]

    def __init__(self):

        self.databaseController = ApiRequest()
        self.limit_time = False
        self.name_crawl = 'LOG CORREIO 24 HORAS:'

    def parse(self, response):

        if self.limit_time:
            return print(self.name_crawl +
                         'Finalizou busca dentro do periodo de 1 mes')

        nextPage = response.xpath(
            '//div[@class="pagination-responsivo--next"]/a/@href').get()

        print(self.name_crawl + 'Nextpage: ' + nextPage)
        for item in get_data(response):
            data_request = {'link': item['link']}
            if not self.databaseController.make_request(
                    'check_exist_database', data_request):
                yield scrapy.Request(item['link'],
                                     meta={"item": item},
                                     callback=self.extract_html)

        yield scrapy.Request(nextPage, callback=self.parse)

    def extract_html(self, response):
        tools = CustonTools()
        tags = []
        item = response.meta["item"]

        formatted_date = tools.format_dia(item['dia'])
        item['dia'] = formatted_date
        self.limit_time = tools.compare_dates(formatted_date)

        wordList = tools.get_key_word_list()
        item['descricao'] = response.xpath(
            '//div[@class="noticias-single__description visible-lg"]/text()'
        ).get()

        html = response.xpath('//div[@class="noticias-single__content"]').get()

        for word in wordList:

            isWordInHtml = tools.check_word_in_html(word)(html)
            isWordInTitulo = tools.check_word_in_html(word)(item['titulo'])

            if isWordInHtml is None and isWordInTitulo is None:
                pass
            else:

                tags.append(word)
                # print('contem tag')
                item['tags'] = ','.join(str(tag) for tag in tags)
                # print('tags adicionadas ' + word)

        if item['tags'] is not None:
            # print('vai limpar o html')
            try:
                item['noticia'] = tools.cleanHTML(html)
                # print('limpou noticia')
                print(self.name_crawl + 'NOVA ' + item['titulo'])
                # print('vai salvar no banco')
                if not self.databaseController.make_request('inserir', item):
                    print(self.name_crawl + 'Erro ao salvar no banco de dados')

                # print('chegou ao final do extract html')
            except Exception as err:
                # print('erro na noticia: ' + item['link'])
                # print(err)
                pass
        else:
            # print('Noticia não possui tags ' + item['link'])
            pass

        yield item
Example #14
class QuotesSpider(scrapy.Spider):
    name = 'get_g1'
    start_urls = [
        'https://scrapy.org',
    ]

    def __init__(self):
        self.databaseController = ApiRequest()
        self.arrayNoticias = []
        self.indexData = 0
        self.name_crawl = 'LOG G1: '
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--ignore-certificate-errors')
        #self.driver = webdriver.Chrome(
        #ChromeDriverManager().install(), options=chrome_options)
        #self.driver2 = webdriver.Chrome(
        #ChromeDriverManager().install(), options=chrome_options)

        self.driver = webdriver.Remote("http://*****:*****@class='fundo-cor-produto pagination__load-more']"
            ).get_attribute('href')
            # print('pegou link proxima pagina')

            response = self.driver.find_element_by_class_name(
                "results__list").find_elements_by_tag_name("li")

            for item in get_data(response):
                data_request = {'titulo': item['titulo']}
                if not self.databaseController.make_request(
                        'check_exist_database', data_request):
                    # print('\n|-- NOVA ' + item['titulo'])
                    self.extract_html(item)
                else:
                    # print('noticia já existe ')
                    pass

                    # print('\n\n PROXIMA PAGINA')
            print(self.name_crawl + 'next page ' + nextPage)
            self.parse(nextPage, dataIndex)

        except Exception as ex:
            print(ex)
            if dataIndex + 1 < len(self.dates):
                # print('proxima data ')
                print('\n\nDATA ' + self.dates[dataIndex + 1]['dataInicial'] +
                      ' ' + self.dates[dataIndex + 1]['dataFinal'])
                self.parse('https://g1.globo.com/busca/?q=coronavirus&page=1',
                           dataIndex + 1)

            else:
                print(self.name_crawl + 'finalizou ')

                #self.parse(
                #    'https://g1.globo.com/busca/?q=coronavirus&page=1', 0)

    def extract_html(self, item):
        tools = CustonTools()
        self.driver2.get(item['link'])
        # print(item['link'])
        try:
            time = self.driver2.find_element_by_tag_name('time').text
            item['dia'] = tools.format_dia(time.split(' ')[0])
            self.limit_time = tools.compare_dates(item['dia'])

            if self.limit_time:

                print(self.name_crawl + 'Noticia passou da data limite: ' +
                      item['link'])
                return

        except Exception as ex:
            item['dia'] = 'error_time'
            print(ex)
            pass

        wordList = tools.get_key_word_list()
        tags = []
        # print(item['dia'])

        try:
            html = self.driver2.find_element_by_tag_name(
                'article').get_attribute('innerHTML')
        except Exception as ex:
            print(ex)
            return

        # print('pegou html')

        for word in wordList:

            isWordInHtml = tools.check_word_in_html(word)(html)

            if isWordInHtml is None:
                pass
            else:

                tags.append(word)
                # print('contem tag')
                item['tags'] = ','.join(str(tag) for tag in tags)
                # print('tags adicionadas ' + word)

        if item['tags'] is not None:
            # print('vai limpar o html')
            try:
                item['noticia'] = tools.cleanHTML(html)
                # print(item)
                # print('limpou noticia')
                # print('vai salvar no banco')
                print(self.name_crawl + 'NOVA ' + item['titulo'])

                if not self.databaseController.make_request('inserir', item):
                    print(self.name_crawl + 'Erro ao salvar no banco de dados')

                # print('chegou ao final do extract html')
            except:
                print(self.name_crawl + 'erro na noticia: ' + item['link'])
                pass
        else:
            print(self.name_crawl + 'Noticia não possui tags ' + item['link'])

        return item
Example #15
class QuotesSpider(scrapy.Spider):
    name = "get_oglobo"
    start_urls = [
        'https://oglobo.globo.com/busca/?q=coronavirus',
    ]

    def parse(self, response):

        try:
            nextPage = response.xpath('//ul[@class="unstyled unbordered"]'
                                      ).css('li')[6].xpath('a/@href').get()

            for item in self.get_data(response):

                if not self.check_exist_database(item['link']):
                    #print(item['titulo'])
                    yield scrapy.Request(item['link'],
                                         meta={"item": item},
                                         callback=self.extract_html)

            #print(nextPage)

            yield scrapy.Request('https://oglobo.globo.com/busca/' + nextPage,
                                 callback=self.parse)
        except:
            print('finalizou')

    def get_data(self, response):

        #  x = response.xpath('//ul[@class="resultado_da_busca unstyled"]').xpath('li')[0]
        #  data = x.css('p')[0].css('span::text')[1].get()
        #  link x.css('a.cor-produto').xpath('@href').get()
        #  title x.css('a.cor-produto').xpath('@title').get()
        #  descricao x.css('p')[1].xpath('string(.)').get()

        for item in response.xpath(
                '//ul[@class="resultado_da_busca unstyled"]').xpath('li'):
            link = item.css('a.cor-produto').xpath('@href').get()
            link_uncoded = urllib.parse.parse_qs(link[2:])['u'][0]

            yield {
                'fonte':
                'https://oglobo.globo.com/',
                'titulo':
                str(item.css('a.cor-produto').xpath('@title').get()).replace(
                    '‘', '').replace('’', ''),
                'descricao':
                item.css('p')[1].xpath('string(.)').get(),
                'dia':
                item.css('p')[0].css('span::text')[1].get(),
                'link':
                link_uncoded,
                'noticia':
                None,
                'tags':
                None
            }

    def extract_html(self, response):
        tools = CustonTools()
        tags = []
        item = response.meta["item"]

        formatedData = tools.format_data_oglobo(item['dia'])

        item['dia'] = formatedData

        wordList = tools.get_key_word_list()

        html = response.xpath(
            '//div[@class="article__content-container protected-content"]'
        ).get()
        if not html:
            print('pegou main')
            html = response.css('main').get()

        for word in wordList:

            isWordInHtml = tools.check_word_in_html(word)(html)

            if isWordInHtml is None:
                pass
            else:

                tags.append(word)
                #print('contem tag')
                item['tags'] = ','.join(str(tag) for tag in tags)
                #print('tags adicionadas ' + word)

        if item['tags'] is not None:
            #print('vai limpar o html')
            try:
                html = tools.clean_html_class_oglobo(html)
                item['noticia'] = tools.cleanHTML(html)

                #self.save_to_database(item)
                #print('armazenou noticia ' + item['titulo'])
                #print('chegou ao final do extract html')
            except:

                print('erro na noticia: ' + item['link'])
                pass

        else:
            print('Noticia não possui tags ' + item['link'])

        yield item

    def check_exist_database(self, titulo):
        self.databaseController = ApiRequest()
        return self.databaseController.check_exist_item(titulo)

    def save_to_database(self, item):
        self.databaseController = ApiRequest()
        self.databaseController.insert_to_database_novas(item)
Example #16
class QuotesSpider(scrapy.Spider):
    name = "get_estadao"
    start_urls = [
        'https://busca.estadao.com.br/modulos/busca-resultado?modulo=busca-resultado&config[busca][page]=1&config['
        'busca][params]=tipo_conteudo%3DNot%25C3%25ADcias%26quando%3D%26q%3Dcoronavirus&ajax=1',
    ]

    def __init__(self):
        self.databaseController = ApiRequest()

        self.limit_time = False
        self.name_crawl = 'LOG ESTADAO: '

    def parse(self, response):
        if self.limit_time:
            return print(self.name_crawl +
                         'Finalizou busca dentro do periodo de 1 mes')
        try:

            nextPage = json.loads(
                response.xpath(
                    '//a[@class="go more-list-news btn-mais fn brd-e"]/@data-config'
                ).get())['busca']['page']

            nextPage: int = int(nextPage) + 1
            linkNextPage = 'https://busca.estadao.com.br/modulos/busca-resultado?modulo=busca-resultado&config[' \
                           'busca][page]={0}&config[busca][' \
                           'params]=tipo_conteudo%3DNot%25C3%25ADcias%26quando%3D%26q%3Dcoronavirus&ajax=1'.format(
                nextPage)
            # print(nextPage)

            for item in get_data(response):
                data_request = {'link': item['link']}

                if not self.databaseController.make_request(
                        'check_exist_database', data_request):
                    # print(item['titulo'])
                    yield scrapy.Request(item['link'],
                                         meta={"item": item},
                                         callback=self.extract_html)

            yield scrapy.Request(linkNextPage, callback=self.parse)
        except Exception as err:
            print(err)

    def extract_html(self, response):
        tools = CustonTools()
        tags = []
        item = response.meta["item"]

        if item['dia'] is None:
            dia = response.xpath(
                '//div[@class="n--noticia__state-desc"]/p/text()').get().split(
                    '|')[0][1:]

            item['dia'] = dia

        formattedData = tools.format_data_estadao(item['dia'])
        self.limit_time = tools.compare_dates(formattedData)

        item['dia'] = formattedData

        wordList = tools.get_key_word_list()

        html = response.xpath(
            '//div[@class="n--noticia__content content"]').get()

        for word in wordList:

            isWordInHtml = tools.check_word_in_html(word)(html)
            isWordInTitulo = tools.check_word_in_html(word)(item['titulo'])

            if isWordInHtml is None and isWordInTitulo is None:
                pass
            else:

                tags.append(word)
                # print('contem tag')
                item['tags'] = ','.join(str(tag) for tag in tags)
                # print('tags adicionadas ' + word)

        if item['tags'] is not None:
            # print('vai limpar o html')
            try:
                item['noticia'] = tools.cleanHTML(html)
                # print('limpou noticia')
                print(self.name_crawl + 'NOVA ' + item['titulo'])
                # print('vai salvar no banco')
                if not self.databaseController.make_request('inserir', item):
                    print(self.name_crawl + 'Erro ao salvar no banco de dados')

                # print('chegou ao final do extract html')
            except Exception as err:
                print('erro na noticia: ' + item['link'])
                print(err)
                pass
        else:
            print('Noticia não possui tags ' + item['link'])
        yield item
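
Each extract_html method funnels the scraped markup through tools.cleanHTML(html) before saving it. That helper is not shown either; a minimal sketch of an HTML-to-plain-text step, assuming BeautifulSoup is available (the real CustonTools.cleanHTML may filter more aggressively):

from bs4 import BeautifulSoup

def cleanHTML(html):
    # Strip scripts/styles and collapse whitespace; a rough stand-in for
    # the project's CustonTools.cleanHTML, whose exact rules are not shown.
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup(['script', 'style']):
        tag.decompose()
    return ' '.join(soup.get_text(separator=' ').split())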