# Scrapy article-page callback (ApiRequest and CustonTools are project-local helpers).
def parse_page(self, response):
    databaseController = ApiRequest()
    item = response.meta["item"]
    data_request = {'link': item['link']}
    is_inDatabase = databaseController.make_request('check_exist_database', data_request)
    if is_inDatabase:
        return
    tools = CustonTools()
    wordList = tools.get_key_word_list()
    tags = []
    formattedData = tools.format_dia(item['dia'])
    item['dia'] = formattedData
    self.limit_time = tools.compare_dates(formattedData)
    if self.limit_time:
        print(self.name_crawl + 'Finished search within the 1-month window')
        return
    html = response.css('div.item-page').get()
    for word in wordList:
        if tools.check_word_in_html(word)(html) is not None:
            tags.append(word)
    item['noticia'] = tools.cleanHTML(html)
    item['tags'] = ','.join(str(tag) for tag in tags)
    print(self.name_crawl + 'NEW ' + item['titulo'])
    if not databaseController.make_request('inserir', item):
        print(self.name_crawl + 'Failed to save to the database')
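# All spiders call tools.check_word_in_html(word)(html): the helper is curried,
# taking a keyword and returning a matcher that yields a match object or None.
# A minimal sketch, assuming it wraps a case-insensitive re.search (hypothetical
# implementation; the real CustonTools lives elsewhere in the project):
import re

def check_word_in_html(word):
    def matcher(html):
        # tolerate pages where no HTML block was found
        if html is None:
            return None
        return re.search(re.escape(word), html, re.IGNORECASE)
    return matcher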
class QuotesSpider(scrapy.Spider): name = "get_correiobraziliense" start_urls = [ 'https://www.correiobraziliense.com.br/busca/coronavirus?json=63c055b-c8a7-4010-92c6-01803d6e752e&offset=0', ] def __init__(self): self.databaseController = ApiRequest() self.limit_time = False def parse(self, response): if self.limit_time: return print('Finalizou busca dentro do periodo de 1 mes') request = None try: request = requests.get(url=response.url) except requests.exceptions.RequestException as e: print(e) pass # extracting data in json format data = request.json() try: nextPage = data['next'] for item in get_data(data['news']): data_request = {'link': item['link']} if not self.databaseController.make_request( 'check_exist_database', data_request): print('NOVA ' + item['titulo']) yield scrapy.Request(item['link'], meta={"item": item}, callback=self.extract_html) print(nextPage) yield scrapy.Request(nextPage, callback=self.parse) except: print('finalizou') def extract_html(self, response): tools = CustonTools() tags = [] item = response.meta["item"] formatted_data = tools.format_dia(item['dia']) self.limit_time = tools.compare_dates(formatted_data) item['dia'] = formatted_data word_list = tools.get_key_word_list() try: html = response.xpath( '//div[@class="txt-serif js-article-box article-box article-box-capitalize mt-15"]' ).get() except: print('falhou ao obter html' + item['link']) for word in word_list: isWordInHtml = tools.check_word_in_html(word)(html) isWordInTitulo = tools.check_word_in_html(word)(item['titulo']) if isWordInHtml is None and isWordInTitulo is None: pass else: tags.append(word) # print('contem tag') item['tags'] = ','.join(str(tag) for tag in tags) # print('tags adicionadas ' + word) if not item['tags'] is None: # print('vai limpar o html') try: item['noticia'] = tools.cleanHTML(html) # print('limpou noticia') # print('vai salvar no banco') if not self.databaseController.make_request('inserir', item): print('Erro ao salvar no banco de dados') # print('chegou ao final do extract html') except Exception as ex: print('erro na noticia: ' + item['link']) print(ex) pass else: # print('Noticia não possui tags ' + item['link']) pass return item
class QuotesSpider(scrapy.Spider): name = "get_oglobo2" start_urls = [ 'https://scrapy.org', ] def __init__(self, **kwargs): self.limit_time = False self.name_crawl = 'LOG OGLOBO:' self.databaseController = ApiRequest() self.link = 'https://oglobo.globo.com/api/v1/vermais/24219742/conteudo.json?pagina=0&versao=v1' \ '&tiposDeConteudo=materia,coluna,infografico,listaFatos,materiaEmCapitulos,linkExterno ' def parse(self, response): is_enabled = True while is_enabled and not self.limit_time: r = requests.get(self.link) # extracting data in json format data = r.json()[0] if len(data['conteudos']) > 0: parsed = urlparse.urlparse(data['paginacao']['urlProxima']) self.link = 'https://oglobo.globo.com/api/v1/vermais/24219742/conteudo.json?' + parsed.query for item in get_data(data['conteudos']): data_request = {'link': item['link']} if not self.databaseController.make_request( 'check_exist_database', data_request): yield scrapy.Request(item['link'], meta={"item": item}, callback=self.extract_html) print(self.name_crawl + 'nextPage ' + self.link) else: is_enabled = False print(self.name_crawl + 'Finalizou busca dentro do periodo de 1 mes') def extract_html(self, response): tools = CustonTools() tags = [] item = response.meta["item"] dia = response.xpath('//div[@class="article__date"]/text()').get( ).split(' ')[0].replace(' ', '').replace('\n', ' ').replace('\r', '') item['descricao'] = response.xpath( '//div[@class="article__subtitle"]/text()').get().replace( '\n', ' ').replace('\r', '') formattedData = tools.format_dia(dia) self.limit_time = tools.compare_dates(formattedData) item['dia'] = formattedData wordList = tools.get_key_word_list() html = response.xpath( '//div[@class="article__content-container protected-content"]' ).get() if not html: print('pegou main') html = response.css('main').get() for word in wordList: isWordInHtml = tools.check_word_in_html(word)(html) if isWordInHtml is None: pass else: tags.append(word) # print('contem tag') item['tags'] = ','.join(str(tag) for tag in tags) # print('tags adicionadas ' + word) if not item['tags'] is None: # print('vai limpar o html') try: html = tools.clean_html_class_oglobo(html) item['noticia'] = tools.cleanHTML(html) print(self.name_crawl + 'NOVA ' + item['titulo']) if not self.databaseController.make_request('inserir', item): print(self.name_crawl + 'Erro ao salvar no banco de dados') # print('armazenou noticia ' + item['titulo']) # print('chegou ao final do extract html') except Exception as ex: print('erro na noticia: ' + item['link']) print(ex) pass else: # print(self.name_crawl + 'Noticia não possui tags ' + item['link']) pass yield item
class QuotesSpider(scrapy.Spider): name = "get_gauchazh" start_urls = [ 'https://scrapy.org', ] def __init__(self): self.arrayNoticias = [] self.indexData = 0 chrome_options = Options() chrome_options.add_argument("user-data-dir=selenium") chrome_options.add_argument('--headless') chrome_options.add_argument('--no-sandbox') chrome_options.add_argument('--ignore-certificate-errors') self.driver = webdriver.Chrome( ChromeDriverManager().install(),options=chrome_options) self.driver2 = webdriver.Chrome( ChromeDriverManager().install(), options=chrome_options) def parse(self, link): self.driver.get( 'https://gauchazh.clicrbs.com.br/search/?q=coronavirus') clickButtonNexPage = True while(clickButtonNexPage): try: nextPage_button = self.driver.find_element_by_xpath( "//button[@class='btn-show-more']") self.driver.execute_script("arguments[0].click();", nextPage_button) time.sleep(2) except: clickButtonNexPage = False print('carregou todas as noticias') response = self.driver.find_element_by_xpath( "//div[@class='search-results']").find_elements_by_tag_name('ul')[1].find_elements_by_tag_name('div') for item in self.get_data(response): if self.check_exist_database(item['link']) == False: print('NOVA ' + item['titulo']) self.extract_html(item) pass def get_data(self, response): for li in response: if li.get_attribute('id'): yield { 'fonte': 'https://gauchazh.clicrbs.com.br', 'titulo': li.find_element_by_tag_name('a').find_element_by_class_name('m-headline').text, 'descricao': li.find_element_by_class_name('support-text').text, 'dia': li.find_element_by_tag_name('time').get_attribute('textContent').split()[0], 'link': li.find_element_by_tag_name('a').get_attribute('href'), 'noticia': None, 'tags': None } def extract_html(self, item): self.driver2.get(item['link']) tools = CustonTools() wordList = tools.get_key_word_list() tags = [] formatedData = tools.format_dia(item['dia']) item['dia'] = formatedData try: print() print() print(item['link']) #html = self.driver2.find_element_by_xpath("//div[@class='article-content sa_incontent']").get_attribute('innerHTML') time.sleep(2) html = self.driver2.find_element_by_xpath("//div[@class='article-content sa_incontent']").get_attribute('innerHTML') except: print('falhou') return wordList = tools.get_key_word_list() tags = [] for word in wordList: isWordInHtml = tools.check_word_in_html(word)(html) if isWordInHtml == None: pass else: tags.append(word) #print('contem tag') item['tags'] = ','.join(str(tag) for tag in tags) #print('tags adicionadas ' + word) if not item['tags'] == None: #print('vai limpar o html') try: item['noticia'] = tools.cleanHTML(html) #print(item) #print('limpou noticia') #print('vai salvar no banco') self.save_to_database_novas(item) #print('chegou ao final do extract html') except: #print('erro na noticia: ' + item['link']) pass else: print('Noticia não possui tags ' + item['link']) return item def check_exist_database(self, titulo): self.databaseController = ApiRequest() return self.databaseController.check_exist_database(titulo) def save_to_database(self, item): self.databaseController = ApiRequest() self.databaseController.insert_to_database(item)
class QuotesSpider(scrapy.Spider): name = "get_folhasp" start_urls = [ 'http://search.folha.uol.com.br/search?q=coronavirus&site=todos', # 'http://search.folha.uol.com.br/search?q=coronavirus&site=todos&sr=301' ] def __init__(self): self.databaseController = ApiRequest() self.limit_time = False self.name_crawl = 'LOG FOLHA SP: ' def parse(self, response): if self.limit_time: return print(self.name_crawl + 'Finalizou busca dentro do periodo de 1 mes') try: try: nextPage = response.xpath( '//ul[@class="c-pagination__list"]').css( 'li.c-pagination__arrow')[1].xpath('a/@href').get() except: nextPage = response.xpath( '//ul[@class="c-pagination__list"]').css( 'li.c-pagination__arrow')[0].xpath('a/@href').get() for item in get_data(response): try: data_request = {'link': item['link']} if not self.databaseController.make_request( 'check_exist_database', data_request): yield scrapy.Request(item['link'], meta={"item": item}, callback=self.extract_html) except: pass print(nextPage) yield scrapy.Request(nextPage, callback=self.parse) except: print('finalizou') def extract_html(self, response): tools = CustonTools() tags = [] item = response.meta["item"] formattedData = tools.format_data_folhasp(item['dia']) self.limit_time = tools.compare_dates(formattedData) item['dia'] = formattedData wordList = tools.get_key_word_list() try: html = response.xpath('//div[@class="c-news__body"]').get() except: html = response.xpath('//div[@class="c-news__content"]').get() for word in wordList: isWordInHtml = tools.check_word_in_html(word)(html) isWordInTitulo = tools.check_word_in_html(word)(item['titulo']) if isWordInHtml is None and isWordInTitulo is None: pass else: tags.append(word) # print('contem tag') item['tags'] = ','.join(str(tag) for tag in tags) # print('tags adicionadas ' + word) if len(item['tags']) > 0: # print('vai limpar o html') try: html = tools.clean_html_class_folhasp(html) item['noticia'] = tools.cleanHTML(html) # return print(item['noticia']) # print('limpou noticia') print(self.name_crawl + 'NOVA ' + item['titulo']) # print(item['noticia']) # print('salvando') # print(item) if not self.databaseController.make_request('inserir', item): print('Erro ao salvar no banco de dados') # print('chegou ao final do extract html') except Exception as ex: # print('erro na noticia: ' + item['link']) # print(ex) pass else: print(self.name_crawl + 'Noticia não possui tags ' + item['link']) yield item
class QuotesSpider(scrapy.Spider): name = "get_correio24horas" start_urls = [ 'https://www.correio24horas.com.br/resultado-de-pesquisa/pagina/1/busca/coronavirus/', ] def __init__(self): self.databaseController = ApiRequest() self.limit_time = False self.name_crawl = 'LOG CORREIO 24 HORAS:' def parse(self, response): if self.limit_time: return print(self.name_crawl + 'Finalizou busca dentro do periodo de 1 mes') nextPage = response.xpath( '//div[@class="pagination-responsivo--next"]/a/@href').get() print(self.name_crawl + 'Nextpage: ' + nextPage) for item in get_data(response): data_request = {'link': item['link']} if not self.databaseController.make_request( 'check_exist_database', data_request): yield scrapy.Request(item['link'], meta={"item": item}, callback=self.extract_html) yield scrapy.Request(nextPage, callback=self.parse) def extract_html(self, response): tools = CustonTools() tags = [] item = response.meta["item"] formatted_date = tools.format_dia(item['dia']) item['dia'] = formatted_date self.limit_time = tools.compare_dates(formatted_date) wordList = tools.get_key_word_list() item['descricao'] = response.xpath( '//div[@class="noticias-single__description visible-lg"]/text()' ).get() html = response.xpath('//div[@class="noticias-single__content"]').get() for word in wordList: isWordInHtml = tools.check_word_in_html(word)(html) isWordInTitulo = tools.check_word_in_html(word)(item['titulo']) if isWordInHtml is None and isWordInTitulo is None: pass else: tags.append(word) # print('contem tag') item['tags'] = ','.join(str(tag) for tag in tags) # print('tags adicionadas ' + word) if not item['tags'] is None: # print('vai limpar o html') try: item['noticia'] = tools.cleanHTML(html) # print('limpou noticia') print(self.name_crawl + 'NOVA ' + item['titulo']) # print('vai salvar no banco') if not self.databaseController.make_request('inserir', item): print(self.name_crawl + 'Erro ao salvar no banco de dados') # print('chegou ao final do extract html') except Exception as err: # print('erro na noticia: ' + item['link']) # print(err) pass else: # print('Noticia não possui tags ' + item['link']) pass yield item
import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
# ApiRequest, CustonTools and get_data are project-local imports;
# ano, data_inicial and data_final are module-level crawl settings.


class QuotesSpider(scrapy.Spider):
    name = 'get_g1'
    start_urls = [
        'https://scrapy.org',
    ]

    def __init__(self):
        self.databaseController = ApiRequest()
        self.arrayNoticias = []
        self.indexData = 0
        self.name_crawl = 'LOG G1: '
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--ignore-certificate-errors')
        # self.driver = webdriver.Chrome(
        #     ChromeDriverManager().install(), options=chrome_options)
        # self.driver2 = webdriver.Chrome(
        #     ChromeDriverManager().install(), options=chrome_options)
        self.driver = webdriver.Remote("http://localhost:4444/wd/hub", options=chrome_options)
        self.driver2 = webdriver.Remote("http://localhost:4444/wd/hub", options=chrome_options)
        self.dates = [
            # {'dataInicial': str(ano) + '-01-01', 'dataFinal': str(ano) + '-01-31'},
            # {'dataInicial': str(ano) + '-02-01', 'dataFinal': str(ano) + '-02-29'},
            # {'dataInicial': str(ano) + '-03-01', 'dataFinal': str(ano) + '-03-31'},
            # {'dataInicial': str(ano) + '-04-01', 'dataFinal': str(ano) + '-04-30'},
            # {'dataInicial': str(ano) + '-05-01', 'dataFinal': str(ano) + '-05-31'},
            # {'dataInicial': str(ano) + '-06-01', 'dataFinal': str(ano) + '-06-30'},
            # {'dataInicial': str(ano) + '-07-01', 'dataFinal': str(ano) + '-07-31'},
            # {'dataInicial': str(ano) + '-08-01', 'dataFinal': str(ano) + '-08-31'},
            # {'dataInicial': str(ano) + '-09-01', 'dataFinal': str(ano) + '-09-30'},
            # {'dataInicial': str(ano) + '-10-29', 'dataFinal': str(ano) + '-10-31'},
            # {'dataInicial': '2020-12-17', 'dataFinal': str(ano) + '-01-04'},
            {
                'dataInicial': str(data_inicial),
                'dataFinal': str(data_final)
            }
        ]
        start_at = 0
        print(self.name_crawl + 'DATE ' + self.dates[start_at]['dataInicial']
              + ' ' + self.dates[start_at]['dataFinal'])
        self.parse('https://g1.globo.com/busca/?q=coronavirus&page=1', start_at)

    def parse(self, link, dataIndex):
        try:
            self.driver.get(link)
            nextPage = self.driver.find_element_by_xpath(
                "//a[@class='fundo-cor-produto pagination__load-more']").get_attribute('href')
            response = self.driver.find_element_by_class_name(
                "results__list").find_elements_by_tag_name("li")
            for item in get_data(response):
                data_request = {'titulo': item['titulo']}
                if not self.databaseController.make_request('check_exist_database', data_request):
                    self.extract_html(item)
            print(self.name_crawl + 'next page ' + nextPage)
            self.parse(nextPage, dataIndex)
        except Exception as ex:
            print(ex)
            if dataIndex + 1 < len(self.dates):
                # advance to the next date range
                print('\n\nDATE ' + self.dates[dataIndex + 1]['dataInicial']
                      + ' ' + self.dates[dataIndex + 1]['dataFinal'])
                self.parse('https://g1.globo.com/busca/?q=coronavirus&page=1', dataIndex + 1)
            else:
                print(self.name_crawl + 'finished')

    def extract_html(self, item):
        tools = CustonTools()
        self.driver2.get(item['link'])
        try:
            time_text = self.driver2.find_element_by_tag_name('time').text
            item['dia'] = tools.format_dia(time_text.split(' ')[0])
            self.limit_time = tools.compare_dates(item['dia'])
            if self.limit_time:
                print(self.name_crawl + 'Article is past the cut-off date: ' + item['link'])
                return
        except Exception as ex:
            item['dia'] = 'error_time'
            print(ex)
        wordList = tools.get_key_word_list()
        tags = []
        try:
            html = self.driver2.find_element_by_tag_name('article').get_attribute('innerHTML')
        except Exception as ex:
            print(ex)
            return
        for word in wordList:
            if tools.check_word_in_html(word)(html) is not None:
                tags.append(word)
        item['tags'] = ','.join(str(tag) for tag in tags)
        if item['tags']:  # join never returns None; test for a non-empty string
            try:
                item['noticia'] = tools.cleanHTML(html)
                print(self.name_crawl + 'NEW ' + item['titulo'])
                if not self.databaseController.make_request('inserir', item):
                    print(self.name_crawl + 'Failed to save to the database')
            except Exception:
                print(self.name_crawl + 'error in article: ' + item['link'])
        else:
            print(self.name_crawl + 'Article has no tags ' + item['link'])
        return item
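# Both G1 drivers point at http://localhost:4444/wd/hub, i.e. they expect a Selenium
# server (for instance a selenium/standalone-chrome container) to be listening before
# the spider starts. A quick connectivity check (a sketch; /status is the standard
# WebDriver wire-protocol readiness route):
import requests

def selenium_is_ready(hub='http://localhost:4444/wd/hub'):
    try:
        return requests.get(hub + '/status', timeout=5).json()['value']['ready']
    except Exception:
        return False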
class QuotesSpider(scrapy.Spider): name = "get_oglobo" start_urls = [ 'https://oglobo.globo.com/busca/?q=coronavirus', ] def parse(self, response): try: nextPage = response.xpath('//ul[@class="unstyled unbordered"]' ).css('li')[6].xpath('a/@href').get() for item in self.get_data(response): if self.check_exist_database(item['link']) == False: #print(item['titulo']) yield scrapy.Request(item['link'], meta={"item": item}, callback=self.extract_html) #print(nextPage) yield scrapy.Request('https://oglobo.globo.com/busca/' + nextPage, callback=self.parse) except: print('finalizou') def get_data(self, response): # x = response.xpath('//ul[@class="resultado_da_busca unstyled"]').xpath('li')[0] # data = x.css('p')[0].css('span::text')[1].get() # link x.css('a.cor-produto').xpath('@href').get() # title x.css('a.cor-produto').xpath('@title').get() # descricao x.css('p')[1].xpath('string(.)').get() for item in response.xpath( '//ul[@class="resultado_da_busca unstyled"]').xpath('li'): link = item.css('a.cor-produto').xpath('@href').get() link_uncoded = urllib.parse.parse_qs(link[2:])['u'][0] yield { 'fonte': 'https://oglobo.globo.com/', 'titulo': str(item.css('a.cor-produto').xpath('@title').get()).replace( '‘', '').replace('’', ''), 'descricao': item.css('p')[1].xpath('string(.)').get(), 'dia': item.css('p')[0].css('span::text')[1].get(), 'link': link_uncoded, 'noticia': None, 'tags': None } def extract_html(self, response): tools = CustonTools() tags = [] item = response.meta["item"] formatedData = tools.format_data_oglobo(item['dia']) item['dia'] = formatedData wordList = tools.get_key_word_list() html = response.xpath( '//div[@class="article__content-container protected-content"]' ).get() if (not html): print('pegou main') html = response.css('main').get() for word in wordList: isWordInHtml = tools.check_word_in_html(word)(html) if isWordInHtml == None: pass else: tags.append(word) #print('contem tag') item['tags'] = ','.join(str(tag) for tag in tags) #print('tags adicionadas ' + word) if not item['tags'] == None: #print('vai limpar o html') try: html = tools.clean_html_class_oglobo(html) item['noticia'] = tools.cleanHTML(html) #self.save_to_database(item) #print('armazenou noticia ' + item['titulo']) #print('chegou ao final do extract html') except: print('erro na noticia: ' + item['link']) pass else: print('Noticia não possui tags ' + item['link']) yield item def check_exist_database(self, titulo): self.databaseController = ApiRequest() return self.databaseController.check_exist_item(titulo) def save_to_database(self, item): self.databaseController = ApiRequest() self.databaseController.insert_to_database_novas(item)
class QuotesSpider(scrapy.Spider): name = "get_estadao" start_urls = [ 'https://busca.estadao.com.br/modulos/busca-resultado?modulo=busca-resultado&config[busca][page]=1&config[' 'busca][params]=tipo_conteudo%3DNot%25C3%25ADcias%26quando%3D%26q%3Dcoronavirus&ajax=1', ] def __init__(self): self.databaseController = ApiRequest() self.limit_time = False self.name_crawl = 'LOG ESTADAO: ' def parse(self, response): if self.limit_time: return print(self.name_crawl + 'Finalizou busca dentro do periodo de 1 mes') try: nextPage = json.loads( response.xpath( '//a[@class="go more-list-news btn-mais fn brd-e"]/@data-config' ).get())['busca']['page'] nextPage: int = int(nextPage) + 1 linkNextPage = 'https://busca.estadao.com.br/modulos/busca-resultado?modulo=busca-resultado&config[' \ 'busca][page]={0}&config[busca][' \ 'params]=tipo_conteudo%3DNot%25C3%25ADcias%26quando%3D%26q%3Dcoronavirus&ajax=1'.format( nextPage) # print(nextPage) for item in get_data(response): data_request = {'link': item['link']} if not self.databaseController.make_request( 'check_exist_database', data_request): # print(item['titulo']) yield scrapy.Request(item['link'], meta={"item": item}, callback=self.extract_html) yield scrapy.Request(linkNextPage, callback=self.parse) except Exception as err: print(err) def extract_html(self, response): tools = CustonTools() tags = [] item = response.meta["item"] if item['dia'] is None: dia = response.xpath( '//div[@class="n--noticia__state-desc"]/p/text()').get().split( '|')[0][1:] item['dia'] = dia formattedData = tools.format_data_estadao(item['dia']) self.limit_time = tools.compare_dates(formattedData) item['dia'] = formattedData wordList = tools.get_key_word_list() html = response.xpath( '//div[@class="n--noticia__content content"]').get() for word in wordList: isWordInHtml = tools.check_word_in_html(word)(html) isWordInTitulo = tools.check_word_in_html(word)(item['titulo']) if isWordInHtml is None and isWordInTitulo is None: pass else: tags.append(word) # print('contem tag') item['tags'] = ','.join(str(tag) for tag in tags) # print('tags adicionadas ' + word) if not item['tags'] is None: # print('vai limpar o html') try: item['noticia'] = tools.cleanHTML(html) # print('limpou noticia') print(self.name_crawl + 'NOVA ' + item['titulo']) # print('vai salvar no banco') if not self.databaseController.make_request('inserir', item): print(self.name_crawl + 'Erro ao salvar no banco de dados') # print('chegou ao final do extract html') except Exception as err: print('erro na noticia: ' + item['link']) print(err) pass else: print('Noticia não possui tags ' + item['link']) yield item