def scrap(self, termos, data_inicio, data_fim):
    driver = chrome.inicializar_driver(self.download_dir)
    search_url = self.search_page_url + 'q=' + termos.replace('+', ' ')
    logger.info(search_url)
    driver.get(search_url)
    logger.info('Starting to scrape the results')
    try:
        next_page = True
        page_count = 0
        while next_page:
            # Selenium 4 removed switch_to_active_element(); the
            # switch_to.active_element property replaces it.
            driver.switch_to.active_element
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located(
                    (By.CLASS_NAME, 'topics-sec-block')))
            links = driver.find_elements(By.CLASS_NAME, 'topics-sec-item')
            for link in links:
                # The article date comes from the data-modifieddate
                # attribute (e.g. '2021-03-12T10:00:00'); keep only the
                # YYYY-MM-DD part.
                data = link.find_element(
                    By.CLASS_NAME,
                    'humanize-datetime').get_attribute('data-modifieddate')
                data = data[0:data.find('T')]
                href = link.find_elements(By.TAG_NAME,
                                          'a')[1].get_attribute('href')
                data_data = dt.strptime(data, '%Y-%m-%d')
                # Only scrape articles inside the requested date window.
                if dt.strptime(data_inicio, '%Y-%m-%d') <= data_data <= \
                        dt.strptime(data_fim, '%Y-%m-%d'):
                    self.treat_link(href, data)
            page_count += 1
            logger.info('Finished page {}. Moving to the next one...'.format(
                page_count))
            try:
                # Pagination is driven by an onclick handler; run it
                # directly instead of clicking the element.
                next_link = driver.find_element(
                    By.CLASS_NAME, 'search-result-pagination')
                next_link = next_link.find_element(By.CLASS_NAME, 'next-page')\
                    .find_element(By.TAG_NAME, 'a').get_attribute('onclick')
                logger.debug(next_link)
                driver.execute_script(next_link)
            except NoSuchElementException as ex:
                # No "next page" link: this was the last results page.
                next_page = False
                logger.error(ex)
            except ElementClickInterceptedException as ex:
                next_page = False
                logger.error(ex)
    finally:
        driver.close()
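# All of the scrapers in this section call chrome.inicializar_driver to
# obtain a configured WebDriver. The helper itself is not shown here; the
# sketch below is an assumption inferred from how it is called
# (download_dir plus optional headless/habilitar_javascript flags in some
# call sites), not the actual implementation.
def inicializar_driver(download_dir=None, headless=True,
                       habilitar_javascript=True):
    from selenium import webdriver
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument('--headless=new')
    prefs = {}
    if download_dir is not None:
        prefs['download.default_directory'] = download_dir
    if not habilitar_javascript:
        # 2 disables JavaScript in Chrome's content settings.
        prefs['profile.managed_default_content_settings.javascript'] = 2
    if prefs:
        options.add_experimental_option('prefs', prefs)
    return webdriver.Chrome(options=options)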
def scrap(self, termos, data_inicio, data_fim):
    driver = chrome.inicializar_driver(self.download_dir)
    search_url = (self.search_page_url + 'q=' + termos +
                  '&periodo=personalizado' +
                  '&sd=' + data_inicio + '&ed=' + data_fim +
                  '&site=sitefolha')
    logger.info(search_url)
    driver.get(search_url)
    logger.info('Starting to scrape the results')
    try:
        next_page = True
        page_count = 1
        while next_page:
            logger.info('=== Starting to scrape page {} ===='.format(
                page_count))
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'c-search')))
            lista_links = driver.find_element(By.TAG_NAME, 'ol')
            links = lista_links.find_elements(By.TAG_NAME, 'li')
            for link in links:
                a_tag = link.find_element(By.TAG_NAME, 'a')
                href = a_tag.get_attribute('href')
                logger.info('=== href: {}'.format(href))
                self.treat_link(href)
            try:
                # The pagination widget shows one arrow on the first page
                # (next only) and two afterwards (previous and next); the
                # "next" arrow is always the last one. The original code
                # set next_page = False even after clicking the arrow,
                # which stopped the loop after a single page.
                arrows = driver.find_elements(By.CLASS_NAME,
                                              'c-pagination__arrows')
                if len(arrows) == 1:
                    arrow_index = 0
                elif len(arrows) > 1:
                    arrow_index = 1
                else:
                    # No arrows at all: last page reached.
                    break
                arrows[arrow_index].click()
            except NoSuchElementException as ex:
                next_page = False
                logger.error(ex)
            page_count += 1
    except Exception as ex:
        logger.exception(ex)
    finally:
        driver.close()
def scrap(self, qparams):
    driver = chrome.inicializar_driver(self.download_dir)
    search_url = self.search_page_url
    for k, v in qparams.items():
        search_url = search_url + k + '=' + v + '&'
    print(search_url)
    driver.get(search_url)
    print('Loading all results on the search page...')

    # Keep clicking the "show more" button until it disappears, so that
    # every result is present in the DOM before scraping.
    show_more_xpath = '//*[@id="site-content"]/div/div[2]/div[2]/div/button'
    click_counter = 0
    while True:
        try:
            el = driver.find_element(By.XPATH, show_more_xpath)
        except NoSuchElementException:
            print('Finished loading the search page')
            break
        print(el.get_attribute('innerHTML'))
        el.click()
        click_counter += 1
        print('click_counter = {0}'.format(click_counter))
        time.sleep(1)

    print('Starting to scrape the results')
    links = driver.find_elements(By.CLASS_NAME, 'css-1l4w6pd')
    try:
        for link in links:
            # Headline markup (captured but currently unused).
            html_text = link.find_element(
                By.CLASS_NAME, 'css-2fgx4k').get_attribute('innerHTML')
            a_tag = link.find_element(By.TAG_NAME, 'a')
            href = a_tag.get_attribute('href')
            self.treat_link(href)
    finally:
        driver.close()
def treat_link(self, href, data):
    logger.info(href)
    driver = chrome.inicializar_driver(self.download_dir)
    driver.get(href)
    try:
        artigo_el = WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.ID, 'orb-modules')))
        logger.info(artigo_el)
        # The article date is passed in by the caller, already in
        # YYYY-MM-DD form.
        timestamp_attr = data
        logger.info(timestamp_attr)
        site_container = artigo_el.find_element(By.ID, 'page')
        container = site_container.find_element(
            By.CLASS_NAME, 'container').find_element(
                By.CLASS_NAME, 'column-clearfix')
        column = container.find_element(By.CLASS_NAME, 'column--primary')
        divs = column.find_elements(By.TAG_NAME, 'p')
        materia = ''
        for div in divs:
            # Strip <span> wrappers and keep only the paragraph markup.
            paragrafo = div.get_attribute('innerHTML')
            paragrafo = re.sub(r'</?span>', '', paragrafo)
            materia = materia + paragrafo
        logger.debug(materia)
        # Build a filesystem-safe filename from the URL and the date.
        # (The original concatenated the sanitized href twice.)
        href_sanitizado = re.sub(r'[/.:]', '-', href)
        filename = href_sanitizado + re.sub(r':', '-', timestamp_attr)
        filename_path = os.path.join(self.download_dir, filename)
        logger.info('***FILENAME: {}***'.format(filename))
        with open(filename_path, 'w') as f:
            f.write(href)
            f.write(materia)
    except TimeoutException as ex:
        logger.debug('TimeoutException')
        logger.debug(ex)
    except NoSuchElementException as ex:
        logger.debug(ex)
    finally:
        driver.close()
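# The treat_link methods above and below repeat the same URL-to-filename
# sanitization inline. A small helper like the hypothetical sketch below
# could replace those re.sub chains; it is not part of the original code.
def nome_arquivo_seguro(href, timestamp_attr):
    """Derive a filesystem-safe filename from a URL and a timestamp."""
    import re
    href_sanitizado = re.sub(r'[/.:]', '-', href)
    timestamp_sanitizado = re.sub(r':', '-', timestamp_attr)
    return href_sanitizado + timestamp_sanitizado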
def scrap(qparams):
    driver = chrome.inicializar_driver()
    search_url = _search_page_url
    for k, v in qparams.items():
        search_url = search_url + k + '=' + v + '&'
    # The original navigated to the literal string '_search_url{0}'
    # with an empty format(); use the URL that was just built.
    driver.get(search_url)
    # WebDriver itself has no get_attribute(); locate <body> first.
    html = driver.find_element(By.TAG_NAME, 'body').get_attribute('innerHTML')
    print(html)
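# A minimal usage sketch for the qparams-based scrap functions. The keys
# and values below are illustrative, not taken from the original code:
#
#     scrap({'q': 'eleicoes', 'sort': 'newest'})
#
# builds a URL of the form <search_page_url>q=eleicoes&sort=newest&
# before navigating to it.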
def scrap(self, termos, data_inicio, data_fim, max_paginas):
    # max_paginas is accepted for interface compatibility but unused here.
    driver = chrome.inicializar_driver(self.download_dir)
    search_url = self.search_page_url + 'q=' + termos + '&filter=news'
    logger.info(search_url)
    driver.get(search_url)
    logger.info('Starting to scrape the results')
    try:
        next_page = True
        while next_page:
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located(
                    (By.CLASS_NAME, 'search-results')))
            results = driver.find_elements(By.CLASS_NAME, 'search-results')
            for result in results:
                links = result.find_elements(By.TAG_NAME, 'li')
                logger.info('links {0}'.format(links))
                for link in links:
                    # The <time> element carries an ISO timestamp; keep
                    # only the YYYY-MM-DD part.
                    data = link.find_element(
                        By.TAG_NAME, 'time').get_attribute('datetime')
                    data = data[0:data.find('T')]
                    logger.info(data)
                    href = link.find_elements(
                        By.TAG_NAME, 'a')[0].get_attribute('href')
                    data_data = dt.strptime(data, '%Y-%m-%d')
                    if dt.strptime(data_inicio, '%Y-%m-%d') <= data_data <= \
                            dt.strptime(data_fim, '%Y-%m-%d'):
                        self.treat_link(href, data)
                    else:
                        logger.debug('date out of range {0}'.format(data))
            # No pagination is handled for this results page; the original
            # loop never cleared next_page and re-scraped the same page
            # forever, so stop after a single pass.
            next_page = False
    finally:
        driver.close()
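# The date-window check is duplicated across the scrap methods. A
# hypothetical helper (not in the original code) could centralize it:
def dentro_do_intervalo(data_str, data_inicio, data_fim, fmt='%Y-%m-%d'):
    """Return True when data_str falls inside [data_inicio, data_fim]."""
    from datetime import datetime as dt
    data = dt.strptime(data_str, fmt)
    return (dt.strptime(data_inicio, '%Y-%m-%d') <= data
            <= dt.strptime(data_fim, '%Y-%m-%d'))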
def treat_link(self, href, data):
    logger.info(href)
    driver = chrome.inicializar_driver(self.download_dir)
    driver.get(href)
    try:
        time_el = WebDriverWait(driver, 30).until(
            EC.presence_of_element_located(
                (By.CLASS_NAME, 'timeagofunction')))
        logger.info(time_el)
        # The caller already extracted the article date.
        timestamp_attr = data
        logger.info(timestamp_attr)
        artigo_el = WebDriverWait(driver, 30).until(
            EC.presence_of_element_located(
                (By.CLASS_NAME, 'article-p-wrapper')))
        divs = artigo_el.find_elements(By.TAG_NAME, 'p')
        materia = ''
        for div in divs:
            # Strip <span> wrappers and keep only the paragraph markup.
            paragrafo = div.get_attribute('innerHTML')
            paragrafo = re.sub(r'</?span>', '', paragrafo)
            materia = materia + paragrafo
        # Build a filesystem-safe filename from the URL and the date.
        # (The original concatenated the sanitized href twice.)
        href_sanitizado = re.sub(r'[/.:]', '-', href)
        filename = href_sanitizado + re.sub(r':', '-', timestamp_attr)
        filename_path = os.path.join(self.download_dir, filename)
        logger.info('***FILENAME: {}***'.format(filename))
        with open(filename_path, 'w') as f:
            f.write(href)
            f.write(materia)
    except TimeoutException as ex:
        logger.debug('TimeoutException')
        logger.debug(ex)
    finally:
        driver.close()
def treat_link(self, href):
    logger.info(href)
    driver = chrome.inicializar_driver(self.download_dir)
    driver.get(href)
    try:
        time_x_path = ('/html/body/div[1]/div/div/div[2]/main/div/article'
                       '/div[3]/header/div[5]/ul/li[1]/div/time')
        time_el = WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.XPATH, time_x_path)))
        timestamp_attr = time_el.get_attribute('datetime')
        logger.info(timestamp_attr)
        artigo_x_path = ('/html/body/div[1]/div/div/div[2]/main/div'
                         '/article/section')
        artigo_el = WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.XPATH, artigo_x_path)))
        divs = artigo_el.find_elements(By.TAG_NAME, 'div')
        materia = ''
        for div in divs:
            try:
                div_paragrafos = div.find_element(By.TAG_NAME, 'div')
                paragrafos = div_paragrafos.find_elements(By.TAG_NAME, 'p')
                for paragrafo in paragrafos:
                    materia = materia + paragrafo.get_attribute('innerHTML')
            except NoSuchElementException:
                # Not every <div> wraps paragraphs; skip those that don't.
                pass
        logger.debug(materia)
        filename = re.sub(r':', '-', timestamp_attr)
        filename_path = os.path.join(self.download_dir, filename)
        logger.info('***FILENAME: {}***'.format(filename))
        with open(filename_path, 'w') as f:
            f.write(href)
            f.write(materia)
    except TimeoutException as ex:
        logger.debug(ex)
    finally:
        driver.close()
def treat_link(self, href):
    logger.info('=== Accessing {} ==='.format(href))
    driver = chrome.inicializar_driver(self.download_dir,
                                       headless=False,
                                       habilitar_javascript=False)
    try:
        driver.get(href)
        data_el = WebDriverWait(driver, 30).until(
            EC.presence_of_element_located(
                (By.CLASS_NAME, 'c-more-options__header')))
        data_el = data_el.find_element(
            By.CLASS_NAME, 'c-more-options__published-date')
        try:
            logger.debug('=== innerHtml: {} ==='.format(
                data_el.get_attribute('innerHTML')))
        except Exception:
            # Stale reference: locate the element again.
            data_el = data_el.find_element(
                By.CLASS_NAME, 'c-more-options__published-date')
        data = data_el.get_attribute('datetime')
        data = dt.strptime(data, '%Y-%m-%d %H:%M:%S')
        logger.info('=== DATE {} ==='.format(data))
        try:
            conteudo = driver.find_element(By.CLASS_NAME, 'c-news__content')
        except NoSuchElementException as nse:
            # The original only logged here, leaving conteudo as None and
            # crashing on the next lookup; re-raise instead.
            logger.exception(nse)
            raise
        try:
            # Narrow down to the article body element.
            conteudo = conteudo.find_element(By.CLASS_NAME, 'c-news__body')
        except NoSuchElementException as nxe:
            logger.exception(nxe)
            raise
        paragrafos = conteudo.find_elements(By.TAG_NAME, 'p')
        logger.info('=== Number of paragraphs: {} ==='.format(
            len(paragrafos)))
        materia = ''
        for p in paragrafos:
            materia = materia + p.get_attribute('innerHTML')
        # Build a filesystem-safe filename from the URL and the date.
        filename = re.sub(r'[/.:]', '-', href) + re.sub(
            r':', '-', data.strftime('%Y-%m-%d %H:%M:%S'))
        filename_path = os.path.join(self.download_dir, filename)
        logger.info('***FILENAME: {}***'.format(filename))
        with open(filename_path, 'w') as f:
            f.write(href)
            f.write(materia)
    except Exception as ex:
        logger.exception(ex)
    finally:
        driver.close()
def scrap(self, termos, data_inicio, data_fim, max_paginas=50):
    driver = chrome.inicializar_driver(self.download_dir)
    search_url = self.search_page_url + 'q=' + termos + '&filter=news'
    logger.info(search_url)
    driver.get(search_url)
    logger.info('Starting to scrape the results')
    try:
        next_page = True
        page_count = 1
        while next_page and page_count < max_paginas:
            logger.info('=== Accessing page {} ==='.format(page_count))
            results = WebDriverWait(driver, 30).until(
                EC.presence_of_element_located(
                    (By.CLASS_NAME, 'css-1v7bxtk-StyledContainer')))
            links_list = results.find_elements(By.CLASS_NAME,
                                               'css-2e0sc2-Promo')
            logger.debug('=== NUMBER OF LINKS: {} ==='.format(
                len(links_list)))
            link_counter = 0
            for link in links_list:
                link_counter += 1
                logger.debug('=== Accessing link no. {} ==='.format(
                    link_counter))
                logger.info(link)
                date_div = link.find_element(
                    By.CLASS_NAME, 'css-1hizfh0-MetadataSnippet')
                date_spans = date_div.find_elements(By.TAG_NAME, 'span')
                data = None
                for span in date_spans:
                    span_content = span.get_attribute('innerHTML')
                    logger.info(span_content)
                    # Dates render as e.g. '2 Mar 2021' or '12 Mar 2021'
                    # (10 or 11 characters).
                    if len(span_content) in (10, 11):
                        data = span_content
                if data is None:
                    # A promo without a parseable date marks the end of
                    # the article results; stop scanning this page.
                    break
                try:
                    data_data = dt.strptime(data, '%d %b %Y')
                    logger.info('=== DATE {} ==='.format(data_data))
                    href = link.find_elements(
                        By.TAG_NAME, 'a')[0].get_attribute('href')
                    if dt.strptime(data_inicio, '%Y-%m-%d') <= data_data <= \
                            dt.strptime(data_fim, '%Y-%m-%d'):
                        try:
                            self.treat_link(href, data)
                        except Exception as ex:
                            logger.exception(ex)
                    else:
                        logger.debug('date out of range {0}'.format(data))
                except ValueError as vex:
                    logger.debug(vex)
            if link_counter >= 10:
                # A full page of promos suggests there is another page.
                page_count += 1
                url_proxima_pagina = search_url + '&page={}'.format(
                    page_count)
                driver.get(url_proxima_pagina)
            else:
                next_page = False
    finally:
        driver.close()
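# A hedged usage sketch, assuming these methods live on scraper classes
# exposing search_page_url and download_dir attributes. The class name
# below is illustrative, not taken from the original code:
#
#     scraper = BBCScraper(download_dir='/tmp/raspagem')
#     scraper.scrap('elections', '2021-01-01', '2021-03-31', max_paginas=20)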