Ejemplo n.º 1
0
    def scrap(self, termos, data_inicio, data_fim):
        """Scrape the search results, following pagination, and hand every
        article dated within [data_inicio, data_fim] to ``treat_link``.

        :param termos: search terms; '+' characters are turned into spaces
        :param data_inicio: inclusive start date, 'YYYY-MM-DD'
        :param data_fim: inclusive end date, 'YYYY-MM-DD'
        """
        # Parse the window bounds once, before a browser is started, so a
        # bad date string cannot leak a driver.
        inicio = dt.strptime(data_inicio, '%Y-%m-%d')
        fim = dt.strptime(data_fim, '%Y-%m-%d')

        driver = chrome.inicializar_driver(self.download_dir)

        search_url = self.search_page_url + 'q=' + termos.replace('+', ' ')
        logger.info(search_url)
        driver.get(search_url)

        logger.info('Iniciando raspagem dos resultados')

        try:
            next_page = True
            page_count = 0
            while next_page:
                # FIX: switch_to_active_element() was deprecated and removed
                # in Selenium 4; the supported form is the property below.
                driver.switch_to.active_element

                WebDriverWait(driver, 30).until(
                    EC.presence_of_element_located(
                        (By.CLASS_NAME, 'topics-sec-block')))

                links = driver.find_elements(By.CLASS_NAME, 'topics-sec-item')
                for link in links:
                    data = link.find_element(
                        By.CLASS_NAME,
                        'humanize-datetime').get_attribute('data-modifieddate')
                    # Keep only the date part of the ISO timestamp.
                    data = data[0:data.find('T')]
                    href = link.find_elements(By.TAG_NAME,
                                              'a')[1].get_attribute('href')
                    if inicio <= dt.strptime(data, '%Y-%m-%d') <= fim:
                        self.treat_link(href, data)
                page_count += 1
                logger.info(
                    'Terminou a pagina {}. Indo para a próxima...'.format(
                        page_count))
                try:
                    next_link = driver.find_element(
                        By.CLASS_NAME, 'search-result-pagination')
                    next_link = next_link.find_element(By.CLASS_NAME, 'next-page')\
                        .find_element(By.TAG_NAME, 'a').get_attribute('onclick')
                    logger.debug(next_link)
                    driver.execute_script(next_link)
                except (NoSuchElementException,
                        ElementClickInterceptedException) as ex:
                    # No usable "next" link: stop paginating.
                    next_page = False
                    logger.error(ex)
        finally:
            driver.close()
Ejemplo n.º 2
0
    def scrap(self, termos, data_inicio, data_fim):
        """Scrape every result page of a date-bounded search and hand each
        result link to ``treat_link``.

        :param termos: search terms, already URL-encoded
        :param data_inicio: start date for the 'personalizado' period filter
        :param data_fim: end date for the 'personalizado' period filter
        """
        driver = chrome.inicializar_driver(self.download_dir)

        search_url = self.search_page_url + 'q=' + termos + '&' \
                                                            'periodo=personalizado' \
                                                            '&sd='+data_inicio+'&ed='+data_fim+'&site=sitefolha'

        logger.info(search_url)
        driver.get(search_url)
        logger.info('Iniciando raspagem do resultado')

        try:
            next_page = True
            page_count = 1
            while next_page:
                logger.info(
                    '=== Inicializando raspagem na página {} ==== '.format(
                        page_count))
                WebDriverWait(driver, 30).until(
                    EC.presence_of_element_located(
                        (By.CLASS_NAME, "c-search")))
                lista_links = driver.find_element(By.TAG_NAME, 'ol')
                for link in lista_links.find_elements(By.TAG_NAME, 'li'):
                    href = link.find_element(By.TAG_NAME,
                                             'a').get_attribute('href')
                    logger.info('=== href: {}'.format(href))
                    self.treat_link(href)
                try:
                    arrows = driver.find_elements(By.CLASS_NAME,
                                                  'c-pagination__arrows')
                    if not arrows:
                        # No pagination controls at all: nothing more to do.
                        # (FIX: the original had an unreachable statement
                        # after this break.)
                        break
                    # A single arrow is "next"; with two arrows the second
                    # is "next" (the first is "previous").
                    arrow_index = 0 if len(arrows) == 1 else 1
                    arrows[arrow_index].click()
                except NoSuchElementException as ex:
                    next_page = False
                    logger.error(ex)
                page_count += 1
        except Exception as ex:
            logger.exception(ex)
        finally:
            # FIX: the original never released the browser; close it on
            # every exit path.
            driver.close()
Ejemplo n.º 3
0
    def scrap(self, qparams):
        """Load the full search result page (clicking 'show more' until it
        disappears) and hand every result link to ``treat_link``.

        :param qparams: dict of query-string parameters (already encoded)
        """
        driver = chrome.inicializar_driver(self.download_dir)

        search_url = self.search_page_url
        for k, v in qparams.items():
            search_url = search_url + k + '=' + v + '&'

        print(search_url)
        driver.get(search_url)

        print('Carregando todos os resultados na página de busca...')

        def _botao_mais_resultados():
            # Returns the 'show more' button, or None once it is gone.
            # (Extracted: the original duplicated this lookup verbatim.)
            try:
                return driver.find_element(
                    By.XPATH,
                    '//*[@id="site-content"]/div/div[2]/div[2]/div/button')
            except NoSuchElementException:
                print('Terminou de carregar página de busca')
                return None

        el = _botao_mais_resultados()
        click_counter = 0
        while el is not None:
            print(el.get_attribute('innerHTML'))
            el.click()
            click_counter += 1
            print('click_counter =  {0}'.format(click_counter))
            time.sleep(1)
            el = _botao_mais_resultados()

        print('Iniciando raspagem dos resultados')

        links = driver.find_elements(By.CLASS_NAME, 'css-1l4w6pd')

        try:
            for link in links:
                # NOTE(review): html_text is never used; the lookup is kept
                # because a missing element raises and aborts the loop, which
                # may be relied upon — confirm before removing.
                html_text = link.find_element(
                    By.CLASS_NAME, 'css-2fgx4k').get_attribute('innerHTML')
                a_tag = link.find_element(By.TAG_NAME, 'a')
                href = a_tag.get_attribute('href')
                self.treat_link(href)
        finally:
            driver.close()
Ejemplo n.º 4
0
 def treat_link(self, href, data):
     """Download one article page and persist its paragraphs to a file
     named from the (sanitised) URL and the article date.

     :param href: absolute URL of the article
     :param data: article date string (used in the output filename)
     """
     logger.info(href)
     driver = chrome.inicializar_driver(self.download_dir)
     driver.get(href)
     try:
         artigo_el = WebDriverWait(driver, 30).until(
             EC.presence_of_element_located((By.ID, 'orb-modules')))
         logger.info(artigo_el)
         timestamp_attr = data
         logger.info(timestamp_attr)
         # FIX: the original looked up 'site-container' first and then
         # immediately overwrote the variable with 'page'; the dead lookup
         # has been removed.
         site_container = artigo_el.find_element(By.ID, 'page')
         container = site_container.find_element(By.CLASS_NAME,
                                                 'container').find_element(
                                                     By.CLASS_NAME,
                                                     'column-clearfix')
         column = container.find_element(By.CLASS_NAME, 'column--primary')
         paragrafos = column.find_elements(By.TAG_NAME, 'p')
         materia = ''
         for p in paragrafos:
             # Strip bare <span> wrappers, keep the rest of the markup.
             paragrafo = p.get_attribute('innerHTML')
             paragrafo = re.sub(r'<span>', '', paragrafo)
             paragrafo = re.sub(r'</span>', '', paragrafo)
             materia = materia + paragrafo
         logger.debug(materia)
         href = re.sub(r'/', '-', href)
         href = re.sub(r'\.', '-', href)
         # NOTE(review): the sanitised href appears twice in the filename;
         # it looks accidental but is preserved so existing files match.
         filename = re.sub(r':', '-', href) + re.sub(
             r':', '-', timestamp_attr) + href
         filename_path = os.path.join(self.download_dir, filename)
         logger.info('***FILENAME: {}***'.format(filename))
         # 'with' already flushes and closes; the explicit calls were
         # redundant and have been removed.
         with open(filename_path, 'w') as f:
             f.write(href)
             f.write(materia)
     except TimeoutException as ex:
         logger.debug('TimeoutException')
         logger.debug(ex)
     except NoSuchElementException as ex:
         logger.debug(ex)
     finally:
         driver.close()
Ejemplo n.º 5
0
def scrap(qparams):
    """Open the search page built from *qparams* and print the page body.

    :param qparams: dict of query-string parameters (already encoded)
    """
    driver = chrome.inicializar_driver()

    search_url = _search_page_url
    for k, v in qparams.items():
        search_url = search_url + k + '=' + v + '&'

    # FIX: the original called "'_search_url{0}'.format()" — format() with
    # no argument raises IndexError, and the literal was not the built URL.
    driver.get(search_url)

    # FIX: WebDriver has no get_attribute(); fetch the <body> element and
    # read its HTML (By.TAG is also not a valid locator — By.TAG_NAME is).
    html = driver.find_element(By.TAG_NAME, 'body').get_attribute('innerHTML')

    print(html)
Ejemplo n.º 6
0
    def scrap(self, termos, data_inicio, data_fim, max_paginas):
        """Scrape the news search results and hand every article dated
        within [data_inicio, data_fim] to ``treat_link``.

        :param termos: search terms, already URL-encoded
        :param data_inicio: inclusive start date, 'YYYY-MM-DD'
        :param data_fim: inclusive end date, 'YYYY-MM-DD'
        :param max_paginas: accepted for interface compatibility; no
            next-page navigation exists here, so one page is processed
        """
        # Parse the window bounds once, before a browser is started.
        inicio = dt.strptime(data_inicio, '%Y-%m-%d')
        fim = dt.strptime(data_fim, '%Y-%m-%d')

        driver = chrome.inicializar_driver(self.download_dir)

        search_url = self.search_page_url + 'q=' + termos + '&filter=news'
        logger.info(search_url)
        driver.get(search_url)
        logger.info('Iniciando raspagem dos resultados')

        try:
            # FIX: the original wrapped only this wait in 'while next_page:'
            # with nothing ever clearing the flag — an infinite loop — while
            # the result processing sat outside it and never ran. The page
            # is now waited for and processed exactly once.
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located(
                    (By.CLASS_NAME, 'search-results')))

            results = driver.find_elements(By.CLASS_NAME, 'search-results')

            for result in results:
                links = result.find_elements(By.TAG_NAME, 'li')
                logger.info('links {0}'.format(links))
                for link in links:
                    data = link.find_element(By.TAG_NAME,
                                             'time').get_attribute('datetime')
                    # Keep only the date part of the ISO timestamp.
                    data = data[0:data.find('T')]
                    logger.info(data)
                    href = link.find_elements(By.TAG_NAME,
                                              'a')[0].get_attribute('href')
                    if inicio <= dt.strptime(data, '%Y-%m-%d') <= fim:
                        self.treat_link(href, data)
                    else:
                        logger.debug('data nao passou {0} '.format(data))
        finally:
            driver.close()
Ejemplo n.º 7
0
 def treat_link(self, href, data):
     """Download one article page and persist its paragraphs to a file
     named from the (sanitised) URL and the article date.

     :param href: absolute URL of the article
     :param data: article date string (used in the output filename)
     """
     logger.info(href)
     driver = chrome.inicializar_driver(self.download_dir)
     driver.get(href)
     try:
         time_el = WebDriverWait(driver, 30).until(
             EC.presence_of_element_located(
                 (By.CLASS_NAME, 'timeagofunction')))
         logger.info(time_el)
         timestamp_attr = data
         logger.info(timestamp_attr)
         artigo_el = WebDriverWait(driver, 30).until(
             EC.presence_of_element_located(
                 (By.CLASS_NAME, 'article-p-wrapper')))
         materia = ''
         for p in artigo_el.find_elements(By.TAG_NAME, 'p'):
             # Strip bare <span> wrappers, keep the rest of the markup.
             paragrafo = p.get_attribute('innerHTML')
             paragrafo = re.sub(r'<span>', '', paragrafo)
             paragrafo = re.sub(r'</span>', '', paragrafo)
             materia = materia + paragrafo
         href = re.sub(r'/', '-', href)
         href = re.sub(r'\.', '-', href)
         # NOTE(review): the sanitised href appears twice in the filename;
         # it looks accidental but is preserved so existing files match.
         filename = re.sub(r':', '-', href) + re.sub(
             r':', '-', timestamp_attr) + href
         filename_path = os.path.join(self.download_dir, filename)
         logger.info('***FILENAME: {}***'.format(filename))
         # 'with' already flushes and closes the file.
         with open(filename_path, 'w') as f:
             f.write(href)
             f.write(materia)
     except TimeoutException as ex:
         logger.debug('TimeoutException')
         # FIX: the caught exception is now recorded instead of discarded.
         logger.debug(ex)
     finally:
         driver.close()
Ejemplo n.º 8
0
 def treat_link(self, href):
     """Download one article page and persist its paragraphs to a file
     named after the article's publication timestamp.

     :param href: absolute URL of the article
     """
     logger.info(href)
     driver = chrome.inicializar_driver(self.download_dir)
     driver.get(href)
     try:
         time_x_path = '/html/body/div[1]/div/div/div[2]/main/div/article/div[3]/header/div[5]/ul/li[1]/div/time'
         time_el = WebDriverWait(driver, 30).until(
             EC.presence_of_element_located((By.XPATH, time_x_path)))
         timestamp_attr = time_el.get_attribute('datetime')
         logger.info(timestamp_attr)
         artigo_x_path = '/html/body/div[1]/div/div/div[2]/main/div/article/section'
         artigo_el = WebDriverWait(driver, 30).until(
             EC.presence_of_element_located((By.XPATH, artigo_x_path)))
         materia = ''
         for div in artigo_el.find_elements(By.TAG_NAME, 'div'):
             try:
                 div_paragrafos = div.find_element(By.TAG_NAME, 'div')
                 for paragrafo in div_paragrafos.find_elements(By.TAG_NAME, 'p'):
                     materia = materia + paragrafo.get_attribute('innerHTML')
             except NoSuchElementException:
                 # Divs without a nested div carry no paragraphs; skip them.
                 pass
         print(materia)
         # ':' is not allowed in filenames on some platforms.
         filename = re.sub(r':', '-', timestamp_attr)
         filename_path = os.path.join(self.download_dir, filename)
         print('***FILENAME: {}***'.format(filename))
         # 'with' already flushes and closes the file.
         with open(filename_path, 'w') as f:
             f.write(href)
             f.write(materia)
     except TimeoutException as ex:
         # FIX: the timeout was silently swallowed; at least record it.
         logger.debug(ex)
     finally:
         driver.close()
Ejemplo n.º 9
0
    def treat_link(self, href):
        """Download one article page and persist its paragraphs to a file
        named from the (sanitised) URL and the publication timestamp.

        :param href: absolute URL of the article
        """
        logger.info('=== Acessando {} ==='.format(href))
        driver = chrome.inicializar_driver(self.download_dir,
                                           headless=False,
                                           habilitar_javascript=False)
        try:
            driver.get(href)

            data_el = WebDriverWait(driver, 30).until(
                EC.presence_of_element_located(
                    (By.CLASS_NAME, 'c-more-options__header')))

            data_el = data_el.find_element(By.CLASS_NAME,
                                           "c-more-options__published-date")

            try:
                logger.debug('=== innerHtml: {} ==='.format(
                    data_el.get_attribute('innerHTML')))
            except Exception as ex:
                # NOTE(review): retries the same class on the same element —
                # possibly a different selector was intended; confirm.
                data_el = data_el.find_element(
                    By.CLASS_NAME, "c-more-options__published-date")

            data = data_el.get_attribute('datetime')
            data = dt.strptime(data, '%Y-%m-%d %H:%M:%S')
            logger.info('=== DATA {} ==='.format(data))
            try:
                conteudo = driver.find_element(By.CLASS_NAME,
                                               'c-news__content')
            except NoSuchElementException as nse:
                logger.exception(nse)
                try:
                    # FIX: the original called find_element on 'conteudo',
                    # which is None in this branch (AttributeError); the
                    # fallback lookup must go through the driver.
                    conteudo = driver.find_element(By.CLASS_NAME,
                                                   'c-news__body')
                except NoSuchElementException as nxe:
                    logger.exception(nxe)
                    raise

            paragrafos = conteudo.find_elements(By.TAG_NAME, 'p')

            logger.info('=== Numero de parágrafos: {} ==='.format(
                len(paragrafos)))

            materia = ''
            for p in paragrafos:
                materia = materia + p.get_attribute('innerHTML')
            filename = re.sub(r'/', '-', href) + re.sub(
                r':', '-', data.strftime('%Y-%m-%d %H:%M:%S'))
            filename_path = os.path.join(self.download_dir, filename)
            logger.info('***FILENAME: {}***'.format(filename))
            # 'with' already flushes and closes the file.
            with open(filename_path, 'w') as f:
                f.write(href)
                f.write(materia)
        except Exception as ex:
            logger.exception(ex)
        finally:
            driver.close()
Ejemplo n.º 10
0
    def scrap(self, termos, data_inicio, data_fim, max_paginas=50):
        """Scrape up to *max_paginas* search result pages and hand every
        article dated within [data_inicio, data_fim] to ``treat_link``.

        :param termos: search terms, already URL-encoded
        :param data_inicio: inclusive start date, 'YYYY-MM-DD'
        :param data_fim: inclusive end date, 'YYYY-MM-DD'
        :param max_paginas: hard cap on the number of result pages visited
        """
        # Parse the window bounds once, before a browser is started.
        inicio = dt.strptime(data_inicio, '%Y-%m-%d')
        fim = dt.strptime(data_fim, '%Y-%m-%d')

        driver = chrome.inicializar_driver(self.download_dir)

        search_url = self.search_page_url + 'q=' + termos + '&filter=news'
        logger.info(search_url)
        driver.get(search_url)
        logger.info('Iniciando raspagem dos resultados')

        try:
            next_page = True
            page_count = 1
            while next_page and page_count < max_paginas:

                logger.info('=== Acessando página {} ==='.format(page_count))

                results = WebDriverWait(driver, 30).until(
                    EC.presence_of_element_located(
                        (By.CLASS_NAME, 'css-1v7bxtk-StyledContainer')))

                links_list = results.find_elements(By.CLASS_NAME,
                                                   'css-2e0sc2-Promo')

                logger.debug('=== QTDE LINKS: {} ==='.format(len(links_list)))
                link_counter = 0

                for link in links_list:
                    link_counter += 1
                    logger.debug(
                        '=== Acessando link n_o {} ==='.format(link_counter))
                    logger.info(link)

                    date_div = link.find_element(
                        By.CLASS_NAME, 'css-1hizfh0-MetadataSnippet')
                    data = None
                    for span in date_div.find_elements(By.TAG_NAME, 'span'):
                        span_content = span.get_attribute('innerHTML')
                        logger.info(span_content)
                        # A date like '12 Mar 2020' is 10 or 11 chars long.
                        if len(span_content) in (10, 11):
                            data = span_content

                    if data is None:
                        # NOTE(review): this abandons the REST of the page's
                        # links, not just this one — 'continue' may have been
                        # intended; behaviour preserved pending confirmation.
                        break
                    try:
                        data_data = dt.strptime(data, '%d %b %Y')
                        logger.info('=== DATA  {} ==='.format(data_data))

                        href = link.find_elements(By.TAG_NAME,
                                                  'a')[0].get_attribute('href')

                        if inicio <= data_data <= fim:
                            try:
                                self.treat_link(href, data)
                            except Exception as ex:
                                # Best-effort per article; FIX — log instead
                                # of silently swallowing the failure.
                                logger.exception(ex)
                        else:
                            logger.debug('data nao passou {0} '.format(data))
                    except ValueError as vex:
                        # Span looked like a date but did not parse.
                        logger.debug(vex)

                # Fewer than 10 links means a short (last) page.
                if link_counter >= 10:
                    page_count += 1
                    driver.get(search_url + '&page={}'.format(page_count))
                else:
                    next_page = False
        finally:
            driver.close()