Example #1
    def start( self ):
        # Walk the news list and download every item still marked as pending.
        for index, news in enumerate( self.news_list ):
            try:
                if news['status'] == 'pending':
                    news_content = self.download_news( news )

                    if news_content:
                        self.news_list[ index ]['status'] = 'completed'
                        self.news.append( news_content )

                        log.success('[ {nid} ] Dados salvos com sucesso!'.format(nid=news['id']))

                        print()
                        print()
                    else:
                        error_message = 'Não foi possível fazer o parse dos dados.'
                        log.error( error_message )
                        self.errors.append( error_message )
                        self.news_list[ index ]['errors'].append( error_message )
                else:
                    log.warning('Dados já adquiridos [ {nid} ]'.format(nid=news['id']))
            except Exception as error:
                log.error('Erro ao baixar a notícia [ {nid} ]'.format(nid=news['id']))
                log.error(error)
            finally:
                # Persist progress after every item so an interrupted run can be resumed.
                helper.create_file( filename=self.dump_file, content=self.news, format='json', mode='w')
                helper.create_file( filename=self.news_json_file, content=self.news_list, format='json', mode='w')
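These snippets delegate file I/O to a small helper module that is not included in the excerpts. A minimal sketch compatible with the call sites (helper.read_file( path, format='json' ) and helper.create_file( filename, content, format='json', mode='w' )) could look like the following; the default mode and format are assumptions, not the project's actual implementation:

import json

def read_file( filename, format='text' ):
    # Read a whole file; decode JSON when format='json' is requested.
    with open( filename, 'r', encoding='utf-8' ) as handle:
        return json.load( handle ) if format == 'json' else handle.read()

def create_file( filename, content, format='text', mode='a' ):
    # Write (or append) content; serialize to JSON when format='json'.
    with open( filename, mode, encoding='utf-8' ) as handle:
        if format == 'json':
            json.dump( content, handle, ensure_ascii=False, indent=2 )
        else:
            handle.write( content )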
Example #2
    def download(type, filename, nid, url):
        if type == 'image':
            try:
                # Stream the response straight to disk instead of loading the whole image into memory.
                response = requests.get(url, stream=True)

                with open(filename, 'wb') as image:
                    shutil.copyfileobj(response.raw, image)
                log.success('Imagem baixada com sucesso [{url}]'.format(url=url))

                return True
            except Exception as error:
                log.error(error)
                return False
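Every example also relies on a log object exposing success(), warning() and error(). Its real implementation is not part of these excerpts; a minimal, assumed stand-in that only color-codes the three levels the snippets call might be:

class Log:
    # Hypothetical logger with just the three methods used in these examples.
    def success( self, message ):
        print( '\033[92m{message}\033[0m'.format(message=message) )  # green

    def warning( self, message ):
        print( '\033[93m{message}\033[0m'.format(message=message) )  # yellow

    def error( self, message ):
        print( '\033[91m{message}\033[0m'.format(message=message) )  # red

log = Log()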
Example #3
def set_image( news, index, link ):
    images_file = 'data/images.json'
    images = helper.read_file( images_file, format='json' ) if os.path.isfile( images_file ) else []

    try:
        # Compute the new path once and reuse it for both the record and the log message.
        new_path = set_image_link( news, index, link )

        images.append({
            'catalog': news['catalog'],
            'notice': news['id'],
            'downloaded': False,
            'original_path': link,
            'new_path': new_path
        })

        helper.create_file(images_file, images, mode='w', format='json')
        log.success('Imagem adicionada para a lista de downloads [ {image_link} ]'.format(image_link=new_path))
    except Exception as error:
        log.error( error )
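set_image_link( news, index, link ) is called above (and again in Example #8) but its body is not part of these excerpts. Purely as an illustration of the contract the callers assume, a hypothetical stand-in that derives the stored new_path from the news item and the original filename could look like this; the URL layout is invented, only the https://static.weg.net/ prefix is suggested by Example #4:

import os

def set_image_link( news, index, link ):
    # Hypothetical: build a deterministic CDN-style path for the image.
    filename = os.path.basename( link )
    return 'https://static.weg.net/medias/{catalog}/{nid}-{index}-{filename}'.format(
        catalog=news['catalog'], nid=news['id'], index=index, filename=filename)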
Example #4
    def __init__( self ):
        super( Images, self ).__init__()

        self.images_file = 'data/images.json'
        self.images_folder = 'data/news/'
        self.dump_file     = 'data/news/dump.json'

        if os.path.isfile( self.images_file ):
            images = helper.read_file( self.images_file, format='json' )

            for index, image in enumerate(images):
                try:
                    if not image['downloaded']:
                        # Mirror the CDN path locally under data/.
                        path = 'data/{image_path}'.format(image_path=image['new_path'].replace('https://static.weg.net/', ''))
                        folder = os.path.dirname( path )
                        base_url = 'http://www.weg.net'
                        download_url = image['original_path']

                        # exist_ok=True already tolerates an existing directory.
                        os.makedirs(folder, exist_ok=True)

                        # Resolve relative image links against the site root.
                        if not download_url.startswith('http'):
                            download_url = '{base_url}/{path}'.format(base_url=base_url, path=download_url)

                        if helper.download(type='image', filename=path, nid=index, url=download_url):
                            images[ index ]['downloaded'] = True
                            log.success('Imagem baixada com sucesso [ {path} ]'.format(path=path))
                    else:
                        log.warning('Imagem já baixada [ {url} ]'.format(url=image['new_path']))
                except Exception as error:
                    log.error( error )
                finally:
                    helper.create_file(self.images_file, images, mode='w', format='json')
        else:
            log.error('[!] Dump de imagens não existe')
Example #5
    def __init__( self ):
        super( Data, self ).__init__()

        self.news_list_file = 'data/notices.list'
        self.news_json_file = 'data/notices.json'
        self.dump_file = 'data/dump.json'
        self.proccess = os.getpid()
        self.errors = []
        self.news_id_length = 4

        init_message = 'Iniciando processo: {proccess}'.format(proccess=self.proccess)
       
        log.success( '=' * len( init_message ) )
        log.success( init_message )
        log.success( '=' * len( init_message ) )
        print()
Example #6
    def download_news( self, news ):
        init_crawling = '= Iniciando crawling, alvo: [ {nid} ] {link}'.format(nid=news['id'], link=os.path.basename( news['link'] ))

        print()

        log.success( '=' * len( init_crawling ) )
        log.success( init_crawling )
        log.success( '=' * len( init_crawling ) )

        print()

        request = requests.get( news['link'] )

        if request.status_code == 200:
            # Only parse the page when the request actually succeeded.
            document = BeautifulSoup( request.text, 'html.parser' )

            return parser.parse_news( news, document )
        else:
            error_message = 'Erro ao acessar a página: Status {status_code}'.format(status_code=request.status_code)
            self.errors.append( error_message )
            log.error( error_message )
Example #7
                pass
            finally:
                helper.create_file( filename=self.dump_file, content=self.news, format='json', mode='w')
                helper.create_file( filename=self.news_json_file, content=self.news_list, format='json', mode='w')


if __name__ == '__main__':
    scrapper = Scrapper()

    try:
        scrapper.start()
    except Exception as error:
        print()
        error_message = 'Erro ao iniciar processo: {proccess}'.format(proccess=scrapper.proccess)
        log.error('=' * len( error_message ))
        log.error( error_message )
        log.error(error)
        log.error('=' * len( error_message ))
        print()
    finally:
        finished_with_errors = 'Finalizado com {errors} erro{suffix}'.format(errors=len( scrapper.errors ), suffix='s' if len( scrapper.errors ) > 1 else '')
        finished_without_errors = 'Finalizado sem erros'

        if scrapper.errors:
            print()
            log.warning( '=' * len( finished_with_errors ) )
            log.warning( finished_with_errors )
        else:
            print()
            log.success( '=' * len( finished_without_errors ) )
            log.success( finished_without_errors )
Example #8
def get_content( news, content ):
    if not content[0]: return ''

    allowed_images_extension = ['.jpeg', '.jpg', '.png', '.gif', '.bmp', '.tif']
    document = BeautifulSoup( content[0].encode('utf-8'), 'html.parser' )
    to_remove = ['comparison', 'bgdark', 'bglight', 'default', 'clr', 'novaJanela']
    link = news['link']
    catalog = news['catalog']
    nid = news['id']

    # Strip legacy/theme CSS classes from any element that still carries them.
    for item in to_remove:
        for element in document.select('.{selector}'.format(selector=item)):
            element['class'].remove( item )

    if document.select('.center'):
        for center in document.select('.center'):
            center['class'] = 'text-center'

    if document.select('p'):
        paragraphs = document.select('p')

        for paragraph in paragraphs:
            # Decompose paragraphs that contain a bare non-breaking space or an
            # empty node, and stop iterating the contents of a decomposed tag.
            for node in paragraph.contents:
                if node == '\xa0' or not node:
                    paragraph.decompose()
                    break

    if document.select('table'):
        tables = document.select('table')
        tablefilename = 'logs/weg/tables.list'
        link = link if isinstance( link, str ) else link.attrs['href']
        table_log = '[ {nid} ]: {link}\n'.format(link=link, nid=nid)

        for table in tables:
            to_remove = ['cellpadding', 'border', 'cellspacing', 'width', 'height']
            responsive = document.new_tag('div')
            responsive['class'] = 'table-responsive'
            table.wrap( responsive )

            # get() avoids a KeyError on tables without a class attribute, and the
            # Bootstrap classes are appended individually instead of as one string.
            table['class'] = table.get('class', []) + ['table', 'table-bordered', 'table-hover']

            # Strip presentational attributes; styling is left to the CSS classes above.
            for item in to_remove:
                del table[ item ]

        # Keep a log of pages that contain tables, avoiding duplicate entries.
        if os.path.isfile( tablefilename ):
            content = helper.read_file( tablefilename )

            if link not in content:
                helper.create_file(tablefilename, table_log)
            else:
                log.warning('Tabela já adicionada para a lista [ {url} ]'.format(url=link))
        else:
            helper.create_file(tablefilename, table_log)
            log.success('Log de tabelas criado.')

    if document.select('a'):
        for index, link in enumerate( document.select('a'), start=0 ):
            if 'href' in link.attrs:
                filename, file_extension = os.path.splitext( link.attrs['href'] )

                if link.attrs['href'] == 'javascript:void();':
                    link.attrs['href'] = '#{nid}'.format(nid=news['id'])
                    link.attrs['data-prevent-default'] = 'true'

                if file_extension in allowed_images_extension:
                    set_image( news, index, link.attrs['href'] )
                    link.attrs['href'] = set_image_link( news, index, link.attrs['href'] )

    if document.select('img'):
        for index, image in enumerate( document.select('img'), start=0 ):
            filename, file_extension = os.path.splitext( image.attrs['src'] )
            responsive = True

            if file_extension in allowed_images_extension:
                set_image( news, index, image.attrs['src'] )
                image.attrs['src'] = set_image_link( news, index, image.attrs['src'] )

            # for parent in image.parents:
            #     if 'class' in parent.attrs:
            #         if 'coluna6' in parent.attrs['class']:
            #             responsive = False
            # if responsive:
            #     if 'class' in image.attrs:
            #         image.attrs['class'].append('img-responsive')
            #     else:
            #         image.attrs['class'] = 'img-responsive'

    if document.select('.coluna6'):
        columns = document.select('.coluna6')

        for column in columns:
            column['class'] = 'xtt-gallery pull-right'

    if document.select('ul'):
        for ul in document.select('ul'):
            ul['class'] = 'xtt-list-style'

            for li in ul.select('> li'):
                # Wrap the item's first text node in a <span> (assumes simple, text-only <li> contents).
                span = document.new_tag('span')
                span.string = li.contents[0]
                li.string = ''
                li.append( span )

    return str( document ).strip()