Example #1
def crawler(urls: hug.types.multiple, word: hug.types.text):
    """Search each submitted URL for the given word and collect the results."""
    validate = Validate()
    crawler = Crawler()
    # Reject the request early if any of the submitted URLs is invalid
    if not validate.is_valid_urls(urls):
        return {'error': 'Send valid urls'}

    data = {'data': []}
    for url in urls:
        data['data'].append(crawler.find_word_url(url, word))
    return data
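The handler above is presumably exposed through hug's routing, although the snippet does not show its decorator; a minimal sketch (the '/crawler' path and the file name are assumptions) looks like this:

import hug

@hug.get('/crawler')  # assumed route; the original decorator is not shown
def crawler(urls: hug.types.multiple, word: hug.types.text):
    ...

With a real body, the endpoint can then be served locally with hug -f api.py (the module name is also an assumption).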
Example #2
def main(p, d, q, f, o, idf):
    # o selects the stage to run: '0' = crawler only, '1' = parser only,
    # any other value runs both stages
    doCrawler = True
    doParser = True
    if o == '0':
        doParser = False
    elif o == '1':
        doCrawler = False

    # add '\' to the end of the directory name if it is missing
    if d.endswith('\\'):
        dir_ = d
    else:
        dir_ = d + '\\'
    # prefix the current working directory when the path does not start with '\'
    if not dir_.startswith('\\'):
        dir_ = os.getcwd() + '\\' + dir_

    if doCrawler:
        if q == 0:
            print('Crawling started...')
            print('---------------------------------')
        c = Crawler('https://osobne-auta.autobazar.sk/?p[order]=23&p[page]=',
                    dir_, q, p, f)
        c.getUrlList()
        c.downloadContent()
    if doParser:
        if q == 0:
            print('Parsing started...')
            print('---------------------------------')
        # index every downloaded file from the working directory
        files = [dir_ + name for name in os.listdir(dir_)]
        es = Elasticsearch()
        parser = Parser(es, q, idf)
        parser.parseData(files)
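The directory handling above hard-codes the Windows '\' separator; a portable sketch of the same normalization using os.path (an alternative, not the original project's code) is:

import os

def normalize_dir(d):
    # os.path.abspath resolves the path against the current working directory,
    # and joining with '' guarantees a trailing separator on any platform.
    return os.path.join(os.path.abspath(d), '')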
Example #3
    def run(self):
        chrome = myBrowser(headless=self.head)
        # Name the report directory after the user and the current timestamp,
        # e.g. <directory>\<username>_2024-01-02_120305
        timestamp = str(datetime.datetime.now()).split('.')[0]
        timestamp = timestamp.replace(' ', '_').replace(':', '')
        reportDirectory = "{0}\\{1}_{2}".format(self.directory, self.username,
                                                timestamp)
        chrome.set_download_dir(reportDirectory)
        chrome.init_browser()

        parser = myParser()

        crawler = Crawler(chrome, parser, self.username, self.password,
                          self.since, self.till, self.status, reportDirectory,
                          self.fromfile, self.reader_directory)

        if crawler.login_szoi():
            crawler.order_by_status()
            crawler.select()
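The same timestamp can be produced directly with strftime instead of the split/replace chain; this is only a sketch, with a hypothetical helper standing in for the attributes used above:

import datetime
import os

def build_report_directory(directory, username):
    # directory and username stand in for self.directory and self.username above
    stamp = datetime.datetime.now().strftime('%Y-%m-%d_%H%M%S')
    return os.path.join(directory, '{0}_{1}'.format(username, stamp))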
Example #4
        artist = input('Enter the artist name: ')
        # Handle user input, which must be an integer
        try:
            number_songs = int(input('Enter how many songs to search for (the default is 15): '))
        except ValueError:
            number_songs = 15
            print('\nSince an integer was not provided, 15 songs by the artist/band will be returned')

        request = Request(url, artist)
        content = request.content_page()  # returns the HTML text of the chosen artist/band page
        # Check whether the page was found
        if content is False:
            print('\nThe page for the chosen artist was not found')
            sys.exit()

        crawler = Crawler(content, number_songs)
        musics = crawler.find_musics()  # returns a list of the songs
        print('\n%d songs were found for the band/artist %s' % (crawler.number_songs(musics), artist.upper()))
        # Iterate over the list of songs
        for i, music in enumerate(musics):
            print('%d) %s' % (i, music))
    except IndexError:
        print('The given site does not exist\n')

    # Exit the loop if the user does not want to run more searches
    continue_search = input('\nType (sim) to run another search: ')
    if continue_search.lower() != 'sim':
        sys.exit()
Example #5
def test_number_songs():
    request = Request('http://www.vagalume.com.br', 'michael jackson')
    crawler = Crawler(request.content_page(), 20)
    musics = crawler.find_musics()
    total = crawler.number_songs(musics)
    assert isinstance(total, int)
Example #6
def test_find_musics():
    request = Request('http://www.vagalume.com.br', 'michael jackson')
    crawler = Crawler(request.content_page(), 20)
    assert isinstance(crawler.find_musics(), list)
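Both tests hit www.vagalume.com.br at run time; an offline variant could stub content_page with unittest.mock. This is only a sketch and the returned HTML is a placeholder, not real markup from the site:

from unittest import mock

def test_find_musics_offline():
    # Stub the network call so the test does not depend on the site being reachable
    with mock.patch.object(Request, 'content_page', return_value='<html></html>'):
        request = Request('http://www.vagalume.com.br', 'michael jackson')
        crawler = Crawler(request.content_page(), 20)
        assert isinstance(crawler.find_musics(), list)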
Example #7
def test_crawler_invalid_url():
    crawler = Crawler()
    result = crawler.find_word_url('invalidurl', 'assine')
    assert result['success'] is False
Example #8
def test_crawler_valid_url():
    crawler = Crawler()
    result = crawler.find_word_url('uol.com.br', 'assine')
    assert result['success'] is True
Example #9
		'show': show,
		'review_link': link,
		'episode_title': episode_title,
		'season': season,
		'episode': episode,
		'grade': grade,
		'date': date,
		'reviewer': {
			'reviewer_name': reviewer_name,
			'kinja_link': reviewer_link
			},
		'show_stub': show_stub
	}


crawler = Crawler(headless=True)
crawler.get('https://www.avclub.com/c/tv-review')

show_menu = crawler.driver.find_elements_by_xpath("//*[contains(text(), 'All Categories')]")[0]
show_menu.click()

shows = [_.text for _ in crawler.driver.find_elements_by_xpath("//ul//li[@role='option']//span") if _.text != 'All Categories']

error_links = []
output_data = {'data': {}, 'meta': {'date_retrieved': str(datetime.datetime.now())}}

for show in shows:
	print(show)

	# skip shows that have already been collected
	if show in output_data['data']:
		continue
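Everything scraped ends up in output_data, but the snippet stops before the results are written anywhere; a minimal sketch of persisting them (the output filename is an assumption) is:

import json

# dump the collected reviews together with the retrieval timestamp kept in 'meta'
with open('avclub_reviews.json', 'w', encoding='utf-8') as fh:
    json.dump(output_data, fh, indent=2, ensure_ascii=False)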
Example #10
from classes.crawler import Crawler
from classes.console import Console
from config.config import Config

# Set connection to the origin DB
config = Config()

# Initialize main calculation class
crawler = Crawler(config.config_initial)

# Initialize console object
console = Console(config.getMinAllowedInputs())

# Check if user's input is correct
# and has valid characters for calculation
while True:
    # Ask for user input and return a filtered result
    # acceptable for further calculation
    parameters = console.askInput()
    if parameters and any(s in parameters
                          for s in config.getMinAllowedInputs()):
        # Perform position calculation
        result = crawler.calculateLocation(parameters)
        print(result)
        break

    print("Initial parameters are not correct. Please try again")
Example #11
import os
import time
import datetime
import logging
import multiprocessing as mp

import urllib3

from classes.database import Database
from classes.crawler import Crawler

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

logging.basicConfig(level=logging.INFO)
database = Database('database.db')

crawler = Crawler(logging)

APP_BASE_DIR = os.path.dirname(os.path.realpath(__file__))
SCRAPPED_URLS = []


def find_assets(link):
    '''Search the page served at the given link for the known assets.'''
    # keep the in-memory list of already-scraped URLs from growing unbounded
    if len(SCRAPPED_URLS) > 5000:
        SCRAPPED_URLS.clear()
    assets = database.fetchall(
        'SELECT id, asset, criticity from patrowl_assets')
    if assets:
        html_source = crawler.get_source(link)
        for asset in assets: