def crawler(urls: hug.types.multiple, word: hug.types.text):
    """hug endpoint: search each of `urls` for `word` and collect the results."""
    validate = Validate()
    crawler = Crawler()
    valid_urls = validate.is_valid_urls(urls)
    if not valid_urls:
        return {'error': 'Send valid urls'}
    data = {'data': []}
    for url in urls:
        data['data'].append(crawler.find_word_url(url, word))
    return data
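# A minimal sketch of the two helpers the endpoint above depends on; the real
# Validate and Crawler classes are not shown in this excerpt, so the logic and
# return shapes here are assumptions inferred from how the endpoint (and the
# tests further below) use them.
import requests


class Validate:
    def is_valid_urls(self, urls):
        # Hypothetical check: every entry must be a non-empty string.
        return all(isinstance(u, str) and u.strip() for u in urls)


class Crawler:
    def find_word_url(self, url, word):
        # Hypothetical lookup: fetch the page and report whether `word` occurs.
        if not url.startswith('http'):
            url = 'http://' + url
        try:
            page = requests.get(url, timeout=10)
            return {'success': True, 'url': url, 'found': word in page.text}
        except requests.RequestException:
            return {'success': False, 'url': url}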
import os

from elasticsearch import Elasticsearch

# Crawler and Parser are project-local classes; their imports are not shown
# in the original excerpt.


def main(p, d, q, f, o, idf):
    # o selects the stage to run: '0' = crawler only, '1' = parser only.
    doCrawler = True
    doParser = True
    if o == '0':
        doParser = False
    elif o == '1':
        doCrawler = False

    # Add a trailing '\' to the directory name if it is missing.
    if d.endswith('\\'):
        dir_ = d
    else:
        dir_ = d + '\\'
    # Prepend the current working directory if the path is relative.
    if not dir_.startswith('\\'):
        dir_ = os.getcwd() + '\\' + dir_

    if doCrawler:
        if q == 0:
            print('Crawling was started...')
            print('---------------------------------')
        c = Crawler('https://osobne-auta.autobazar.sk/?p[order]=23&p[page]=', dir_, q, p, f)
        c.getUrlList()
        c.downloadContent()

    if doParser:
        if q == 0:
            print('Parsing was started...')
            print('---------------------------------')
        files = [dir_ + name for name in os.listdir(dir_)]
        es = Elasticsearch()
        parser = Parser(es, q, idf)
        parser.parseData(files)
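# A portable alternative to the manual backslash handling in main() above;
# this is a sketch, not part of the original (which targets Windows paths
# explicitly): os.path.abspath resolves relative paths against the current
# working directory, and os.sep keeps the separator platform-appropriate.
import os


def normalize_dir(d):
    # 'data' -> '/current/dir/data/' on Unix, 'C:\\...\\data\\' on Windows.
    return os.path.abspath(d) + os.sep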
def run(self):
    chrome = myBrowser(headless=self.head)
    # Build a per-run report directory named after the user and a
    # filesystem-safe timestamp (spaces and colons stripped).
    timestamp = str(datetime.datetime.now()).split('.')[0].replace(' ', '_').replace(':', '')
    reportDirectory = "{0}\\{1}_{2}".format(self.directory, self.username, timestamp)
    chrome.set_download_dir(reportDirectory)
    chrome.init_browser()
    parser = myParser()
    crawler = Crawler(chrome, parser, self.username, self.password,
                      self.since, self.till, self.status, reportDirectory,
                      self.fromfile, self.reader_directory)
    if crawler.login_szoi():
        crawler.order_by_status()
        crawler.select()
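# The report directory name above embeds a filesystem-safe timestamp; the same
# transformation in isolation (the printed value is illustrative):
import datetime

stamp = str(datetime.datetime.now()).split('.')[0].replace(' ', '_').replace(':', '')
print(stamp)  # e.g. '2021-05-03_143022' -- no spaces or colons, safe on Windows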
# `url` is defined earlier in the original script; the tests below use the
# Vagalume base URL, so it is filled in here for completeness.
url = 'http://www.vagalume.com.br'

while True:
    try:
        artist = input('Enter the artist name: ')
        # Handle user input that must be an integer.
        try:
            number_songs = int(input('Enter how many songs to search for (the default is 15): '))
        except ValueError:
            number_songs = 15
            print('\nNo integer was given, so 15 songs by the artist/band will be returned')
        request = Request(url, artist)
        content = request.content_page()  # returns the HTML of the chosen artist/band page
        # Check whether the page was found.
        if content is False:
            print('\nThe page for the chosen artist was not found')
            sys.exit()
        crawler = Crawler(content, number_songs)
        musics = crawler.find_musics()  # returns a list of songs
        print('\nFound %d songs by the band/artist %s' % (crawler.number_songs(musics), artist.upper()))
        # Iterate over the song list.
        for i, music in enumerate(musics):
            print('%d) %s' % (i, music))
    except IndexError:
        print('The given site does not exist\n')
    # Exit the loop if the user does not want to run another search.
    continue_search = input('\nType "sim" (yes) to run another search: ')
    if continue_search.lower() != 'sim':
        sys.exit()
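# The try/except ValueError pattern above (fall back to a default when the
# input is not an integer) extracted as a reusable helper; a sketch, not part
# of the original script:
def ask_int(prompt, default):
    try:
        return int(input(prompt))
    except ValueError:
        print('Not an integer; defaulting to %d.' % default)
        return default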
def test_number_songs():
    request = Request('http://www.vagalume.com.br', 'michael jackson')
    crawler = Crawler(request.content_page(), 20)
    musics = crawler.find_musics()
    total = crawler.number_songs(musics)
    assert isinstance(total, int)


def test_find_musics():
    request = Request('http://www.vagalume.com.br', 'michael jackson')
    crawler = Crawler(request.content_page(), 20)
    assert isinstance(crawler.find_musics(), list)
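# Both tests above issue a live request to vagalume.com.br, so they fail when
# offline; a sketch of feeding the Crawler canned HTML instead. The markup
# below is an assumption about what find_musics() can parse, not the real
# Vagalume page structure.
def test_find_musics_offline():
    html = '<html><body><a class="song">Thriller</a></body></html>'
    crawler = Crawler(html, 20)
    assert isinstance(crawler.find_musics(), list)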
def test_crawler_invalid_url():
    crawler = Crawler()
    result = crawler.find_word_url('invalidurl', 'assine')
    assert result['success'] is False


def test_crawler_valid_url():
    crawler = Crawler()
    result = crawler.find_word_url('uol.com.br', 'assine')
    assert result['success'] is True
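# The two cases above can be collapsed with pytest parametrization (a sketch
# using pytest's real parametrize API):
import pytest


@pytest.mark.parametrize('url, expected', [
    ('invalidurl', False),
    ('uol.com.br', True),
])
def test_crawler_find_word_url(url, expected):
    assert Crawler().find_word_url(url, 'assine')['success'] is expected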
            # Tail of a review record built earlier in the file; the opening
            # of this dict is not shown in the excerpt.
            'show': show,
            'review_link': link,
            'episode_title': episode_title,
            'season': season,
            'episode': episode,
            'grade': grade,
            'date': date,
            'reviewer': {
                'reviewer_name': reviewer_name,
                'kinja_link': reviewer_link
            },
            'show_stub': show_stub
        }

crawler = Crawler(headless=True)
crawler.get('https://www.avclub.com/c/tv-review')
# Open the show dropdown and collect every show name it lists.
show_menu = crawler.driver.find_elements_by_xpath("//*[contains(text(), 'All Categories')]")[0]
show_menu.click()
shows = [_.text for _ in crawler.driver.find_elements_by_xpath("//ul//li[@role='option']//span")
         if _.text != 'All Categories']

error_links = []
output_data = {'data': {}, 'meta': {'date_retrieved': str(datetime.datetime.now())}}
for show in shows:
    print(show)
    # Skip shows that were already scraped.
    if show in output_data['data'].keys():
        continue
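# find_elements_by_xpath was removed in Selenium 4; if the script above is run
# against a current Selenium, the equivalent lookup is (assuming crawler.driver
# is a selenium WebDriver):
from selenium.webdriver.common.by import By

show_menu = crawler.driver.find_elements(By.XPATH, "//*[contains(text(), 'All Categories')]")[0]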
from classes.crawler import Crawler
from classes.console import Console
from config.config import Config

# Set up the connection to the origin DB.
config = Config()
# Initialize the main calculation class.
crawler = Crawler(config.config_initial)
# Initialize the console object.
console = Console(config.getMinAllowedInputs())

# Keep asking until the user's input is correct
# and contains valid characters for the calculation.
while True:
    # Ask for user input and return a filtered result
    # acceptable for further calculation.
    parameters = console.askInput()
    if parameters and any(s in parameters for s in config.getMinAllowedInputs()):
        # Perform the position calculation.
        result = crawler.calculateLocation(parameters)
        print(result)
        break
    print("Initial parameters are not correct. Please try again")
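# How the validity check in the loop above behaves in isolation: any() accepts
# the input as soon as at least one allowed key is present. The key names are
# illustrative assumptions, not the real config values.
min_allowed = ['lat', 'lon']
print(any(s in {'lat': 48.15} for s in min_allowed))  # True: 'lat' is present
print(any(s in {'speed': 30} for s in min_allowed))   # False: no allowed key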
import os
import time
import datetime
import logging
import multiprocessing as mp
import urllib3

from classes.database import Database
from classes.crawler import Crawler

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
logging.basicConfig(level=logging.INFO)

database = Database('database.db')
crawler = Crawler(logging)
APP_BASE_DIR = os.path.dirname(os.path.realpath(__file__))
SCRAPPED_URLS = []


def find_assets(link):
    '''Find assets'''
    # Cap the dedup cache so it cannot grow without bound.
    if len(SCRAPPED_URLS) > 5000:
        SCRAPPED_URLS.clear()
    assets = database.fetchall(
        'SELECT id, asset, criticity from patrowl_assets')
    if assets:
        html_source = crawler.get_source(link)
        for asset in assets:
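# SCRAPPED_URLS above acts as a crude dedup cache that is wiped once it passes
# 5000 entries; a set keeps the same cap while making membership checks O(1)
# (a sketch, not the original implementation):
SEEN_URLS = set()


def remember(url):
    if len(SEEN_URLS) > 5000:
        SEEN_URLS.clear()
    SEEN_URLS.add(url)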