def execute(self):
    myScraper = Scraper(self.url, self.matchingDict)
    result = myScraper.scrape()
    if self.target is None:
        return result
    else:
        self.target(result, self.url)
def create_job():
    worker = Scraper(FREQUENCY, TAB, UNIT, FINAL_YEAR)
    while True:
        item = q.get()
        worker.do_work(item)
        print(item + ' is downloaded | ' + str(q.qsize()) + ' item(s) left')
        q.task_done()
def save_info_from(href, data_dir):
    # initialize child destination
    scrap = Scraper(href)
    dest = scrap.create_destination()
    dest.children_href = scrap.get_children()

    # check if we have already crawled this area
    OBJECT_OUTFILE = data_dir + dest.nickname + '.pickle'
    if os.path.exists(OBJECT_OUTFILE):
        print dest.nickname + ' has already been crawled'
    else:
        if not os.path.isdir(os.path.dirname(OBJECT_OUTFILE)):
            os.makedirs(os.path.dirname(OBJECT_OUTFILE))

        # traverse tree of areas --> routes; returns a Destination object
        all_dest = traverse(dest)

        # write the whole tree out to JSON (e.g. for visualization)
        BIG_JSON = data_dir + dest.nickname + '.json'
        with open(BIG_JSON, 'w+') as dump:
            flat = json.dumps(all_dest, default=lambda o: o.__dict__)
            dump.write(flat)

        # save destination object as pickle
        BIG_PICKLE = data_dir + dest.nickname + '.pickle'
        with open(BIG_PICKLE, 'wb') as handle:
            pickle.dump(all_dest, handle)

        flourish = '<<<' + '-' * 25
        print flourish + dest.nickname + flourish[::-1]
        print
def main():
    uid = str(uuid.uuid4())

    print "Creating Scraper() instance ..."
    scraper = Scraper(uid)
    scraper.run()

    print "Running tests ..."
    # typelink()
    test_typelink(scraper)
    # checkmatch()
    test_checkmatch(scraper)
    # getpagelinks()
    test_getpagelinks(scraper)
    # followlinks()
    test_followlinks(scraper)
    # get scraper status
    test_getstatus(scraper)

    scraper.stop()
    print "Done."
def scrape():
    scraper = Scraper(**get_creds())

    # Fetch usage info re: boosters.
    le = UsageDataPoint(
        time=datetime.datetime.utcnow(),
        **scraper.fetch_booster_usage()
    )
    db_session.add(le)
    yield le

    # Fetch latest transactions and put these in the DB,
    # but only if we don't already have them.
    for transaction in scraper.fetch_most_recent_transactions():
        existing = KoodoTransaction \
            .query \
            .filter_by(koodo_id=transaction['koodo_id']) \
            .first()
        if not existing:
            kt = KoodoTransaction(**transaction)
            db_session.add(kt)
            yield kt

    db_session.commit()
def scrape(request, tvdb_id):
    """
    Takes a scrape request, constructs a Scraper object and performs a scrape
    for the show if it hasn't been scraped before, or hasn't been scraped
    within the last :math:`x` days (where :math:`x` is the number of days
    specified by RESCRAPE_AFTER). Otherwise, if the show exists and has been
    scraped within the last :math:`x` days, redirect to the appropriate show
    page.

    :param request: A scrape request object.
    :param tvdb_id: The id of the TV show to be scraped (or shown).
    :return: An HttpResponse object containing the page of the show requested.
    """
    # Determine if the show already exists in the datastore
    q = TVShow.get_by_key_name(tvdb_id)

    if users.is_current_user_admin() and 'force' in request.GET and request.GET['force'] == '1':
        Scraper(tvdb_id, rescrape=True, options=q.options)
        return HttpResponseRedirect('/show/{0}'.format(q.url_string))

    # Check if the show has been scraped before and if that scrape was in the
    # last x days specified by RESCRAPE_AFTER
    if q and q.last_scraped > datetime.now() - timedelta(days=RESCRAPE_AFTER):
        url_slug = q.url_string
    else:
        # If scraping is switched on then scrape the show
        if settings.SCRAPING:
            s = Scraper(tvdb_id)
            url_slug = s.get_url_slug()
        else:
            url_slug = tvdb_id

    return HttpResponseRedirect('/show/{0}'.format(url_slug))
def test_find_docs():
    declare_test_start('find_docs')

    url_data = {
        'url_id': 1,
        'target_url': 'http://timduffy.me/',
        'max_link_level': 6,
        'creation_date_time': str(datetime.datetime.now()),
        'doc_type': 'application/pdf',
        'dispatch_datetime': str(datetime.datetime.now()),
        'allowed_domains': [],
    }

    uid = str(uuid.uuid4())
    scraper = Scraper(uid)
    scraper.set_url_data(url_data)
    docs = scraper.find_docs()

    print '[ TEST ] {0}'.format(json.dumps(scraper.status))
    print '[ TEST ] {0}'.format(json.dumps(docs))

    passed = False
    if len(docs) > 0:
        passed = True

    declare_test_end(passed)
def main(request):
    username = request.POST['username']
    password = request.POST['password']

    scraper = Scraper(username, password)
    data = scraper.scrap(True)

    return HttpResponse(obj2json(data), mimetype='application/json')
def __init__(self, base_url=awards_base_url, search_url=""):
    Scraper.__init__(self, base_url, search_url)
    self.file = open('academy_awards.csv', 'wb')
    self.writer = csv.writer(self.file, delimiter='\t')
    self.writer.writerow(['Year', 'Category', 'Won', 'FilmName', 'ActorDirectorName'])
    self.soup = self.connect(base_url)
    self.next_record = '1'
def traverse(node):
    """ Pre-order depth-first search of the Mountain Project tree """
    children = []
    for href in node.children_href:
        # initialize Scraper for this page
        scrap = Scraper(href)
        if scrap.soup is not None:
            # grab features from the soup
            dest = scrap.create_destination()
            # find children in the soup, if any
            dest.children_href = scrap.get_children()
            # recurse deeper down the tree if this is an area
            if dest.children_href is not None:
                print
                print '**' + dest.nickname + '**'
                traverse(dest)
            # inner traverse call has returned with a destination object
            print dest.nickname + ' | ' + dest.href
            children.append(dest)
    node.children = children
    return node
def testExtractTag(self): pattern = "<a name='$name'></a>" _scraper = Scraper(pattern) exp = BeautifulSoup(pattern) # one attribute actual = BeautifulSoup("<a name='abc'></a>") self.assertEqual('abc', _scraper.extractTag(exp.contents[0], actual.contents[0])['name']) # one attribute actual = BeautifulSoup("<a name='abc' age='27'></a>") self.assertEqual('abc', _scraper.extractTag(exp.contents[0], actual.contents[0])['name']) # two attributes pattern = "<a name='$name' age='$age'></a>" exp = BeautifulSoup(pattern) actual = BeautifulSoup("<a name='abc' age='27'></a>") ret = _scraper.extractTag(exp.contents[0], actual.contents[0]) self.assertEqual(2, len(ret)) self.assertEqual('abc', ret['name']) self.assertEqual('27', ret['age']) # get attribute from sub tag pattern = "<a><b name='$name'></b></a>" exp = BeautifulSoup(pattern) # one attribute actual = BeautifulSoup("<a><b name='abc'></b></a>") self.assertEqual('abc', _scraper.extractTag(exp.contents[0], actual.contents[0])['name'])
def testExtractText(self):
    pattern = "<a>$text</a>"
    _scraper = Scraper(pattern)
    exp = BeautifulSoup(pattern)

    # one text node
    actual = BeautifulSoup("<a>hello world</a>")
    self.assertEqual('hello world', _scraper.extractText(exp.contents[0], actual.contents[0])['text'])
def scrape_all(root_href, data_dir):
    """ Scrape Mountain Project and save Destination objects """
    scrap = Scraper(root_href)
    # iterate over children of the root (e.g. states in the US)
    for href in scrap.get_children():
        save_info_from(href, data_dir)
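# Hypothetical invocation sketch for scrape_all() above (not part of the original
# project): the root URL and the output directory are illustrative assumptions.
if __name__ == '__main__':
    ROOT_HREF = 'https://www.mountainproject.com/route-guide'  # assumed entry point
    DATA_DIR = './data/'                                        # assumed output directory
    scrape_all(ROOT_HREF, DATA_DIR)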
def run(self):
    try:
        print "y"
        s = Scraper('en')
        # s.getCategory(self.cat)
        s.getGameList(self.cat, endPage=0)
    except:
        print "n"
def testExtract(self):
    pattern = "<a name='$name'>$text</a>"
    _scraper = Scraper(pattern)
    exp = BeautifulSoup(pattern)

    # text directly inside the tag
    actual = BeautifulSoup("<a name='abc'>hello world</a>")
    ret = _scraper.extract(actual.contents[0])
    self.assertEqual('hello world', ret['text'])
def main():
    print "ENTER THE YOSPOS"
    print "Using " + sys.argv[1] + " @ threadid: " + sys.argv[3]

    new_scraper = Scraper(sys.argv[1], sys.argv[2], sys.argv[3], [])
    new_scraper.scrape_thread()

    new_message = raw_input("Reply: ")
    new_poster = Poster(sys.argv[1], sys.argv[2], sys.argv[3])
    new_poster.make_post(new_message)
def __init__(self):
    Scraper.__init__(self)
    api_key = self.config["youtube"]["api_key"]
    self.url = "https://www.googleapis.com/youtube/v3/search"
    self.params = {
        "order": "date",
        "maxResults": 10,
        "channelId": "UCH1dpzjCEiGAt8CXkryhkZg",
        "key": api_key,
        "type": "upload",
        "part": "snippet"
    }
def doador_2004(cnpj_ou_cpf):
    u'''
    Returns a table of the donations made by this person (cnpj_ou_cpf).
    The table is a list of lists, each one containing the fields listed in
    "doador_2004.campos".

    >>> tabela = doador_2004('85.907.012/0001-57')
    >>> tabela is not None
    True
    >>> len(tabela)
    16
    >>> len(tabela[0]) == len(doador_2004.campos)
    True

    URL: http://www.tse.gov.br/internet/eleicoes/2004/prest_blank.htm
    '''
    pessoa = pessoa_or_valueerror(cnpj_ou_cpf)
    scraper = Scraper()
    url = 'http://www.tse.gov.br/sadEleicao2004Prestacao/spce/index.jsp'
    scraper.open(url)
    scraper.browser.select_form(name='formDoador')
    scraper.browser.form.find_control(name='nome').readonly = False
    scraper.browser.form.find_control(name='numero').readonly = False
    scraper.browser.form['numero'] = pessoa.plain()
    scraper.browser.form['nome'] = '%'
    try:
        scraper.submit()
    except:
        return None
    if not scraper.html.find(text=regexp('Valor Total de Fornecimento')):
        return None
    table = scraper.html.findAll('table')[-1]
    lines = []
    for tr in table.findAll('tr')[1:-1]:
        columns = []
        for td in tr.findAll('td'):
            try:
                contents = td.b.contents
            except:
                contents = td.contents
            content = ' '.join(contents).strip()
            text = html2unicode(content)
            columns.append(text)
        lines.append(columns)
    return lines
def scrapestruct(self, context):
    pc = context['pc']
    rowscrape = pc['dom_row_pattern']
    blockstr = context['blockstr']
    soupdoc = CustomizedSoup(blockstr)
    scraper = Scraper(rowscrape)
    results = scraper.match(soupdoc)
    if len(results) == 0:
        # TBD: the scraper needs to be improved
        raise Exception("0 ITEMS SCRAPED WARNING")
    count = min(len(results), 10)
    items = results[0:count]
    eitems = map(lambda i: scraper.extract(i), items)
    context['items'] = eitems
def performUpdate():
    scr = Scraper()
    try:
        for competitorListChunk in scr.scrape():
            for competitor in competitorListChunk:
                try:
                    Contestant.update_completed_count(competitor.username.lower(),
                                                      competitor.completedCount)
                except Exception as e:
                    print("ERR: Username most likely not found in spreadsheet {}. {}".format(
                        competitor.username, str(e)))
    except Exception:
        return
def testMatchByType(self):
    # test a simple tag
    pattern = "<a></a>"
    _scraper = Scraper(pattern)
    exp = BeautifulSoup(pattern)

    # same type
    actual = BeautifulSoup("<a></a>")
    self.assertTrue(_scraper.matchByType(exp.contents[0], actual.contents[0]))

    # different type
    actual = BeautifulSoup("text")
    self.assertFalse(_scraper.matchByType(exp.contents[0], actual.contents[0]))
def get(self): global total_data, crawl_count, crawled if crawl_count >= DEPTH_LIMIT: return False crawled.add(self.url) data = self.fetch() if data and data != bytearray(b' '): if total_data > CONTENT_LIMIT: return False total_data += len(data) crawl_count += 1 webserver.save(self.url, self.root, self.type, data) s = Scraper(data, self.console) if self.type not in ["JS", "CSS"]: #css css_links = s.get_css() for link in css_links: if link: c = Crawler(link, self, "CSS", self.console) if c.url not in crawled: c.get() else: pass if self.type not in ["JS", "CSS"]: #js js_links = s.get_script() self.console.print(js_links) for link in js_links: if link: c = Crawler(link, self, "JS", self.console) if c.url not in crawled: c.get() else: pass # hrefs if self.type == "HTML": links = s.get_links() for link in links: if link: c = Crawler(link, self, "HTML", self.console) if c.url not in crawled: c.get() else: pass
def analyze(): """ Analyze text from a given URL """ url = request.form.get("url", "").strip() use_reducer = not ("noreduce" in request.form) dump_forest = "dump" in request.form metadata = None # Single sentence (True) or contiguous text from URL (False)? single = False keep_trees = False t0 = time.time() if url.startswith("http:") or url.startswith("https:"): # Scrape the URL, tokenize the text content and return the token list metadata, generator = process_url(url) toklist = list(generator) # If this is an already scraped URL, keep the parse trees and update # the database with the new parse keep_trees = Scraper.is_known_url(url) else: # Tokenize the text entered as-is and return the token list # In this case, there's no metadata toklist = list(tokenize(url)) single = True tok_time = time.time() - t0 t0 = time.time() # result = profile(parse, toklist, single, use_reducer, dump_forest) result, trees = parse(toklist, single, use_reducer, dump_forest, keep_trees) # Add a name register to the result create_name_register(result) parse_time = time.time() - t0 if keep_trees: # Save a new parse result if Settings.DEBUG: print("Storing a new parse tree for url {0}".format(url)) Scraper.store_parse(url, result, trees) result["metadata"] = metadata result["tok_time"] = tok_time result["parse_time"] = parse_time # Return the tokens as a JSON structure to the client return jsonify(result = result)
def scrape():
    """ Sets up the scraper to scrape HN and Reddit. """
    app.logger.info("Scraping Reddit")
    scrape_reddit = Scraper(app.logger)
    scrape_reddit.gather_reddit_data()

    app.logger.info("Finished gathering data, inserting into DB")
    scrape_reddit.insert_into_db()

    app.logger.info("Finished inserting into DB, sleeping for %d minutes..." %
                    (SCRAPING_INTERVAL / 60.0))
    threading.Timer(SCRAPING_INTERVAL, scrape).start()
def start_threads_for_letter(startLetter):
    outFn = "voters_" + str(startLetter) + ".txt"
    outFile = open(outFn, 'w')
    print("Getting records starting with " + startLetter)

    scp = Scraper(conf.sessionHeaders, conf.searchHeaders)
    scp.setup_session([conf.baseUrl, conf.rollSearchUrl])

    url = conf.searchUrl
    params = conf.searchParams
    params['electorName'] = startLetter
    scp.get_and_write_records(url, 0, params, outFile)
def __init__(self):
    Scraper.__init__(self)
    self.url = "https://go.berniesanders.com/page/event/search_results"
    self.html = HTMLParser()
    self.params = {
        'orderby': 'zip_radius',
        'zip_radius[1]': '6000',
        'zip_radius[0]': '78218',
        'radius_unit': 'mi',
        'country': 'US',
        'format': 'json'
    }
    self.map = {
        "id": "original_id",
        "start_dt": "start_time"
    }
def writeindexhtm(self):
    text = Scraper.getIndexhtm()
    if not os.path.isfile(self.path() + 'index.htm'):
        with open(self.path() + 'index.htm', 'w') as f:
            f.write(Patcher.patchindexhtm(text))
    else:
        print 'index.htm already saved'
def getacclist(self, facade, accountList, token, step, allofit):
    logging.info(" --> STEP-" + str(step))
    Scraper.getacclist(self, facade, accountList, token)
    self.wipe_pages()
    self.prepParse(allofit, facade, token)
    page = self.HexToByte(allofit['body'])
    scrape_result = self.gettheAccounts(facade, accountList, token, step, allofit, page)
    if scrape_result != 'good' and scrape_result != "got list":
        logging.warning("Got result - " + scrape_result)
    self.flush_pages()
    return scrape_result
def write_Slist(self):
    text2 = Scraper.getSlist()
    if not os.path.isfile(self.path() + 'resources\\slist.txt'):
        with open(self.path() + 'resources\\slist.txt', 'w') as f:
            f.write(text2)
    else:
        print 'slist.txt already downloaded'
def writeddixml(self):
    text = Scraper.getddixml()
    if not os.path.isfile(self.path() + 'ddi.xml'):
        with open(self.path() + 'ddi.xml', 'w') as f:
            f.write(text)
    else:
        print 'ddi.xml already saved'
class CachedTweets: scraper = Scraper( twitter_authentication=TwitterAuthentication.autodetect_twitter_auth()) @staticmethod @lru_cache(maxsize=None) def tweets_small(): # type: () -> Dict[str, List[TweetResult]] """Return a static list of 9 tweets that is generated once and re-used throughout the module's lifetime.""" return CachedTweets.scraper.scrape_terms( {"flood", "fire", "house fire"}, count=3) @staticmethod @lru_cache(maxsize=None) def tweets_small_no_retweets(): # type: () -> Dict[str, List[TweetResult]] """Return a static list of 9 non-retweet tweets that is generated once and re-used throughout the module's lifetime.""" return CachedTweets.scraper.scrape_terms( {"flood", "fire", "house fire"}, count=3, include_retweets=False) @staticmethod @lru_cache(maxsize=None) def tweets_small_geo(): # type: () -> Dict[str, List[TweetResult]] """ Return a static list of 9 tweets geotagged 20 miles from Chicago's center that is generated once and re-used throughout the module's lifetime. """ return CachedTweets.scraper.scrape_terms( {"flood", "fire", "house fire"}, count=3, geocode=geobox_to_geocode(GEOBOX_CHICAGO, "20mi")) @staticmethod @lru_cache(maxsize=None) def tweets_medium(): # type: () -> Dict[str, List[TweetResult]] """Return a static list of 60 tweets that is generated once and re-used throughout the module's lifetime.""" return CachedTweets.scraper.scrape_terms( {"flood", "fire", "house fire"}, count=20) @staticmethod @lru_cache(maxsize=None) def tweets_medium_no_retweets(): # type: () -> Dict[str, List[TweetResult]] """Return a static list of 60 non-retweet tweets that is generated once and re-used throughout the module's lifetime.""" return CachedTweets.scraper.scrape_terms( {"flood", "fire", "house fire"}, count=20, include_retweets=False) @staticmethod @lru_cache(maxsize=None) def tweets_medium_geo(): # type: () -> Dict[str, List[TweetResult]] """ Return a static list of 60 tweets geotagged 20 miles from Chicago's center that is generated once and re-used throughout the module's lifetime. """ return CachedTweets.scraper.scrape_terms( {"flood", "fire", "house fire"}, count=20, geocode=geobox_to_geocode(GEOBOX_CHICAGO, "20mi")) @staticmethod @lru_cache(maxsize=None) def tweets_large(): # type: () -> Dict[str, List[TweetResult]] """Return a static list of 300 tweets that is generated once and re-used throughout the module's lifetime.""" return CachedTweets.scraper.scrape_terms( {"flood", "fire", "house fire"}, count=100) @staticmethod @lru_cache(maxsize=None) def tweets_large_geo(): # type: () -> Dict[str, List[TweetResult]] """Return a static list of 300 tweets geotagged 20 miles from Chicago's center that is generated once and re-used throughout the module's lifetime.""" return CachedTweets.scraper.scrape_terms( {"flood", "fire", "house fire"}, count=100, geocode=geobox_to_geocode(GEOBOX_CHICAGO, "20mi"))
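# Hypothetical usage sketch for CachedTweets above (not part of the original
# module): because every method is wrapped in @lru_cache(maxsize=None) and takes
# no arguments, repeated calls return the exact same cached result instead of
# re-querying Twitter. The import path is an assumption.
from cached_tweets import CachedTweets

first = CachedTweets.tweets_small()
second = CachedTweets.tweets_small()
assert first is second  # served from the cache, no second scrape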
def callback(ch, method, properties, body):
    url = json.loads(body)['url']
    scraper = Scraper()
    result = scraper.scrape(url)
    publish_result(result)
def salva(url_veiculo): xpath_categoria = "//*[contains(@class, 'categoria')]/a//text()" xpath_marca = "//*[contains(@class, 'marca')]/text()" xpath_modelo = "//*[contains(@class, 'modelo')]/text()" http = urllib3.PoolManager() xpath_especs = "//*[@id=\"dados-veiculo\"]/ul[1]/li/span/text()" xpath_preco = "//*[@id=\"dados-veiculo\"]/div/span/text()" xpath_opcionais = "//*[contains(@class, 'opcionais')]//li/text()" scraper = Scraper() print("Xpath Categoria...") resultados = scraper.pega_lista(url_veiculo, xpath_categoria) v = Veiculo o = Opcionais if (len(resultados) > 0): #se nao possuir categoria v.categoria = resultados[0] else: v.categoria = "Sem Categoria" print("Xpath Marca...") resultados = scraper.pega_lista(url_veiculo, xpath_marca) try: m = Marca.objects.create(nome=resultados[0]) m.save() except: m = Marca.objects.get(nome=resultados[0]) print("Xpath modelo...") resultados = scraper.pega_lista(url_veiculo, xpath_modelo) v.modelo = resultados[0] print("Xpath Especificos...") resultados = scraper.pega_lista(url_veiculo, xpath_especs) if (len(resultados) > 3): v.ano_modelo = formata_ano_modelo(resultados[0]) v.cor = resultados[1] v.km = resultados[2] v.combustivel = resultados[3] elif (len(resultados) == 3): v.ano_modelo = formata_ano_modelo(resultados[0]) v.cor = resultados[1] v.km = -1 v.combustivel = resultados[2] print("Xpath Preco...") resultados = scraper.pega_lista(url_veiculo, xpath_preco) r = resultados[0].replace("R$ ", "") r = r.replace(",00", "") print("preco: ", r) if "Consulte" in r: r = None v.preco = r else: v.preco = r marca = Marca.objects.get(nome=m.nome) print("Xpath opcionais...") resultados = scraper.pega_lista(url_veiculo, xpath_opcionais) for resultado in resultados: resultado = resultado.encode('utf-8') try: o = Opcionais.objects.create(nome=resultado) o.save() except: print("") v_hash = marca.nome + v.categoria + v.modelo + v.cor + v.ano_modelo + str( v.km) + v.combustivel + str(v.preco) + ''.join(resultados) v_hash = hashlib.md5(v_hash.encode('utf-8')) obj_hash = str(v_hash.hexdigest()) try: v = Veiculo.objects.create(vei_pk=obj_hash, marca=marca, categoria=v.categoria, modelo=v.modelo, cor=v.cor, ano_modelo=v.ano_modelo, km=v.km, combustivel=v.combustivel, preco=v.preco) v.save() except: print("Veiculo ja existente no BD") for resultado in resultados: try: vop = VeiculoOpcionais.objects.create( veiculo=v, opcionais=Opcionais.objects.get(nome=resultado)) vop.save() except: pass print("Terminado.")
def setUp(self) -> None:
    self.s = Scraper(database="testscraperdb.cnf", logger=self.logger)
    url = 'https://www.chewy.com/s?rh=c%3A288%2Cc%3A332&page=1'
    self.s._enter_update_time_and_count(self.s._get_total_food_count(url))
from scraper import Scraper

stats_scraper = Scraper()
games = stats_scraper.get_games()

print games[0].home()
print games[0].away()
print stats_scraper.get_teams_rest(0)
def test_credentials_valid():
    s = Scraper()
    assert s.verify_chinesepod_credentials()
class Quiz: colors = { "normal": "\033[1;37;40m", "fire": "\033[1;31;40m", "water": "\033[1;34;40m", "grass": "\033[1;32;40m", "flying": "\033[1;37;40m", "fighting": "\033[1;31;40m", "poison": "\033[1;35;40m", "electric": "\033[1;33;40m", "ground": "\033[1;33;40m", "rock": "\033[1;32;40m", "psychic": "\033[1;35;40m", "bug": "\033[1;32;40m", "ghost": "\033[1;35;40m", "dark": "\033[1;37;40m", "steel": "\033[1;37;40m", "ice": "\033[1;36;40m", "dragon": "\033[1;34;40m", "fairy": "\033[1;33;40m", "correct": "\033[1;32;40m", "incorrect": "\033[1;31;40m", "reset": "\033[0m" } def __init__(self, show_types, all_answers): self.scraped_pokemon = [] self.show_types = show_types self.all_answers = all_answers self.scraper = Scraper() self.national_dex = self.scraper.set_national_dex() def run(self): self.rules() self.setup() correct_answer = self.question() self.ending(correct_answer) def setup(self): # TODO: currently not using the most recent generation until pokemon.com/pokedex is updated # replace the following two lines when it is # poke_id = random.randint(1,len(self.national_dex)) poke_id = random.randint(1,809) self.answer_pokemon = self.find_from_scraped(poke_id) if not self.answer_pokemon: self.answer_pokemon = self.scraper.fetch_pokemon_data(self.national_dex[poke_id-1]) self.scraped_pokemon.append(self.answer_pokemon) def find_from_scraped(self, id): for poke in self.scraped_pokemon: if poke.id == id: return poke return None def rules(self): print("") print("{}Welcome trainer, to the Pokemon Types Quiz!{}".format(self.colors["normal"], self.colors["reset"])) print("You will be presented with a pokemon's name{}, and must guess what type{} effective against it".format((" and types" if self.show_types else ""), ("s are" if self.all_answers else " is"))) def question(self): print("") print("Pokemon's name: {}{}{}".format(self.colors["normal"], self.answer_pokemon.name.title(), self.colors["reset"])) if self.show_types: types = [] for type in self.answer_pokemon.types: types.append(self.colors[type] + type + self.colors["reset"]) print("Pokemon's types(s): {}".format(" & ".join(types))) print("What super effective against {}{}{}".format(self.colors["normal"], self.answer_pokemon.name.title(), self.colors["reset"])) if self.all_answers: print("Enter all possible answers, seperated by a comma and a space (e.g. \"fire, water, bug\")") answer = self.validate_answer() else: answer = self.validate_answer("Enter a single pokemon type: ") return self.check_answer(answer) def validate_answer(self, msg = ""): answer = "" print("(for a list of types, enter \"help\")") while answer is "": answer = input(msg).lower() if answer == "help": self.list_types() answer = "" print("What super effective against {}{}{}".format(self.colors["normal"], self.answer_pokemon.name.title(), self.colors["reset"])) return answer def check_answer(self, answer): if self.all_answers: return self.answer_pokemon.matches_all_weaknesses(answer.split(", ")) else: return self.answer_pokemon.is_weak_to(answer) def list_types(self): types = [] for type in Pokemon.all_types: types.append(self.colors[type] + type + self.colors["reset"]) print("All types: {}".format(", ".join(types))) def ending(self, correct_answer): print() if correct_answer: print(self.colors["correct"] + "Congratulations, you are correct!" + self.colors["reset"]) else: if self.all_answers: print(self.colors["incorrect"] + "Sorry, that was not all of the pokemon's weaknesses. 
They were:" + self.colors["reset"]) else: print(self.colors["incorrect"] + "Sorry, that is not in the list of the pokemon's weaknesses. They were:" + self.colors["reset"]) weaknesses = [] for weakness in self.answer_pokemon.weaknesses: weaknesses.append(self.colors[weakness] + weakness + self.colors["reset"]) print("Pokemon's weaknesses: {}".format(", ".join(weaknesses))) replay = input("Would you like to play again? (y/n): ").lower() while not replay in ["yes", "y", "no", "n"]: print("I don't understand that input") replay = input("Would you like to play again? (y/n): ").lower() if replay in ["yes", "y"]: self.run() else: print("Thank you for playing, goodbye!")
def __init__(self, show_types, all_answers):
    self.scraped_pokemon = []
    self.show_types = show_types
    self.all_answers = all_answers
    self.scraper = Scraper()
    self.national_dex = self.scraper.set_national_dex()
from scraper import Scraper

if __name__ == '__main__':
    scraper = Scraper()
    proxies = scraper.scrape(protocol='SSL')
    # qsize is a method: call it so the loop stops once the queue is drained
    # (the bare bound method is always truthy and would loop forever)
    while proxies.qsize():
        proxy = proxies.get()
        print(proxy)
from flask_restful import Resource, Api from app import app, db from scraper import Scraper api = Api(app) scraper = Scraper(db) scraper.activate() # Scraping on diff thread for faster launchtime import models, resources class Home(Resource): def get(self): jobs = models.Job.query.all() job_schema = models.JobSchema(many=True) output = job_schema.dump(jobs).data return output # @app.route('/path/<path:subpath>') # def show_subpath(subpath): # # show the subpath after /path/ # return 'Subpath %s' % escape(subpath) api.add_resource(Home, '/') api.add_resource(resources.UserRegistration, '/register') api.add_resource(resources.UserLogin, '/login') api.add_resource(resources.ValidateToken, '/validate') if __name__ == '__main__':
def update_data_redis():
    s = Scraper()
    countries, continent = s.scraping()
    r.set('countries', json.dumps(countries))
    r.set('continents', json.dumps(continent))
    r.set('total', json.dumps(len(countries) + len(continent)))
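# Hypothetical read-back sketch (not in the original): the keys mirror those
# written by update_data_redis() above, and `r` is assumed to be the same
# redis client instance.
def read_data_redis():
    countries = json.loads(r.get('countries'))
    continents = json.loads(r.get('continents'))
    total = json.loads(r.get('total'))
    return countries, continents, total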
class Gui: def __init__(self, root, address): # initialize root and title self.root = root self.root.resizable(0, 0) self.root.title('Weather') # initialize base canvas self.canvas = tk.Canvas(self.root, width=1000, height=500, bg='light blue') self.canvas.pack() # initialize address self.address = address # determine if address was loaded from file if self.address is None: # if none was loaded initialize entry UI self.init_entry_ui() else: # otherwise load weather UI self.init_weather_ui() # initialize address entry UI def init_entry_ui(self): # clear canvas self.clear_canvas() # reset address self.address = None # create prompt text self.prompt_text = self.canvas.create_text(500, 35, text='Enter Address Below:') # create address entry text box self.address_entry_box = tk.Text(self.canvas, bg='white') self.address_entry_box.place(width=600, height=200, x=200, y=50) # create find weather button self.init_weather_button = tk.Button(self.canvas, text='Get Weather', bg='white', command=self.init_weather_ui) self.init_weather_button.place(width=300, height=100, x=350, y=300) # initialize weather UI def init_weather_ui(self): # get address from text entry box self.address = self.address_entry_box.get( '1.0', 'end-1c') if self.address is None else self.address # clear canvas self.clear_canvas() # set up scraper self.web_scraper = Scraper(self.address) # create text objects to display location and weather self.location_text = self.canvas.create_text( 500, 100, text=self.web_scraper.get_location()) if self.web_scraper.get_location() != 'Error: Invalid Address': self.forecast_text = self.canvas.create_text( 500, 115, text=self.web_scraper.get_forecast()) self.temp_c_text = self.canvas.create_text( 500, 130, text=self.web_scraper.get_temp_f()) self.temp_f_text = self.canvas.create_text( 500, 145, text=self.web_scraper.get_temp_c()) # initialize reset button self.reset_button = tk.Button(self.canvas, text='Reset Slot', bg='white', command=self.init_entry_ui) self.reset_button.place(width=300, height=100, x=175, y=300) #initialize refresh button self.refresh_button = tk.Button(self.canvas, text='Refresh Forecast', bg='white', command=self.refresh_weather) self.refresh_button.place(width=300, height=100, x=525, y=300) # clear canvas def clear_canvas(self): # destroy all widgets for widget in self.canvas.winfo_children(): widget.destroy() # destroy text created by weather UI state try: self.canvas.delete(self.location_text) self.canvas.delete(self.forecast_text) self.canvas.delete(self.temp_c_text) self.canvas.delete(self.temp_f_text) except AttributeError: pass # destroy text created by entry UI state try: self.canvas.delete(self.prompt_text) except AttributeError: pass #refresh weather screen def refresh_weather(self): self.clear_canvas() self.init_weather_ui() #get address function def get_address(self): return self.address
class TestScraper(TestCase): @classmethod def setUpClass(cls) -> None: cls.logger = VerboseScraperLogger() def setUp(self) -> None: self.s = Scraper(database="testscraperdb.cnf", logger=self.logger) url = 'https://www.chewy.com/s?rh=c%3A288%2Cc%3A332&page=1' self.s._enter_update_time_and_count(self.s._get_total_food_count(url)) def tearDown(self) -> None: self.s.engine.dispose() def test_scrape_search_results(self): # url for search results containing only 4 foods url = "https://www.chewy.com/s?rh=c%3A288%2Cc%3A332%2Cbrand_facet%3AAdirondack" # dont use number after final /dp/ - corresponds to size of product and doesn't reliable return the # same size; doesn't matter for scraper since we're not looking at price per pound, etc. expected_jobs = { ("https://www.chewy.com/adirondack-30-high-fat-puppy/dp", self.s.scrape_food_if_new), ("https://www.chewy.com/adirondack-26-adult-active-recipe-dry/dp", self.s.scrape_food_if_new), ("https://www.chewy.com/adirondack-large-breed-recipe-dry-dog/dp", self.s.scrape_food_if_new), ("https://www.chewy.com/adirondack-21-adult-everyday-recipe/dp", self.s.scrape_food_if_new) } self.s.scrape_search_results(url) generated_jobs = set() while not self.s.scrape_queue.empty(): job = self.s.scrape_queue.get() job = (job[0].rsplit('/', 1)[0], job[1]) generated_jobs.add(job) self.assertEqual(expected_jobs, generated_jobs) def test__scrape_food_details(self): url1 = "https://www.chewy.com/earthborn-holistic-great-plains-feast/dp/36412" food1, diets1 = self.s._scrape_food_details(url1) self.assertEqual(51256, food1.item_num) self.assertEqual( "Earthborn Holistic Great Plains Feast Grain-Free Natural Dry Dog Food", food1.name) self.assertEqual( "https://www.chewy.com/earthborn-holistic-great-plains-feast/dp/36412", food1.url) test_ingredients1 = ( "Bison Meal, Peas, Pea Protein, Tapioca, Dried Egg, Canola Oil (preserved with Mixed " "Tocopherols), Beef Meal, Pacific Whiting Meal, Pea Starch, Chickpeas, Flaxseed, " "Alaska Pollock Meal, Natural Flavors, Pea Fiber, Blueberries, Cranberries, Apples, " "Carrots, Spinach, Salt, Potassium Chloride, Choline Chloride, DL-Methionine, " "L-Lysine, Taurine, L-Carnitine, Beta-Carotene, Vitamin A Supplement, Vitamin D3 " "Supplement, Vitamin E Supplement, Zinc Sulfate, Ferrous Sulfate, Niacin, Folic Acid, " "Biotin, Manganese Sulfate, Copper Sulfate, Calcium Pantothenate, Thiamine Mononitrate, " "Pyridoxine Hydrochloride, Riboflavin Supplement, L-Ascorbyl-2-Polyphosphate (source of " "Vitamin C), Zinc Proteinate, Manganese Proteinate, Copper Proteinate, Calcium Iodate, " "Sodium Selenite, Cobalt Carbonate, Vitamin B12 Supplement, Yucca Schidigera Extract, " "Rosemary Extract, Dried Enterococcus Faecium Fermentation Product, Dried " "Lactobacillus Casei Fermentation Product, Dried Lactobacillus Acidophilus " "Fermentation Product.") self.assertEqual(test_ingredients1, food1.ingredients) self.assertEqual("Earthborn Holistic", food1.brand) self.assertEqual(None, food1.xsm_breed) self.assertEqual(True, food1.sm_breed) self.assertEqual(True, food1.md_breed) self.assertEqual(True, food1.lg_breed) self.assertEqual(None, food1.xlg_breed) self.assertEqual("Dry Food", food1.food_form) self.assertEqual("Adult", food1.lifestage) self.assertEqual(False, food1.fda_guidelines) test_diets1 = ["Grain-Free", "Gluten Free"] self.assertEqual(test_diets1, diets1) url2 = "https://www.chewy.com/natural-balance-lid-limited/dp/104666" food2, diets2 = self.s._scrape_food_details(url2) self.assertEqual(76793, food2.item_num) self.assertEqual( "Natural Balance 
L.I.D. Limited Ingredient Diets Sweet Potato & Bison Formula Grain-Free Dry Dog Food", food2.name) self.assertEqual( "https://www.chewy.com/natural-balance-lid-limited/dp/104666", food2.url) test_ingredients2 = ( "Sweet Potatoes, Bison, Potato Protein, Pea Protein, Canola Oil (Preserved with " "Mixed Tocopherols), Dicalcium Phosphate, Natural Flavor, Salmon Oil (Preserved " "with Mixed Tocopherols), Potato Fiber, Salt, Calcium Carbonate, Flaxseed, " "DL-Methionine, Minerals (Zinc Amino Acid Chelate, Zinc Sulfate, Ferrous " "Sulfate, Iron Amino Acid Chelate, Copper Sulfate, Copper Amino Acid Chelate, " "Sodium Selenite, Manganese Sulfate, Manganese Amino Acid Chelate, Calcium " "Iodate), Vitamins (Vitamin E Supplement, Niacin, d-Calcium Pantothenate, " "Vitamin A Supplement, Riboflavin Supplement, Thiamine Mononitrate, Biotin, " "Vitamin B12 Supplement, Pyridoxine Hydrochloride, Vitamin D3 Supplement, " "Folic Acid), Choline Chloride, Taurine, Citric Acid (preservative), Mixed " "Tocopherols (preservative), Rosemary Extract.") self.assertEqual(test_ingredients2, food2.ingredients) self.assertEqual("Natural Balance", food2.brand) self.assertEqual(None, food2.xsm_breed) self.assertEqual(True, food2.sm_breed) self.assertEqual(True, food2.md_breed) self.assertEqual(True, food2.lg_breed) self.assertEqual(None, food2.xlg_breed) self.assertEqual("Dry Food", food2.food_form) self.assertEqual("Adult", food2.lifestage) self.assertEqual(False, food2.fda_guidelines) test_diets2 = [ "Sensitive Digestion", "Limited Ingredient Diet", "No Corn No Wheat No Soy", "Grain-Free" ] self.assertEqual(test_diets2, diets2) def test__enter_in_db(self): import time test_key = hash(time.localtime) test_url = str(test_key) + "/12345" test_name = str(test_key) test_diets = [ 'Sensitive Digestion', 'Limited Ingredient Diet', 'No Corn No Wheat No Soy', 'Grain-Free' ] test_food = Food(item_num=test_key, name=test_name, url=test_url, ingredients="None", brand="None", xsm_breed=False, sm_breed=False, md_breed=False, lg_breed=False, xlg_breed=False, food_form="None", lifestage="None", fda_guidelines=False) self.assertFalse(self.s._check_db_for_food(url=test_url)) self.s._enter_in_db(test_food, test_diets) self.assertTrue(self.s._check_db_for_food(url=test_url)) def test__enqueue_url(self): def dummy_function(): pass self.s._enqueue_url("www.test.com", dummy_function) url, func = self.s.scrape_queue.get() self.assertEqual("www.test.com", url) self.assertEqual(dummy_function, func) def test__check_db_for_food(self): self.assertTrue(self.s._check_db_for_food(url="www.test.com/1/54321")) self.assertFalse( self.s._check_db_for_food( url="this entry is not in the database/12345")) def test__check_ingredients(self): food1 = Food(ingredients="chicken, lentils, potatoes - this one's bad") food2 = Food(ingredients="just chicken in this food - its good!") food3 = Food( ingredients= "this food has good ingredients, vitamins and minerals, then sweet potatoes - ok!" 
) food1 = self.s._check_ingredients(food1) food2 = self.s._check_ingredients(food2) food3 = self.s._check_ingredients(food3) self.assertEqual(False, food1.fda_guidelines) self.assertEqual(True, food2.fda_guidelines) self.assertEqual(True, food3.fda_guidelines) def test__make_request(self): r1 = self.s._make_request("https://www.google.com/") r2 = self.s._make_request("https://www.google.com/notarealsite") self.assertEqual(200, r1.status_code) self.assertEqual(404, r2.status_code) def test__pages_of_results(self): results = self.s._pages_of_results( 'https://www.chewy.com/s?rh=c%3A288%2Cc%3A332&page=1') self.assertEqual(101, results) results = self.s._pages_of_results( 'https://www.chewy.com/s?rh=c%3A288%2Cc%3A332%2Cc%3A294') self.assertEqual(43, results) def test__new_total_count_greaterthan_last(self): new_total = 3600 self.assertFalse(self.s._new_total_count_greaterthan_last(new_total)) new_total = 3603 self.assertFalse(self.s._new_total_count_greaterthan_last(new_total)) new_total = 3604 self.assertTrue(self.s._new_total_count_greaterthan_last(new_total))
logger.setLevel(logging.INFO) file_handler = logging.FileHandler('app.log') stream_handler = logging.StreamHandler() file_handler.setFormatter(formatter) stream_handler.setFormatter(formatter) logger.addHandler(file_handler) logger.addHandler(stream_handler) target_class_names = 'post-6 page type-page status-publish hentry' target_id_name = 'post-6' html_element = 'article' target_url = 'http://bit.lk/index.php/category/announcement/page/1' scraper = Scraper() fetcher = DocumentFetcher() datetime_updater = DateTimeUpdater(fetcher, scraper) database = db_connection() factory = AnnouncementFactory() try: stored_collection = AnnouncementMapper(database).get_recent_announcements(factory) subscribers = SubscriberMapper(JsonAdapter('./subscribers.json')).get_all_subscribers(SubscriberFactory()) web_page = fetcher.fetch_document(target_url) web_collection = scraper \ .set_html_document(web_page) \ .extract_html(html_element, target_id_name, target_class_names) \ .get_announcements(AnnouncementFactory())
import numpy as np from yahooquery import Ticker import multiprocessing as mp ## start the nameserver pyro4-ns [options] from scraper import Scraper if __name__ == '__main__': ############################################# # 1. TO-DO: REPLACE THIS STATIC tickerName LIST WITH ONE THAT READS A CSV FILE FOR THE tickerNameS TickerNames = pd.read_csv('./src/tickerNames/TickerNames.csv') TickerNames = TickerNames.values ############################################ # Multiprocessing # processes = [] print("Fetching Data from Stocks") scraper = Scraper('ETH-USD') scraper.update() # for tickerName in TickerNames: # scraper = Scraper(tickerName[0]) # process = mp.Process(target=scraper.update, args=()) # process.start() # processes.append(process) # for process in processes: # process.join()
import datetime from scraper import Scraper from colorama import init, Fore, Back, Style best_rooms = ["Aula Be", "Aula Ve", "Aula Oe"] fine_rooms = ["Aula Ae", "Aula Pe", "Aula Ie", "Aula Qe", "Aula Ke"] shitty_rooms = ["Aula Ge", "Aula Ce", "Aula De", "Aula Je", "Aula Le", "Aula Se"] init() now = datetime.datetime.now() scraper = Scraper() data = scraper.get_room_list() free_room = [] for room in data: if room.is_free(now) : # print(room.name) free_room.append(room) print("AULE LIBERE ALLE ORE: " + str(now.time().hour) + ":" + str(now.time().minute)) for room in free_room:
def test_credentials_invalid():
    s = Scraper(credentials=('a', 'b'))
    assert not s.verify_chinesepod_credentials()
def get_query_info(url): try: scrape = Scraper(url) except (URLError, BadStatusLine) as err: print "url error: ", url, err return None scrape.move_to('<pre id="queryBodyText" class="cm-s-default">') sql_raw = scrape.pull_from_to('<code>', '</code>') sql = clean_sql(sql_raw) scrape.move_to('> created <span') ts = scrape.pull_from_to('title="', '" class="relativetime"') scrape.move_to('<div class="user-gravatar32">') if scrape.comes_before('<a href="/users', '</div>'): user_id = scrape.pull_from_to('<a href="/users/', '"><img ') scrape.move_to('<a href="/users/') user_name = scrape.pull_from_to('>', '</a>') else: user_id = None user_name = None return (ts, url, sql, user_id, user_name)
from flask import Flask, render_template from flask_app.forms import ScrapeSingleTermTestForm from config import FlaskConfig from scraper import Scraper from twitter import TwitterAuthentication current_folder = os.path.abspath(os.path.dirname(__file__)) static_folder = os.path.join(current_folder, 'static') template_folder = os.path.join(current_folder, 'templates') app = Flask(__name__, static_url_path='/static', template_folder=template_folder, static_folder=static_folder) app.debug = FlaskConfig.DEBUG scraper = Scraper(twitter_authentication=TwitterAuthentication.autodetect_twitter_auth()) @app.route('/') def index(): return render_template('index.html') @app.route("/success") def success(): return "Whatever you just tried worked. Congrats :)" @app.route('/scrape_term', methods=("GET", "POST")) def scrape_term(): form = ScrapeSingleTermTestForm()
def main(argv=sys.argv[1:]): """ Main method. Reads URL list and initializes scaper and observers """ if sys.version_info < (3, 8): sys.exit( f'Python minumum version required is 3.8. Current version is {sys.version_info}' ) # parse arguments parser = argparse.ArgumentParser( description= 'URLs health monitor. If no parameters passed is run with defaults.') parser.add_argument('-urlc', '--urlconfig', dest='urls', metavar='path/to/config.json', type=filePath, help='Path to urls to be watched.') parser.add_argument('-osurls', dest='osurls', metavar='path/to/osa_urls.txt', type=filePath, help='Path to urls to execute OS command') parser.add_argument('-apiurls', dest='apiurls', metavar='path/to/apia_urls.txt', type=filePath, help='Path to urls to execute API command') args = parser.parse_args(argv) # determ root folder based on current file location rootFolder = pathlib.Path(__file__).parent.parent # get default file names, if no arguments urlsFile = args.urls if args.urls else rootFolder.joinpath( Config.WATCHER_URLS) osurlsFile = args.osurls if args.osurls else rootFolder.joinpath( Config.OSACTION_URLS) apiurlsFile = args.apiurls if args.apiurls else rootFolder.joinpath( Config.APIACTION_URLS) with open(urlsFile) as ufile: urls = json.load(ufile) # transform json to objects of type WatcherConfig urlsObjects = [ WatcherConfig( url=url['url'], checkPeriodInSec=url['checkPeriodInSec'], requestParams=url.get('requestParams', {}), responseCodesHealthy=set(url['responseCodesHealthy']), responseBodyHealthyRegex=url.get('responseBodyHealthyRegex'), responseHeadersHealthyRegex=url.get('responseHeadersHealthyRegex')) for url in urls ] with open(osurlsFile) as ufile: # read all lines as set osurls = set(map(str.strip, ufile)) with open(apiurlsFile) as ufile: # read all lines as set apiurls = set(map(str.strip, ufile)) # init scraper for the URLs list scraper = Scraper(urlsObjects) # attach observers to receive notification on fail scraper.attach( OsAction(scope=osurls, name='Test-OSPing', cmd=('ping', '127.0.0.1', '-n', '1'), waitToComplete=True)) scraper.attach(ApiCall(scope=apiurls, name='Test-Sendemail')) # run scraper asyncio.run(scraper.run())
from scraper import Scraper from telegram_bot import TelegramBot import schedule import time import datetime MINUTES = 5 MONTH = ['2020-01'] NUM_CONSECUTIVE = 1 scraper = Scraper(appointment_type="Researcher") # possible options are ["Researcher", "BlueCard"] bot = TelegramBot() def get_timestamp(): return datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d %X") def get_message(appointments, consecutive=1): print(appointments) free_appointments = [] message = "" send_message = False for date in appointments: value = appointments[date] if len(value) > 0: for i in range(0, len(value)): timestamp = time.mktime(time.strptime(value[i], '%H:%M')) j = i + 1 while j < len(value): timestamp_next = time.mktime(time.strptime(value[j], '%H:%M')) diff = timestamp_next - timestamp if diff/60 <= 30 * (j - i): j = j+1
def getxactlist(self, facade, accountList, token, step, allofit): logging.info(" --> STEP-" + str(step)) Scraper.doscrape(self, facade, accountList, token) self.wipe_pages() scrape_result = "good" self.response = {} self.prepParse(allofit, facade, token) page = self.HexToByte(allofit['body']) # if its early steps login and get the accounts if step < 7: scrape_result = self.gettheAccounts(facade, accountList, token, step, allofit, page) if step == 4 or step == 5 or step == 6: # if that went well and we now have a list if scrape_result == "got list": # test that we have some account links acCount = 0 accounts = [] for ac in self.accountLinks: acName = self.accountLinks[acCount].string acLink = self.accountLinks[acCount]['href'] # simple lookup here as login sets up mayAccounts in NatWest acpair = self.myAccounts[acCount] # tha unique name defining this account to link on (eg accont number) acID = None # if we did detect an account number if acpair != None: logging.debug('got a pair') acID = acpair['num'] # then use this # now see if we hav this id in our list of accounts to get the correct path to add the xactions to theAccountPath = [] if acID in accountList: theAccountPath = accountList[acID] account = {} account['accountid'] = acID account['href'] = acLink account['path'] = theAccountPath account['synched'] = False account['type'] = acpair['type'] account['balance'] = acpair['bal'] accounts.append(account) else: logging.warn("N-B - account not in users list " + acName + ' - ' + acID) logging.info('-------------->' + str(acID)) logging.info('-------------->' + str(theAccountPath)) logging.info('-------------->' + str(acLink)) logging.info('-------------->' + str(acName)) acCount = acCount + 1 accounts[0]['synched'] = True self.response['accountlist'] = accounts # TODO - choose the right url - the first on the list self.response['url'] = self.ByteToHex(accounts[0]['href']) self.response['data'] = "" self.response['method'] = 'GET' # 7 or twelve cc or normal if accounts[0]['type'] == 'Credit': self.response['step'] = 12 else: self.response['step'] = 7 self.response['accountid'] = accounts[0]['accountid'] self.response['accountpath'] = accounts[0]['path'] scrape_result = "account list" elif scrape_result != "good": self.flush_pages() return scrape_result if step > 6: accounts = allofit['accountlist'] # detals of the currently synching account # account number - id etc acName = allofit['accountid'] theAccountPath = allofit['accountpath'][:] bankurl = self.HexToByte(allofit['bankurl']) logging.debug("PROCESSING - -- -> " + bankurl) logging.debug(" acName - " + acName + " acPath - " + str(theAccountPath)) actype = None acbal = 0 # first recieved next link towards accountlist for normal accounts if step == 7: logging.info('STEP 7') scrape_result = self.doStep7(page) # first recieved next link towards accountlist for cc accounts if step == 12: logging.info('STEP 12') scrape_result = self.doStep12(page) # actually got a transaction list page - so check for an all link if step == 20: # now find this account to get the extra bits of info result = self.doAllLink(page) # if we allready have all xacts as per ususal then trigger step 30 with this page if result == "allxacts": step = 30 else: logging.debug("need to get the page of all accounts") # copy all the response stuff if step > 5 and step < 30: self.response['accountid'] = acName self.response['accountpath'] = theAccountPath self.response['accountlist'] = accounts scrape_result = "account list" # actually got a transaction list page if step == 30: 
# now find this account to get the extra bits of info for ac in accounts: if ac['accountid'] == acName: acType = ac['type'] balance = ac['balance'] if acType == 'Credit': result = self._processCCAccount(page, theAccountPath, balance) else: result = self._processNormAccount(page, theAccountPath, balance) # good or bad move onto next account - good plan?? # if trying to grab the account detail pages and they return bad - move on as well # remember all the pre acc list return scrape_result = "account list" on success if step == 30 or (step > 5 and scrape_result != 'account list'): url = "" acind = 0 acit = 0 # now work out the next url for acc in accounts: if not acc['synched']: acind = acit url = acc['href'] break acit = acit + 1 self.response = {} self.response['data'] = "" self.response['accountid'] = "" self.response['accountpath'] = "" if url == "": self.response['url'] = "" self.response['method'] = 'END' else: self.response['url'] = self.ByteToHex(url) accounts[acind]['synched'] = True self.response['method'] = 'GET' self.response['accountid'] = accounts[acind]['accountid'] self.response['accountpath'] = accounts[acind]['path'] # 6 or twelve cc or normal if accounts[acind]['type'] == 'Credit': self.response['step'] = 12 else: self.response['step'] = 7 self.response['accountlist'] = accounts scrape_result = "got account" return scrape_result
def test_Al2O3_scraper(self):
    """Tests whether all 4 records for "Al2O3" are scraped correctly."""
    # Initialize a `scraper.Scraper` instance.
    scraper = Scraper()
    scraper.get_landing_page()

    # Is the landing page correct?
    self.assertIn('NIST-JANAF Thermochemical Tables', scraper.browser.title)

    # Enter "Al2O3" in the form, submit it.
    scraper.send_query('Al2O3')
    scraper.select_state()
    scraper.submit_query()

    # Get all records resulting from the above query
    query_records = scraper.all_query_records

    # Verify number of records scraped.
    self.assertEqual(len(query_records), 4)

    # Check if scraped data is OK.
    self.assertIn('aluminum_oxide__kappa', query_records)
    self.assertEqual(query_records['aluminum_oxide__alpha']['CAS'], '1344-28-1')
    self.assertEqual(query_records['aluminum_oxide__delta']['formula'], 'Al2O3')
    self.assertEqual(query_records['aluminum_oxide__gamma']['link'],
                     'http://kinetics.nist.gov/janaf/html/Al-098.txt')

    # Terminate the session cleanly.
    scraper.terminate_session()
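# Hypothetical non-test usage sketch (not part of the original suite): it follows
# the same call sequence the test above exercises, using only methods that appear
# in that test; the helper name and its formula argument are assumptions.
def fetch_records(formula):
    scraper = Scraper()
    scraper.get_landing_page()
    scraper.send_query(formula)
    scraper.select_state()
    scraper.submit_query()
    records = scraper.all_query_records
    scraper.terminate_session()
    return records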
def lambda_handler(event, context):
    scrape = Scraper()
    scrape.scrape()  # get info
    scrape.verify()  # sort info
    scrape.store()   # store info
    return {'statusCode': 200, 'body': json.dumps('Hello from Lambda!')}
def __init__(self, url, quality):
    self.url = url
    self.quality = quality
    self.scraper = Scraper(url, quality)
def download_single(self, obj):
    images = Scraper.getChapter(obj)
    return self.download_manga(images[0], images[1])
class LSASumarizer(): def __init__(self, language): dir_path = os.path.dirname(os.path.realpath(__file__)) + '/nltk_data/' nltk.data.path = [dir_path] self.stopwords = sw.words(language) if self.stopwords is None: self.stopwords = [] self.scraper = Scraper(language) self.language = language if language == INDONESIAN: factory = StemmerFactory() self.stemmer = factory.create_stemmer() else: self.stemmer = PorterStemmer() def _build_feature_matrix(self, documents, feature_type='frequency'): def _tokenize(sentence): return [ self.stemmer.stem(w.lower()) for w in word_tokenize(sentence) ] feature_type = feature_type.lower().strip() if feature_type == 'binary': vectorizer = CountVectorizer(binary=True, min_df=1, stop_words=self.stopwords, tokenizer=_tokenize, ngram_range=(1, 2)) elif feature_type == 'frequency': vectorizer = CountVectorizer(binary=False, min_df=1, analyzer='word', stop_words=self.stopwords, tokenizer=_tokenize, ngram_range=(1, 2)) elif feature_type == 'tfidf': vectorizer = TfidfVectorizer(min_df=1, stop_words=self.stopwords, tokenizer=_tokenize, ngram_range=(1, 2)) else: raise Exception( "Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'" ) feature_matrix = vectorizer.fit_transform(documents).astype(float) return vectorizer, feature_matrix def _low_rank_svd(self, matrix, singular_count=2): u, s, vt = svds(matrix, k=singular_count) return u, s, vt def _summarize(self, document, num_sentences=2, num_topics=1, feature_type='frequency', sv_threshold=0.5): sentences = sent_tokenize(document) vec, dt_matrix = self._build_feature_matrix(sentences, feature_type=feature_type) td_matrix = dt_matrix.transpose() td_matrix = td_matrix.multiply(td_matrix > 0) u, s, vt = self._low_rank_svd(td_matrix, singular_count=num_topics) min_sigma_value = max(s) * sv_threshold s[s < min_sigma_value] = 0 salience_scores = np.sqrt(np.dot(np.square(s), np.square(vt))) top_sentence_indices = salience_scores.argsort()[-num_sentences:][::-1] top_sentence_indices.sort() result = "" for index in top_sentence_indices: result += sentences[index] return result def summarize(self, query=None, size=2, text=None): suggested_query = None lang = None status = 0 if query: for ch in ['&', ':', '-', '+', '.', ',']: query = query.replace(ch, ' ') query = re.sub('[^ 0-9a-zA-Z]+', '', query) words = word_tokenize(query.lower()) filtered_words = [ word for word in words if word not in self.stopwords and word.isalnum() ] new_query = " ".join(filtered_words) suggested_query, status, lang = self.scraper.get_query(new_query) if status == -1: suggested_query, status, lang = self.scraper.get_query( new_query, isInverse=True) if status == -1: suggested_query, status, lang = self.scraper.get_query( new_query) text = text if text else self.scraper.get_intro_lang( suggested_query, lang) # remove formula notation and multiple spaces text = re.sub('{.+}', '', text) text = re.sub('\s+', ' ', text) if not text: if self.language == INDONESIAN: return "mohon maaf {q} tidak ditemukan".format(q=query) else: return "{q} not found".format(q=query) summary = self._summarize(text, size) if status == 0: return summary elif lang == self.language: if lang == INDONESIAN: return "mungkin maksud anda adalah {sq}\n{s}".format( sq=suggested_query, s=summary) else: return "maybe this is what you want {sq}\n{s}".format( sq=suggested_query, s=summary) else: return summary
class TextRankSummarizer(object): def __init__(self, language): dir_path = os.path.dirname(os.path.realpath(__file__)) + '/nltk_data/' nltk.data.path = [dir_path] self.stopwords = sw.words(language) self.scraper = Scraper(language) self.language = language if language == INDONESIAN: factory = StemmerFactory() self.stemmer = factory.create_stemmer() else: self.stemmer = PorterStemmer() def sentence_similarity(self, sentence1, sentence2): if self.stopwords is None: self.stopwords = [] sentence1 = [ self.stemmer.stem(w.lower()) for w in word_tokenize(sentence1) ] sentence2 = [ self.stemmer.stem(w.lower()) for w in word_tokenize(sentence2) ] all_words = list(set(sentence1 + sentence2)) vector1 = [0] * len(all_words) vector2 = [0] * len(all_words) for w in sentence1: if w in self.stopwords: continue vector1[all_words.index(w)] += 1 for w in sentence2: if w in self.stopwords: continue vector2[all_words.index(w)] += 1 return 1 - cosine_distance(vector1, vector2) def build_similarity_matrix(self, sentences): similarity_matrix = np.zeros((len(sentences), len(sentences))) for idx1 in range(len(sentences)): for idx2 in range(len(sentences)): if idx1 == idx2: continue similarity_matrix[idx1][idx2] = self.sentence_similarity( sentences[idx1], sentences[idx2]) for idx in range(len(similarity_matrix)): if similarity_matrix[idx].sum() != 0: similarity_matrix[idx] /= similarity_matrix[idx].sum() return similarity_matrix def page_rank(self, similarity_matrix, eps=0.0001, d=0.85): probs = np.ones(len(similarity_matrix)) / len(similarity_matrix) while True: new_probs = np.ones(len(similarity_matrix)) * (1 - d) / len( similarity_matrix) + d * similarity_matrix.T.dot(probs) delta = abs((new_probs - probs).sum()) if delta <= eps: return new_probs probs = new_probs def summarize(self, query=None, size=1, text=None): suggested_query = None lang = None status = 0 if query: for ch in ['&', ':', '-', '+', '.', ',']: query = query.replace(ch, ' ') words = word_tokenize(query.lower()) filtered_words = [ word for word in words if word not in self.stopwords and word.isalnum() ] new_query = " ".join(filtered_words) suggested_query, status, lang = self.scraper.get_query(new_query) if status == -1: suggested_query, status, lang = self.scraper.get_query( new_query, isInverse=True) if status == -1: suggested_query, status, lang = self.scraper.get_query( new_query) text = text if text else self.scraper.get_intro_lang( suggested_query, lang) # remove formula notation and multiple spaces text = re.sub('{.+}', '', text) text = re.sub('\s+', ' ', text) if not text: if self.language == INDONESIAN: return "mohon maaf {q} tidak ditemukan".format(q=query) else: return "{q} not found".format(q=query) sentences = sent_tokenize(text) similarity_matrix = self.build_similarity_matrix(sentences) sentence_ranks = self.page_rank(similarity_matrix) ranked_sentence_indexes = [ item[0] for item in sorted(enumerate(sentence_ranks), key=lambda item: -item[1]) ] selected_sentences = sorted(ranked_sentence_indexes[:size]) summary = itemgetter(*selected_sentences)(sentences) if isinstance(summary, tuple): if status == 0: return ' '.join(summary) elif lang == self.language: res = ' '.join(summary) if lang == INDONESIAN: return "mungkin maksud anda adalah {sq}\n{s}".format( sq=suggested_query, s=res) else: return "maybe this is what you want {sq}\n{s}".format( sq=suggested_query, s=res) else: return summary if status == 0: return summary elif lang == self.language: if lang == INDONESIAN: return "mungkin maksud anda adalah {sq}\n{s}".format( 
sq=suggested_query, s=summary) else: return "maybe this is what you want {sq}\n{s}".format( sq=suggested_query, s=summary) else: return summary
from dbconnector import DBConnector, DBExplorer from scraping_functions import extract_competition_information from scraper import Scraper import argparse # ========================================================================== # Functions # ========================================================================== # ========================================================================== # Params and definitions # ========================================================================== db_object = DBExplorer(data_base='laliga_scraping', table='results') game_ref_extractor = Scraper( url= 'https://resultados.as.com/resultados/futbol/primera/{season}/jornada/regular_a_{rnd}' ) results_extractor = Scraper( url="https://resultados.as.com{game_ref}/estadisticas/") configuration = LaLigaConfig() # ========================================================================== # Code # ========================================================================== if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("--season", help="Season to extract the data", default='2018_2019')
def setup_class(cls):
    url = 'http://books.toscrape.com/index.html'
    cls.site1 = Scraper(None)
    cls.site2 = Scraper(url)