def __init__(self, tile_name, column_name, pairs):
    self.db = DbManager.get_instance()
    self.scraper = WebScraper()
    self.cursor = None
    self.logger = Logger()
    self.column_name = column_name
    self.tile_name = tile_name
    self.pairs = pairs
def get_extra_player_info(player):
    page = WebScr.get_soup(player['link'])

    # Secondary position
    div = page.find('div', {'class': 'nebenpositionen'})
    if div:
        txt = div.text
        position_2 = txt[txt.find(':') + 1:].strip()
        if len(position_2) > 25:
            position_2 = position_2[:25].strip()
        player['position_2'] = position_2

    # Second team
    table = page.find('table', {'class': 'auflistung'})
    rows = table.find_all('tr')
    for row in rows:
        if row.th.text.strip() == "Club actual:":
            team_1 = row.td.a['id']
            player['team'] = team_1
        if row.th.text.strip() == "2do club:":
            team_2 = row.td.a['id']
            player['team_2'] = team_2
        elif row.th.text.strip() == "3er club:":
            team_3 = row.td.a['id']
            player['team_3'] = team_3
            logging.warning(
                "Player with 3 clubs: {}".format(player['link']))
    return player
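# Several snippets here call WebScr.get_soup(), which is not defined in this
# collection. A minimal sketch of what such a helper could look like, assuming
# it wraps requests and BeautifulSoup; the real WebScr implementation may differ.
import requests
from bs4 import BeautifulSoup


class WebScr:

    @staticmethod
    def get_soup(url):
        # Fetch the page and return a parsed tree, or None on any HTTP error.
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException:
            return None
        return BeautifulSoup(response.text, 'html.parser')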
def iterate_teams(self, extra_team_info=True, players=True,
                  extra_player_info=True):
    self.timer.start()
    for id_league in self.leagues:
        teams = self.leagues[id_league]['teams']
        for id_team in teams:
            team_link = teams[id_team]['link']
            page = WebScr.get_soup(team_link)

            if extra_team_info:
                img = page.find('img', {'alt': teams[id_team]['name']})
                if img:
                    teams[id_team]['img'] = img['src'].replace(
                        "https", "http")

            has_players = int(
                page.find("span", {"class": "dataValue"}).text.strip())
            if players and has_players:
                self.get_players(page, id_league, extra_player_info)

            self.timer.add_done()
            self.timer.print_left()

        tmp_output = {'leagues': self.leagues, 'players': self.players}
        with open('tmp_output/{}.json'.format(id_league), 'w') as f:
            json.dump(tmp_output, f, indent=4, sort_keys=True)
def run(start_page=1):
    all_blog_urls = []
    scraper = WebScraper()
    pages_number = scraper.get_pages_number()

    for page_no in range(start_page, pages_number + 1):
        blog_urls = scraper.get_blogs_from_page(page_no)
        all_blog_urls.extend(blog_urls)
        print("{}/{}".format(page_no, pages_number))

    with open("blog_urls_all.dat", "wb") as f:
        pickle.dump(all_blog_urls, f)
    print(len(all_blog_urls))
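# run() above pickles the collected URLs to blog_urls_all.dat. Reading the list
# back later is a one-liner with pickle (a usage sketch, not part of the
# original code):
import pickle

with open("blog_urls_all.dat", "rb") as f:
    all_blog_urls = pickle.load(f)
print(len(all_blog_urls))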
def __init__(self, type):
    self.scraper = WebScraper()
    self.logger = Logger()
    self.db = DbManager.get_instance()

    if type == 'fill_database':
        self.__init_fill_db()
class Worker:
    def __init__(self, tile_name, column_name, pairs):
        self.db = DbManager.get_instance()
        self.scraper = WebScraper()
        self.cursor = None
        self.logger = Logger()
        self.column_name = column_name
        self.tile_name = tile_name
        self.pairs = pairs

    def start_working(self):
        for pair in self.pairs:
            if pair[0] in Settings.BANNED_PAIRS:
                continue

            # Resume from any progress already stored for this category.
            done, pages, products = self.db.get_progress_for_pair(pair[0])
            if done:
                continue

            pair_total_products = self.scraper.get_total_products(pair[1])
            n_prods = 0
            coroutine = self.scraper.all_prods_in_url(pair[1], pages, products)
            self.db.execute_query(Query.init_progress_category.format(pair[0]))

            # Drain the scraper coroutine one page of products at a time.
            while True:
                try:
                    page, prods = coroutine.send(None)
                except StopIteration:
                    break

                n_prods += len(prods)
                for prod in prods:
                    self.db.execute_query(
                        Query.insert_products.format(
                            self.tile_name, prod.link,
                            prod.title.replace("'", "''"),
                            (0 if prod.old_price is None else prod.old_price),
                            prod.new_price, self.column_name, pair[0]))
                self.logger.committed_products(len(prods), pair[0], page)
                self.db.execute_query(
                    Query.update_progress_table.format(page, n_prods, pair[0]))

            self.db.execute_query(
                Query.update_progress_done_column.format(pair[0]))
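# A hypothetical way to drive Worker directly. The table name, column name,
# and (category_id, category_url) pair below are made-up placeholders; the
# DbManager, WebScraper, Settings, and Query dependencies are assumed to be
# configured elsewhere.
pairs = [("laptops", "https://example.com/laptops")]
worker = Worker("electronics", "notebooks", pairs)
worker.start_working()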
def __init__(self):
    available_hosts = ['rpi1', 'rpi2']
    self.logger = Logger()
    self.scraper = WebScraper()
    self.db = DbManager.get_instance()
    depts = self.scraper.get_all_departments()
    workers = []

    for tile in depts:
        if good_table_name(tile) in Settings.BANNED_TILES:
            continue
        self.db.create_table(good_table_name(tile))

        for column in depts[tile]:
            if column in Settings.BANNED_COLUMNS:
                continue

            worker = Worker(good_table_name(tile), column, depts[tile][column])
            proc = multiprocessing.Process(target=worker.start_working)
            self.logger.starting_worker(tile, column)
            workers.append((worker, proc))
            proc.start()
            time.sleep(5)

            # Throttle: wait until fewer than four workers are still running
            # before starting the next one.
            while len(workers) >= 4:
                for w, p in workers:
                    p.join(timeout=0)
                    if not p.is_alive():
                        workers.remove((w, p))
                        break

def start_ssh_job(self, host, tile):
    timeout = 60 * 60 * 2  # intended two-hour limit (currently unused)
    process = subprocess.Popen([
        'ssh', host,
        '"python3 emag-scraping/process.py {}"'.format(tile)
    ])
    time.sleep(5)
    if process.poll() is not None:
        print('Done {}'.format(tile))
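# The __init__ above caps concurrency by polling join(timeout=0) on up to four
# worker processes. An alternative sketch of the same idea using
# multiprocessing.Pool, which handles the throttling itself; the job tuples
# below are placeholders, not the author's data.
import multiprocessing


def run_worker(args):
    tile, column, pairs = args
    Worker(tile, column, pairs).start_working()


if __name__ == "__main__":
    jobs = [("electronics", "notebooks",
             [("laptops", "https://example.com/laptops")])]
    with multiprocessing.Pool(processes=4) as pool:
        pool.map(run_worker, jobs)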
def get_extra_player_info(player):
    page = WebScr.get_soup(player['link'])

    # Secondary position
    div = page.find('div', {'class': 'nebenpositionen'})
    if div:
        txt = div.text
        position_2 = txt[txt.find(':') + 1:].strip()
        if len(position_2) > 25:
            position_2 = position_2[:25].strip()
        player['position_2'] = position_2

    return player
def iterate_teams(self, extra_team_info=True, players=True,
                  extra_player_info=True):
    teams = self.teams
    for id_team in teams:
        team_link = teams[id_team]['link']
        nation = teams[id_team]['name'].split(' ')[0]
        page = WebScr.get_soup(team_link)

        if extra_team_info:
            img = page.find('img', {'alt': teams[id_team]['name']})
            if img:
                teams[id_team]['img'] = img['src'].replace(
                    "https", "http")

        if players:
            self.get_players(page, id_team, nation, extra_player_info)
def runMusic(self, query):
    if path.isfile('cookies.pkl'):
        # Reuse the Spotify session cookies saved by log_into_spotify().
        with open("cookies.pkl", "rb") as f:
            cookies = pickle.load(f)
        driver = WebScraper(self.url).setWebdriver(False)
        for cookie in cookies:
            driver.add_cookie(cookie)
        driver.refresh()
    else:
        driver = self.log_into_spotify()
    play = self.search_playlist(driver, query)
    return play.click()
def runMusic(self, query):
    driver = WebScraper(self.url).setWebdriver(True)

    search_box = driver.find_element_by_xpath(
        '//*[@id="content"]/div/div/div[2]/div/div[1]/span/span/form/input')
    search_box.send_keys(query)
    time.sleep(2)

    driver.find_element_by_xpath(
        '//*[@id="content"]/div/div/div[2]/div/div[1]/span/span/form/button'
    ).click()
    time.sleep(2)

    play_button = driver.find_element_by_xpath(
        '//*[@id="content"]/div/div/div[3]/div/div/div/ul/li[1]/div/div/div/div[2]/div[1]/div/div/div[1]/a')
    music_play = play_button.click()
    return music_play
def __init__(self):
    self.wait_for_market_open()

    # dictionary containing all of the day's information, stored at end of day
    self.record = {"date": str(datetime.date.today()),
                   "starting": None,
                   "ending": None,
                   "profit": None,
                   "stocks": []}

    # initialize WebScraper to get top five gainers of the day
    ws = WebScraper()
    self.stocks = ws.stocks
    self.record["stocks"] = ws.stocks

    self.login()

    # keys: stocks, values: buying power allocated to stock
    self.funds = {}
    self.start_funds = self.split_funds()

    # per-stock holdings: number of shares if bought, None otherwise
    self.bought = {}
    for s in self.stocks:
        self.bought[s] = None

    self.trade()
    self.logout()
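# The comment above says the day's record is stored at end of day, but the
# storage step is not shown in this snippet. A minimal sketch, assuming one
# JSON file per trading day (the file layout is an assumption):
import json


def save_record(record):
    # e.g. record["date"] == "2024-01-02" -> 2024-01-02.json
    with open("{}.json".format(record["date"]), "w") as f:
        json.dump(record, f, indent=4)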
def get_rows_from_link(url):
    page = WebScr.get_soup(url)
    if page:
        return TransferMarkt.get_rows_from_page(page)
def log_into_spotify(self):
    self.get_login()
    driver = WebScraper(self.url).setWebdriver(False)

    # Accept the cookie-consent pop-up before interacting with the page.
    cookies_pop = WebDriverWait(driver, 2).until(
        ec.presence_of_element_located(
            (By.XPATH, '//*[@id="onetrust-accept-btn-handler"]')))
    cookies_pop.click()
    driver.implicitly_wait(3)

    login_sign = driver.find_element_by_xpath(
        '//*[@id="main"]/div/div[2]/div[1]/header/div[5]/button[2]')
    login_sign.click()
    driver.implicitly_wait(3)

    username_field = driver.find_element_by_xpath('//*[@id="login-username"]')
    username_field.send_keys(self.username)
    time.sleep(2)

    password_field = driver.find_element_by_xpath('//*[@id="login-password"]')
    password_field.send_keys(self.password)
    time.sleep(2)

    button_login = driver.find_element_by_xpath('//*[@id="login-button"]')
    button_login.click()
    time.sleep(3)

    # Persist the session cookies so later runs can skip the login flow.
    with open("cookies.pkl", "wb") as f:
        pickle.dump(driver.get_cookies(), f)
    return driver