Code example #1
    def __init__(self, tile_name, column_name, pairs):
        self.db = DbManager.get_instance()
        self.scraper = WebScraper()
        self.cursor = None
        self.logger = Logger()
        self.column_name = column_name
        self.tile_name = tile_name
        self.pairs = pairs
Code example #2
    def get_extra_player_info(player):
        page = WebScr.get_soup(player['link'])

        # Secondary position
        div = page.find('div', {'class': 'nebenpositionen'})
        if div:
            txt = div.text
            position_2 = txt[txt.find(':') + 1:].strip()
            if len(position_2) > 25:
                position_2 = position_2[:25].strip()
            player['position_2'] = position_2

        # Current club and any additional clubs
        table = page.find('table', {'class': 'auflistung'})
        rows = table.find_all('tr') if table else []
        for row in rows:
            if row.th is None or row.td is None:
                continue
            label = row.th.text.strip()
            if label == "Club actual:":
                player['team'] = row.td.a['id']
            elif label == "2do club:":
                player['team_2'] = row.td.a['id']
            elif label == "3er club:":
                player['team_3'] = row.td.a['id']
                logging.warning(
                    "Player with 3 clubs: {}".format(player['link']))

        return player
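
The 25-character cap on position_2 suggests the value is being fitted into a fixed-width database column; the same truncation appears again in code example #8.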
Code example #3
    def iterate_teams(self, extra_team_info=True, players=True, extra_player_info=True):
        self.timer.start()
        for id_league in self.leagues:
            teams = self.leagues[id_league]['teams']
            for id_team in teams:
                team_link = teams[id_team]['link']
                page = WebScr.get_soup(team_link)

                if extra_team_info:
                    img = page.find('img', {'alt': teams[id_team]['name']})
                    if img:
                        teams[id_team]['img'] = img['src'].replace(
                            "https", "http")

                # Guard against the squad-size counter being absent from the page
                span = page.find("span", {"class": "dataValue"})
                has_players = int(span.text.strip()) if span else 0

                if players and has_players:
                    self.get_players(page, id_league, extra_player_info)

                self.timer.add_done()
                self.timer.print_left()
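            # Checkpoint everything scraped so far after each league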
            tmp_output = {'leagues': self.leagues, 'players': self.players}
            with open('tmp_output/{}.json'.format(id_league), 'w') as f:
                json.dump(tmp_output, f, indent=4, sort_keys=True)
Code example #4
def run(start_page=1):
    all_blog_urls = []

    scraper = WebScraper()

    pages_number = scraper.get_pages_number()

    for page_no in range(start_page, pages_number + 1):
        blog_urls = scraper.get_blogs_from_page(page_no)
        all_blog_urls.extend(blog_urls)
        print("{}/{}".format(page_no, pages_number))  # progress: current page / last page

    with open("blog_urls_all.dat", "wb") as f:
        pickle.dump(all_blog_urls, f)

    print(len(all_blog_urls))
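
To load the saved URL list back in a later run, a minimal sketch using the same file name:

    import pickle

    with open("blog_urls_all.dat", "rb") as f:
        all_blog_urls = pickle.load(f)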
Code example #5
    def __init__(self, type):
        self.scraper = WebScraper()
        self.logger = Logger()
        self.db = DbManager.get_instance()

        if type == 'fill_database':
            self.__init_fill_db()
Code example #6
class Worker:
    def __init__(self, tile_name, column_name, pairs):
        self.db = DbManager.get_instance()
        self.scraper = WebScraper()
        self.cursor = None
        self.logger = Logger()
        self.column_name = column_name
        self.tile_name = tile_name
        self.pairs = pairs

    def start_working(self):
        for pair in self.pairs:
            if pair[0] in Settings.BANNED_PAIRS:
                continue

            # Resume support: skip pairs the database already marks as done
            done, pages, products = self.db.get_progress_for_pair(pair[0])
            if done:
                continue

            pair_total_products = self.scraper.get_total_products(pair[1])

            n_prods = 0
            coroutine = self.scraper.all_prods_in_url(pair[1], pages, products)
            self.db.execute_query(Query.init_progress_category.format(pair[0]))

            # Drain the scraper generator one (page, prods) batch at a time
            for page, prods in coroutine:

                n_prods += len(prods)

                # Persist each product; single quotes are doubled to escape
                # them inside the formatted SQL string
                for prod in prods:
                    self.db.execute_query(
                        Query.insert_products.format(
                            self.tile_name, prod.link,
                            prod.title.replace("'", "''"),
                            (0 if prod.old_price is None else prod.old_price),
                            prod.new_price, self.column_name, pair[0]))

                self.logger.committed_products(len(prods), pair[0], page)

                self.db.execute_query(
                    Query.update_progress_table.format(page, n_prods, pair[0]))
            self.db.execute_query(
                Query.update_progress_done_column.format(pair[0]))
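
A minimal usage sketch for the class above, assuming the project's DbManager, WebScraper, Logger, Settings, and Query objects are importable; the category id and URL in the pair are hypothetical placeholders:

    pairs = [('laptops', 'https://example.com/c/laptops')]  # hypothetical (id, url) pair
    worker = Worker('laptops', 'notebooks', pairs)
    worker.start_working()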
Code example #7
    def __init__(self):
        available_hosts = ['rpi1', 'rpi2']

        self.logger = Logger()
        self.scraper = WebScraper()
        self.db = DbManager.get_instance()
        depts = self.scraper.get_all_departments()
        workers = []

        for tile in depts:
            if good_table_name(tile) in Settings.BANNED_TILES:
                continue
            self.db.create_table(good_table_name(tile))
            for column in depts[tile]:
                if column in Settings.BANNED_COLUMNS:
                    continue

                worker = Worker(good_table_name(tile), column,
                                depts[tile][column])

                proc = multiprocessing.Process(target=worker.start_working)
                self.logger.starting_worker(tile, column)

                workers.append((worker, proc))
                proc.start()
                time.sleep(5)

                # Cap the pool at four concurrent workers: reap the first
                # finished process before starting another
                while len(workers) >= 4:
                    for w, p in workers:
                        p.join(timeout=0)
                        if not p.is_alive():
                            workers.remove((w, p))
                            break
                    else:
                        time.sleep(1)  # none finished yet; avoid a busy-wait

    def start_ssh_job(self, host, tile):
        timeout = 60 * 60 * 2  # intended two-hour job timeout (currently unused)

        process = subprocess.Popen([
            'ssh', host,
            '"python3 emag-scraping/process.py {}"'.format(tile)
        ])
        time.sleep(5)

        if process.poll() is not None:
            print('Done {}'.format(tile))
Code example #8
    def get_extra_player_info(player):
        page = WebScr.get_soup(player['link'])

        # Secondary position
        div = page.find('div', {'class': 'nebenpositionen'})
        if div:
            txt = div.text
            position_2 = txt[txt.find(':') + 1:].strip()
            if len(position_2) > 25:
                position_2 = position_2[:25].strip()
            player['position_2'] = position_2

        return player
Code example #9
    def iterate_teams(self, extra_team_info=True, players=True, extra_player_info=True):
        teams = self.teams
        for id_team in teams:
            team_link = teams[id_team]['link']
            nation = teams[id_team]['name'].split(' ')[0]
            page = WebScr.get_soup(team_link)

            if extra_team_info:
                img = page.find('img', {'alt': teams[id_team]['name']})
                if img:
                    teams[id_team]['img'] = img['src'].replace(
                        "https", "http")

            if players:
                self.get_players(page, id_team, nation, extra_player_info)
Code example #10
File: MediaSpotify.py Project: Aude11/Playlist
    def runMusic(self, query):
        if path.isfile('cookies.pkl'):
            # Reuse the saved session instead of logging in again
            with open("cookies.pkl", "rb") as f:
                cookies = pickle.load(f)
            driver = WebScraper(self.url).setWebdriver(False)
            for cookie in cookies:
                driver.add_cookie(cookie)
            driver.refresh()  # one refresh after all cookies are set
            play = self.search_playlist(driver, query)
        else:
            driver = self.log_into_spotify()
            play = self.search_playlist(driver, query)
        return play.click()
Code example #11
File: MediaSoundcloud.py Project: Aude11/Playlist
    def runMusic(self, query):
        driver = WebScraper(self.url).setWebdriver(True)
        search_box = driver.find_element_by_xpath(
            '//*[@id="content"]/div/div/div[2]/div/div[1]/span/span/form/input'
        )
        search_box.send_keys(query)
        time.sleep(2)
        driver.find_element_by_xpath(
            '//*[@id="content"]/div/div/div[2]/div/div[1]/span/span/form/button'
        ).click()
        time.sleep(2)
        play_button = driver.find_element_by_xpath(
            '//*[@id="content"]/div/div/div[3]/div/div/div/ul/li[1]/div/div/div/div[2]/div[1]/div/div/div[1]/a'
        )
        # WebElement.click() returns None, so there is no result to capture
        return play_button.click()
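
Note that the absolute XPath selectors are copied from the page's DOM, so any front-end layout change will break them; stable IDs or data attributes are preferable where the site exposes them.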
Code example #12
    def __init__(self):
        self.wait_for_market_open()
        # dictionary containing all of the day's information, stored at end of day
        self.record = {"date": str(datetime.date.today()),
                       "starting": None,
                       "ending": None,
                       "profit": None,
                       "stocks": []}
        # initialize WebScraper to get the top five gainers of the day
        ws = WebScraper()
        self.stocks = ws.stocks
        self.record["stocks"] = ws.stocks
        self.login()
        # keys: stocks; values: buying power allocated to each stock
        self.funds = {}
        self.start_funds = self.split_funds()
        # maps each stock to its share count once bought, None otherwise
        self.bought = {}
        for s in self.stocks:
            self.bought[s] = None
        self.trade()
        self.logout()
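
Note that instantiating this class runs the entire trading day as a side effect: it waits for the market to open, logs in, trades, and logs out before the constructor returns.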
Code example #13
    def get_rows_from_link(url):
        page = WebScr.get_soup(url)
        if page:
            return TransferMarkt.get_rows_from_page(page)
        # falls through and returns None when the page could not be fetched
Code example #14
File: MediaSpotify.py Project: Aude11/Playlist
    def log_into_spotify(self):
        self.get_login()
        driver = WebScraper(self.url).setWebdriver(False)
        # Dismiss the cookie-consent pop-up before touching the page
        cookies_pop = WebDriverWait(driver, 2).until(
            ec.presence_of_element_located(
                (By.XPATH, '//*[@id="onetrust-accept-btn-handler"]')))
        cookies_pop.click()
        driver.implicitly_wait(3)
        login_sign = driver.find_element_by_xpath(
            '//*[@id="main"]/div/div[2]/div[1]/header/div[5]/button[2]')
        login_sign.click()
        driver.implicitly_wait(3)
        username_field = driver.find_element_by_xpath(
            '//*[@id="login-username"]')
        username_field.send_keys(self.username)
        time.sleep(2)
        password_field = driver.find_element_by_xpath(
            '//*[@id="login-password"]')
        password_field.send_keys(self.password)
        time.sleep(2)
        button_login = driver.find_element_by_xpath('//*[@id="login-button"]')
        button_login.click()
        time.sleep(3)
        # Persist session cookies so later runs can skip this login flow
        with open("cookies.pkl", "wb") as f:
            pickle.dump(driver.get_cookies(), f)
        return driver
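
The cookies.pkl written on the final lines is the same file that runMusic in code example #10 checks for, allowing later runs to skip this login flow entirely.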