class ScrappOlx:
    """Scrapes OLX classifieds: walks every category, paginates through
    each one, opens every record page and persists the parsed fields to
    the database via DataBaseClient."""

    # The "show phone" button / phone text share this locator.
    PHONE_XPATH = ('//div[contains(@class, "contact-button link-phone")]'
                   '/strong[contains(@class, "xx-large")]')

    def __init__(self):
        self.log = Logger().custom_logger()
        self.db_client = DataBaseClient()
        opts = Options()
        opts.log.level = "fatal"
        self.driver = webdriver.Firefox(executable_path=path_to_driver,
                                        options=opts)
        self.driver.implicitly_wait(60)
        self.wait = WebDriverWait(self.driver, 60)
        self.start_url = start_url

    def parse(self):
        """Open the start page and return a generator of category URLs."""
        self.driver.get(start_url)
        # BUG FIX: loggers have no .start(); use .info() (message typo fixed).
        self.log.info('Parser started at {}'.format(start_url))
        self.wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="homeShowAllAds"]')))
        self.all_caregories = self.driver.find_elements(
            By.XPATH,
            '//div[contains(@class, "subcategories-list")]/div'
            '/a[contains(@class, "inlblk")]')
        # Return links to all categories, dropping empty/placeholder hrefs.
        self.hrefs_to_categorys = (
            href
            for href in (a.get_attribute('href')
                         for a in self.all_caregories)
            if len(href) > 2)
        return self.hrefs_to_categorys

    def get_info_category(self):
        """Walk every category, paginate through it and parse each record."""
        hrefs = self.parse()
        for category_url in hrefs:
            self.first_page = True
            self.driver.get(category_url)
            self.log.info(f'Getting info from category {category_url}')
            self.max_page = self.driver.find_elements(
                By.XPATH,
                '//span[contains(@class, "item fleft")][last()]')[0].text
            for number in range(1, int(self.max_page) + 1):
                # Page 1 is already open; only navigate for later pages.
                if not self.first_page:
                    self.new_url = category_url + f'?page={number}'
                    self.driver.get(self.new_url)
                all_records_on_page = self.driver.find_elements(
                    By.XPATH,
                    '//tr[contains(@class, "wrap")]'
                    '//a[contains(@class, "linkWithHash detailsLink")]')
                href_to_records = [rec.get_attribute('href')
                                   for rec in all_records_on_page]
                self.first_page = False
                # Promoted ads appear twice; keep unique urls only.
                href_to_records = list(set(href_to_records))
                self.get_info_record(href_to_records)

    def _click_phone_and_read(self):
        """Click the 'show phone' button and return the revealed text.

        May raise Timeout/NoSuchElement/StaleElementReference exceptions,
        which the caller handles.
        """
        self.wait.until(EC.element_to_be_clickable(
            (By.XPATH, self.PHONE_XPATH))).click()
        return self.driver.find_element(By.XPATH, self.PHONE_XPATH).text

    def get_phone_number(self):
        """Return the seller's phone text for the current record, or a
        single space when no phone is published."""
        # Best effort: dismiss the cookie bar, which can cover the button.
        try:
            self.driver.find_element(
                By.XPATH,
                '//div[contains(@id, "cookiesBar")]'
                '/button[contains(@class, "cookiesBarClose")]').click()
        except Exception:
            pass
        try:
            phone = self._click_phone_and_read()
            test = search(r'\d+', phone)
        except (exceptions.TimeoutException,
                exceptions.NoSuchElementException):
            test = 1
            self.log.warning(f'No phone on record: {self.driver.current_url}')
            phone = ' '
        except exceptions.StaleElementReferenceException:
            # DOM went stale mid-interaction: reload the page and retry once.
            self.driver.refresh()
            try:
                phone = self._click_phone_and_read()
                test = search(r'\d+', phone)
            except (exceptions.TimeoutException,
                    exceptions.NoSuchElementException):
                test = 1
                self.log.warning(
                    f'No phone on record: {self.driver.current_url}')
                phone = ' '
        # The first click sometimes only reveals the "show phone" label;
        # keep clicking until digits appear.  BUG FIX: bounded attempts so
        # an unexpected label can no longer spin forever.
        attempts = 0
        while not test and attempts < 5:
            attempts += 1
            if phone == 'Показать телефон':
                self.wait.until(EC.element_to_be_clickable(
                    (By.XPATH, self.PHONE_XPATH))).click()
                phone = self.driver.find_element(
                    By.XPATH, '//strong[contains(@class, "xx-large")]').text
                test = search(r'\d+', phone)
        return phone

    def get_info_record(self, hrefs):
        """Open every record url in *hrefs*, extract its fields and merge
        them into the database (merge, so reruns update existing rows)."""
        for url in hrefs:
            self.log.info(f'Start parse record\n{url}')
            self.driver.get(url)
            self.wait.until(EC.element_to_be_clickable(
                (By.XPATH, '//span[contains(@class, "link inlblk")]')))
            try:
                # An //h3/strong banner marks an inactive (closed) record.
                # BUG FIX: original referenced bare `driver` (NameError).
                self.driver.find_element(By.XPATH, '//h3/strong').text
                is_record_active = False
            except Exception:
                is_record_active = True
            if not is_record_active:
                continue
            info = self.driver.find_elements(
                By.XPATH, '//a[contains(@class, "link nowrap")]/span')
            city = info[0].text.split(' ')[-1]
            try:
                record_categoty = (f'{info[1].text.replace(city, "")} --> '
                                   f'{info[2].text.replace(city, "")}')
            except Exception:
                # Some records only carry a single category level.
                record_categoty = f'{info[1].text.replace(city, "")}'
            title = self.driver.find_element(
                By.XPATH,
                '//div[contains(@class, "offer-titlebox")]/h1').text
            price = self.driver.find_element(
                By.XPATH, '//div[contains(@class, "pricelabel")]').text
            description = self.driver.find_element(
                By.XPATH, '//div[contains(@id, "textContent")]').text
            bottombar_items = self.driver.find_elements(
                By.XPATH,
                '//div[contains(@id, "offerbottombar")]/ul/li//strong')
            # Strip the Cyrillic 'в' ("at") from the publish timestamp.
            date_publish = bottombar_items[0].text.replace('в', '')
            views = bottombar_items[1].text
            number_record = bottombar_items[2].text
            name_user = self.driver.find_element(
                By.XPATH,
                '//div[contains(@class, "offer-user__actions")]/h4').text
            phone = self.get_phone_number()
            # BUG FIX: image_href was unbound (NameError below) whenever
            # the image lookup failed.
            image_href = None
            try:
                image_href = self.driver.find_element(
                    By.XPATH,
                    '//div[contains(@id, "descImage")]/img'
                ).get_attribute('src')
            except Exception as err:
                self.log.warning(f'Can not get image href: {err.args}')
            record_url = self.driver.current_url
            try:
                record = ModelRecord(number_record=number_record,
                                     record_categoty=record_categoty,
                                     title=title,
                                     price=price,
                                     description=description,
                                     date_publish=date_publish,
                                     views=views,
                                     name_user=name_user,
                                     phone=phone,
                                     image_href=image_href,
                                     record_url=record_url)
                self.db_client.session.merge(record)
                self.db_client.session.commit()
                self.log.info(f'Record {number_record} added to DB')
            except Exception as err:
                # BUG FIX: missing f-prefix left placeholders literal;
                # "nont" typo fixed.
                self.log.error(
                    f'Record {number_record} not added to DB {err.args}')

    def __del__(self):
        """Close the browser when the scraper is torn down."""
        # BUG FIX: `clsoe` typo raised AttributeError on teardown.
        self.driver.close()
        self.log.info('Scrapping end')
def footyBackTest(resultsURLTmpl, opts=sys.argv):
    """Back-test the betting model against historical results.

    For each league in the configured range map, replays every season's
    matches, marks each match with the model, finds the best available
    odds across the known bookmakers and, when the best home odds beat
    the model's break-even odds for that mark, records a home bet.  A
    BackTest CSV row is written per bet and a staked/won/yield summary
    is logged per league.

    :param resultsURLTmpl: format template taking (season, league) that
        resolves to a results CSV location.
    :param opts: command-line options (defaults to sys.argv).
    """
    (algoCfg, mailCfg) = getFootyConfig()
    rangeMap = algoCfg['rangeMap']
    seasons = algoCfg['seasons']
    log = Logger()
    (sm, rm) = getFootyOptions(log, opts)
    rangeMap = rm if rm else rangeMap

    # Bookmaker column prefixes present in the football-data CSVs; each
    # has <prefix>H/<prefix>D/<prefix>A odds columns.
    BOOKIES = ('B365', 'BW', 'IW', 'LB', 'WH', 'VC')

    def bestOdds(row, date, ht, at):
        """Return (bestH, bestD, bestA, bookie) across all bookmakers.

        `bookie` is the one quoting the best HOME odds (only home bets
        are active below).  Missing or blank columns for a bookmaker are
        logged and that bookmaker skipped, as in the original inline
        per-bookie try blocks.
        """
        bestH = bestD = bestA = 0
        bookie = ''
        for name in BOOKIES:
            try:
                h = float(row[name + 'H'])
                d = float(row[name + 'D'])
                a = float(row[name + 'A'])
            except (KeyError, TypeError, ValueError):
                log.error('No {} data - skipping : {} {} {}'.format(
                    name, date, ht, at))
                continue
            if h > bestH:
                bestH = h
                bookie = name
            # BUG FIX: the original never updated bestD/bestA (they were
            # stuck at 0), so the draw/away branches could never fire
            # even if re-enabled.
            if d > bestD:
                bestD = d
            if a > bestA:
                bestA = a
        return bestH, bestD, bestA, bookie

    for league in rangeMap.keys():
        summaryData = {}
        # Load per-mark result percentages and break-even odds produced
        # by makeFootyHistory.
        with readCSVFileAsDict('{}/{}/Summary.{}.csv'.format(
                analysisDir, league,
                model.__class__.__name__)) as summaryReader:
            for row in summaryReader:
                mark = int(row['Mark'])
                summaryData[mark] = {
                    'H': (float(row['%H']), float(row['HO'])),
                    'D': (float(row['%D']), float(row['DO'])),
                    'A': (float(row['%A']), float(row['AO']))
                }
        with newCSVFile(
                '{}/{}/BackTest.{}.csv'.format(
                    analysisDir, league, model.__class__.__name__),
                ['Date', 'HomeTeam', 'AwayTeam', 'Mark', 'Result', 'MyBet',
                 'MyOdds', 'Bookie', 'BookieOdds', 'Winnings', 'PnL',
                 'T_Stk', 'T_W', 'Yield']) as backTestWriter:
            ts = tw = y = 0
            for season in seasons:
                resultsURL = resultsURLTmpl.format(season, league)
                log.debug('Processing...{}'.format(resultsURL))
                with readCSVFileAsDict(resultsURL) as resultsReader:
                    # Materialise so both the model pass and the betting
                    # pass can iterate the same rows.
                    res = list(resultsReader)
                    data = model.processMatches(res)
                    for row in res:
                        date, ht, at, mark, hForm, aForm = model.markMatch(
                            data, row['Date'], row['HomeTeam'],
                            row['AwayTeam'])
                        if mark is None or mark not in rangeMap[league]:
                            continue
                        bestH, bestD, bestA, bookie = bestOdds(
                            row, date, ht, at)
                        hSD = summaryData[mark]['H']
                        aSD = summaryData[mark]['A']
                        dSD = summaryData[mark]['D']
                        myBet = ''
                        myOdds = 0.0
                        myPercent = 0.0
                        bookieOdds = 0.0
                        winnings = 0.0
                        pnl = 0.0
                        # Bet home when the best bookie odds beat the
                        # model's historical break-even odds for the mark.
                        if bestH > hSD[1]:
                            myBet = 'H'
                            myOdds = hSD[1]
                            myPercent = hSD[0]
                            bookieOdds = bestH
                            winnings = bookieOdds
                            pnl = winnings - 1
                        # Draw/away strategies are deliberately disabled
                        # (guarded by `False and ...`); kept for tuning.
                        if False and myPercent < dSD[0] and bestD > dSD[1]:
                            myBet = 'D'
                            myOdds = dSD[1]
                            myPercent = dSD[0]
                            bookieOdds = bestD
                            winnings = bookieOdds
                            pnl = winnings - 1
                        if False and myPercent < aSD[0] and bestA > aSD[1]:
                            myBet = 'A'
                            myOdds = aSD[1]
                            myPercent = aSD[0]
                            bookieOdds = bestA
                            winnings = bookieOdds
                            pnl = winnings - 1
                        matchResult = row['FTR']
                        if myBet != '':
                            if matchResult != myBet:
                                # Lost bet: stake gone.
                                winnings = 0.0
                                pnl = -1.0
                            ts += 1
                            tw += winnings
                            y = (tw - ts) / ts
                            backTestWriter.writerow(
                                (date, ht, at, mark, matchResult, myBet,
                                 myOdds, bookie, bookieOdds, winnings, pnl,
                                 ts, tw, y))
        log.info(
            '{:<5s} - Staked: GBP{:>6.2f} Won: GBP{:>6.2f} Yield: {:>6.2f}%'.
            format(league, ts, tw, y * 100))
def makeFootyHistory(resultsURLTmpl, opts=sys.argv):
    """Build per-league History, Summary and regression Stats CSVs.

    For every league: marks all matches across the configured seasons and
    writes (date, teams, mark, result) rows to a History CSV; aggregates
    result frequencies per mark into a Summary CSV (percentages plus
    implied odds); finally fits a linear regression of each outcome's
    percentage against the mark into a Stats CSV.

    :param resultsURLTmpl: format template taking (season, league) that
        resolves to a results CSV location.
    :param opts: command-line options (defaults to sys.argv).
    """
    log = Logger()
    getFootyOptions(log, opts)
    (algoCfg, mailCfg) = getFootyConfig()
    rangeMap = algoCfg['rangeMap']
    seasons = algoCfg['seasons']
    # NOTE: going back too far with the historical data seems to mess up
    # the results, presumably because league composition has changed
    # enough that newer and older season data don't play well together.
    log.info(__name__ + ' : ' + model.__class__.__name__)
    for league in rangeMap.keys():
        log.info('League : {}...'.format(league))
        os.makedirs('{}/{}'.format(analysisDir, league), exist_ok=True)
        summaryData = {}
        with newCSVFile('{}/{}/History.{}.csv'.format(
                analysisDir, league, model.__class__.__name__),
                ['Date', 'HomeTeam', 'AwayTeam', 'Mark', 'Result']) \
                as historyWriter:
            for season in seasons:
                resultsURL = resultsURLTmpl.format(season, league)
                log.debug('Processing...{}'.format(resultsURL))
                try:
                    with readCSVFileAsDict(resultsURL) as resultsReader:
                        # Materialise so the model pass and the history
                        # pass can iterate the same rows.
                        res = list(resultsReader)
                        data = model.processMatches(res)
                        for row in res:
                            try:
                                (date, ht, at, mark,
                                 hForm, aForm) = model.markMatch(
                                     data, row['Date'], row['HomeTeam'],
                                     row['AwayTeam'])
                            except KeyError:
                                continue
                            if mark is None or row['FTR'] == '':
                                continue
                            mark = int(mark)
                            matchResult = row['FTR'].strip()
                            historyWriter.writerow(
                                [date, ht, at, mark, matchResult])
                            tallies = summaryData.setdefault(
                                mark, {'A': 0, 'D': 0, 'H': 0})
                            tallies[matchResult] += 1
                except BaseException:
                    # Deliberate best-effort: a missing or malformed
                    # season file must not abort the whole league.
                    log.error(sys.exc_info()[0:1])
                    continue

        log.info('Writing summary data...')
        with newCSVFile('{}/{}/Summary.{}.csv'.format(
                analysisDir, league, model.__class__.__name__),
                ['Mark', 'Frequency',
                 '%H', 'HO', '%D', 'DO', '%A', 'AO']) \
                as summaryWriter:
            x = []
            hY = []
            dY = []
            aY = []
            hist = {}
            for mark in summaryData:
                # Marks beyond +/-15 are too sparse to be meaningful.
                if mark > 15 or mark < -15:
                    continue
                awayF = summaryData[mark]['A']
                drawF = summaryData[mark]['D']
                homeF = summaryData[mark]['H']
                totalF = awayF + drawF + homeF
                awayP = awayF / totalF * 100
                drawP = drawF / totalF * 100
                homeP = homeF / totalF * 100
                x.append(mark)
                hY.append(homeP)
                dY.append(drawP)
                aY.append(awayP)
                # Implied decimal odds; a zero percentage stays zero to
                # avoid dividing by zero.
                awayO = awayP if awayP == 0 else 100 / awayP
                drawO = drawP if drawP == 0 else 100 / drawP
                homeO = homeP if homeP == 0 else 100 / homeP
                hist[mark] = (homeF, homeP)
                summaryWriter.writerow([
                    mark, totalF, '{:>4.2f}'.format(homeP),
                    '{:>4.2f}'.format(homeO), '{:>4.2f}'.format(drawP),
                    '{:>4.2f}'.format(drawO), '{:>4.2f}'.format(awayP),
                    '{:>4.2f}'.format(awayO)
                ])
        # Log marks ordered by home-win frequency, most frequent first.
        s = ''.join(
            '{:d} ({:d} {:>5.2f}) '.format(mk, freq, pct)
            for mk, (freq, pct) in sorted(
                hist.items(), key=lambda kv: kv[1][0], reverse=True))
        log.info(s)

        with newCSVFile('{}/{}/Stats.{}.csv'.format(
                analysisDir, league, model.__class__.__name__),
                ['Result', 'Slope', 'Intercept', 'P', 'R', 'R^2', 'Err']) \
                as statsWriter:
            # One regression per outcome; H/D/A handling is identical,
            # so loop instead of triplicating the block.
            for label, code, ys in (('Home', 'H', hY),
                                    ('Draw', 'D', dY),
                                    ('Away', 'A', aY)):
                slope, intercept, r, p, stderr = stats.linregress(x, ys)
                r2 = r ** 2
                log.info(
                    '{}: {:>4.2f} {:>4.2f} {:>4.2} {:>4.2f} {:>4.2f} {:>4.2}'
                    .format(label, slope, intercept, p, r, r2, stderr))
                statsWriter.writerow([
                    code, '{:>4.2f}'.format(slope),
                    '{:>4.2f}'.format(intercept), '{:>4.2f}'.format(p),
                    '{:>4.2f}'.format(r), '{:>4.2f}'.format(r2),
                    '{:>4.2f}'.format(stderr)
                ])