Example #1
0
class ScrappOlx:
    """Firefox/Selenium scraper that walks category pages and stores ads.

    ``get_info_category`` is the entry point: it collects the category links
    from the start page (``parse``), pages through every category and hands
    the ad links found on each page to ``get_info_record``, which reads the
    ad fields and merges a ``ModelRecord`` into the database session.
    """

    # Seconds used for both the implicit wait and the explicit WebDriverWait.
    WAIT_SECONDS = 60

    _PHONE_BUTTON_XPATH = (
        '//div[contains(@class, "contact-button link-phone")]'
        '/strong[contains(@class, "xx-large")]')
    # Text shown by the phone element until the number has been revealed.
    _PHONE_PLACEHOLDER = 'Показать телефон'

    def __init__(self):
        """Create the logger, the DB client and the Firefox driver."""
        self.log = Logger().custom_logger()
        self.db_client = DataBaseClient()
        opts = Options()
        opts.log.level = "fatal"
        self.driver = webdriver.Firefox(executable_path=path_to_driver,
                                        options=opts)
        self.driver.implicitly_wait(self.WAIT_SECONDS)
        self.wait = WebDriverWait(self.driver, self.WAIT_SECONDS)

        self.start_url = start_url

    @staticmethod
    def _usable_hrefs(elements):
        """Return the ``href`` of every element that has a non-blank one.

        The values are read eagerly into a list: the elements become stale
        as soon as the driver navigates to another page.
        """
        hrefs = (element.get_attribute('href') for element in elements)
        return [href for href in hrefs if href and len(href) > 2]

    def parse(self):
        """Open the start page and return the links to all categories.

        Returns:
            list[str]: the category URLs found on the start page.
        """
        self.driver.get(self.start_url)
        self.log.start('Pareser started ad {}'.format(self.start_url))
        self.wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, '//*[@id="homeShowAllAds"]')))
        self.all_caregories = self.driver.find_elements(
            By.XPATH,
            '//div[contains(@class, "subcategories-list")]/div/a[contains(@class, "inlblk")]'
        )
        # Return the links to all categories.
        self.hrefs_to_categorys = self._usable_hrefs(self.all_caregories)
        return self.hrefs_to_categorys

    def _last_page_number(self):
        """Return the number of result pages of the category now displayed.

        Falls back to 1 when the page has no pager or its text is not a
        number, so a single-page category is still scraped.
        """
        pager_items = self.driver.find_elements(
            By.XPATH, '//span[contains(@class, "item fleft")][last()]')
        if not pager_items:
            return 1
        pager_text = pager_items[0].text
        try:
            return int(pager_text)
        except ValueError:
            self.log.warning(f'Unexpected pager text: {pager_text!r}')
            return 1

    def get_info_category(self):
        """Visit every category page by page and scrape the ads listed."""
        for category_url in self.parse():
            self.first_page = True
            self.driver.get(category_url)
            self.log.info(f'Getting info from category {category_url}')
            self.max_page = self._last_page_number()
            # Extend an existing query string instead of starting a second one.
            separator = '&' if '?' in category_url else '?'
            for number in range(1, self.max_page + 1):
                if not self.first_page:
                    self.new_url = f'{category_url}{separator}page={number}'
                    self.driver.get(self.new_url)
                all_records_on_page = self.driver.find_elements(
                    By.XPATH,
                    '//tr[contains(@class, "wrap")]//a[contains(@class, "linkWithHash detailsLink")]'
                )
                self.first_page = False
                # De-duplicate while keeping the order of the page.
                href_to_records = list(
                    dict.fromkeys(self._usable_hrefs(all_records_on_page)))
                self.get_info_record(href_to_records)

    def _dismiss_cookie_bar(self):
        """Close the cookie bar when it is shown; best effort only."""
        try:
            self.driver.find_element(
                By.XPATH,
                '//div[contains(@id, "cookiesBar")]/button[contains(@class, "cookiesBarClose")]'
            ).click()
        except exceptions.WebDriverException:
            # No bar (or it cannot be clicked): nothing to dismiss.
            pass

    def _reveal_phone(self):
        """Click the phone element and return the text it shows afterwards."""
        self.wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, self._PHONE_BUTTON_XPATH))).click()
        return self.driver.find_element(By.XPATH,
                                        self._PHONE_BUTTON_XPATH).text

    def get_phone_number(self):
        """Return the seller phone of the ad currently open in the driver.

        Returns:
            str: the revealed phone text, or ``' '`` when the ad has no
            phone element or no digits show up within the wait timeout.
        """
        self._dismiss_cookie_bar()
        try:
            try:
                phone = self._reveal_phone()
            except exceptions.StaleElementReferenceException:
                # The element was replaced under us: reload and retry once.
                self.driver.refresh()
                phone = self._reveal_phone()
        except (exceptions.TimeoutException,
                exceptions.NoSuchElementException):
            self.log.warning(f'No phone on record: {self.driver.current_url}')
            return ' '

        if search(r'\d+', phone):
            return phone

        def digits_shown(driver):
            # One polling step: click again while the placeholder is still
            # displayed, then re-read the text.
            nonlocal phone
            if phone == self._PHONE_PLACEHOLDER:
                self.wait.until(
                    EC.element_to_be_clickable(
                        (By.XPATH, self._PHONE_BUTTON_XPATH))).click()
            phone = driver.find_element(
                By.XPATH, '//strong[contains(@class, "xx-large")]').text
            return search(r'\d+', phone) is not None

        try:
            # Bounded by the wait timeout instead of looping forever.
            self.wait.until(digits_shown)
        except exceptions.TimeoutException:
            self.log.warning(f'No phone on record: {self.driver.current_url}')
            return ' '
        return phone

    def _is_record_active(self):
        """Return False when the page shows the ``//h3/strong`` banner.

        The banner is absent on active ads, so the implicit wait is switched
        off for the lookup; otherwise every active ad would block for the
        whole implicit-wait timeout.
        """
        self.driver.implicitly_wait(0)
        try:
            banner = self.driver.find_elements(By.XPATH, '//h3/strong')
        finally:
            self.driver.implicitly_wait(self.WAIT_SECONDS)
        return not banner

    def _read_record_fields(self):
        """Read the ad open in the driver into ``ModelRecord`` keyword args."""
        info = self.driver.find_elements(
            By.XPATH, '//a[contains(@class, "link nowrap")]/span')
        city = info[0].text.split(' ')[-1] if info else ''
        # Second and (when present) third breadcrumb, with the city removed.
        record_categoty = ' --> '.join(
            crumb.text.replace(city, '') for crumb in info[1:3])
        title = self.driver.find_element(
            By.XPATH, '//div[contains(@class, "offer-titlebox")]/h1').text
        price = self.driver.find_element(
            By.XPATH, '//div[contains(@class, "pricelabel")]').text
        description = self.driver.find_element(
            By.XPATH, '//div[contains(@id, "textContent")]').text

        bottombar_items = self.driver.find_elements(
            By.XPATH, '//div[contains(@id, "offerbottombar")]/ul/li//strong')
        date_publish = bottombar_items[0].text.replace('в', '')
        views = bottombar_items[1].text
        number_record = bottombar_items[2].text
        name_user = self.driver.find_element(
            By.XPATH,
            '//div[contains(@class, "offer-user__actions")]/h4').text
        phone = self.get_phone_number()

        # An ad without a picture is still stored, just without an image URL.
        image_href = None
        try:
            image_href = self.driver.find_element(
                By.XPATH,
                '//div[contains(@id, "descImage")]/img').get_attribute('src')
        except exceptions.WebDriverException as err:
            self.log.warning(f'Can not get image href: {err.args}')

        return {
            'number_record': number_record,
            'record_categoty': record_categoty,
            'title': title,
            'price': price,
            'description': description,
            'date_publish': date_publish,
            'views': views,
            'name_user': name_user,
            'phone': phone,
            'image_href': image_href,
            'record_url': self.driver.current_url,
        }

    def _save_record(self, fields):
        """Merge one record into the DB session and commit it."""
        number_record = fields['number_record']
        try:
            self.db_client.session.merge(ModelRecord(**fields))
            self.db_client.session.commit()
        except Exception as err:
            # Leave the session usable for the next record.
            self.db_client.session.rollback()
            self.log.error(
                f'Record {number_record} nont added to DB {err.args}')
        else:
            self.log.info(f'Record {number_record} added to DB')

    def get_info_record(self, hrefs):
        """Open every ad URL in *hrefs* and store the active ones.

        Args:
            hrefs: iterable of ad page URLs.
        """
        for record_href in hrefs:
            self.log.info(f'Start parse record\n{record_href}')
            self.driver.get(record_href)
            self.wait.until(
                EC.element_to_be_clickable(
                    (By.XPATH, '//span[contains(@class, "link inlblk")]')))
            if not self._is_record_active():
                self.log.info(f'Record is not active: {record_href}')
                continue
            self._save_record(self._read_record_fields())

    def __del__(self):
        """Shut the browser down; safe even if ``__init__`` did not finish."""
        driver = getattr(self, 'driver', None)
        if driver is not None:
            # quit() ends the whole WebDriver session, not just one window.
            driver.quit()
        log = getattr(self, 'log', None)
        if log is not None:
            log.info('Scrapping end')
Example #2
0
def _bestHomeOdds(row, log, date, ht, at):
    """Return ``(odds, bookie)`` for the best home-win price quoted in *row*.

    A bookmaker only counts when its home, draw and away prices all parse
    as floats; otherwise an error is logged and the bookmaker is ignored.
    ``(0, '')`` is returned when no bookmaker has usable prices.
    """
    bestH = 0
    bookie = ''
    for code in ('B365', 'BW', 'IW', 'LB', 'WH', 'VC'):
        try:
            homeOdds, _drawOdds, _awayOdds = [
                float(row[code + side]) for side in 'HDA'
            ]
        except (KeyError, TypeError, ValueError):
            log.error('No {} data - skipping : {} {} {}'.format(
                code, date, ht, at))
            continue
        if homeOdds > bestH:
            bestH = homeOdds
            bookie = code
    return bestH, bookie


def footyBackTest(resultsURLTmpl, opts=sys.argv):
    """Back-test the model's home-win odds against bookmaker prices.

    For every league in the range map the per-mark summary CSV is loaded,
    each season's results are marked by ``model`` and, for marks listed in
    the league's range, a one-unit home bet is recorded whenever the best
    bookmaker home price beats the summary's ``HO`` odds. Every considered
    match is written to ``BackTest.<model>.csv`` and the league totals are
    logged.

    Args:
        resultsURLTmpl: format string taking ``(season, league)`` and giving
            the location of a results CSV.
        opts: command-line style options passed on to ``getFootyOptions``.
    """
    (algoCfg, _mailCfg) = getFootyConfig()
    rangeMap = algoCfg['rangeMap']
    seasons = algoCfg['seasons']

    log = Logger()
    (_sm, rm) = getFootyOptions(log, opts)
    rangeMap = rm if rm else rangeMap

    for league in rangeMap:
        modelName = model.__class__.__name__
        summaryData = {}
        with readCSVFileAsDict('{}/{}/Summary.{}.csv'.format(
                analysisDir, league, modelName)) as summaryReader:
            for row in summaryReader:
                # mark -> result -> (percentage, odds)
                summaryData[int(row['Mark'])] = {
                    'H': (float(row['%H']), float(row['HO'])),
                    'D': (float(row['%D']), float(row['DO'])),
                    'A': (float(row['%A']), float(row['AO']))
                }
        with newCSVFile(
                '{}/{}/BackTest.{}.csv'.format(analysisDir, league,
                                               modelName),
            [
                'Date', 'HomeTeam', 'AwayTeam', 'Mark', 'Result', 'MyBet',
                'MyOdds', 'Bookie', 'BookieOdds', 'Winnings', 'PnL', 'T_Stk',
                'T_W', 'Yield'
            ]) as backTestWriter:
            # Running totals for the league: stakes, winnings and yield.
            ts = tw = y = 0
            for season in seasons:
                resultsURL = resultsURLTmpl.format(season, league)
                log.debug('Processing...{}'.format(resultsURL))
                with readCSVFileAsDict(resultsURL) as resultsReader:
                    # The rows are needed twice: once to build the model
                    # data and once to mark each match.
                    res = list(resultsReader)
                    data = model.processMatches(res)
                    for row in res:
                        date, ht, at, mark, _hForm, _aForm = model.markMatch(
                            data, row['Date'], row['HomeTeam'],
                            row['AwayTeam'])
                        if mark is None or mark not in rangeMap[league]:
                            continue

                        matchResult = row['FTR'].strip()
                        if matchResult == '':
                            # Without a result the bet cannot be settled.
                            continue

                        bestH, bookie = _bestHomeOdds(row, log, date, ht, at)
                        _homePercent, homeOdds = summaryData[mark]['H']

                        myBet = ''
                        myOdds = 0.0
                        bookieOdds = 0.0
                        winnings = 0.0
                        pnl = 0.0

                        # NOTE: only home bets are placed; the draw and away
                        # branches were hard-disabled in the original code.
                        if bestH > homeOdds:
                            myBet = 'H'
                            myOdds = homeOdds
                            bookieOdds = bestH
                            winnings = bookieOdds
                            pnl = winnings - 1

                        if myBet != '':
                            if matchResult != myBet:
                                winnings = 0.0
                                pnl = -1.0
                            ts += 1
                            tw += winnings
                            y = (tw - ts) / ts

                        backTestWriter.writerow(
                            (date, ht, at, mark, matchResult, myBet, myOdds,
                             bookie, bookieOdds, winnings, pnl, ts, tw, y))

        log.info(
            '{:<5s} - Staked: GBP{:>6.2f} Won: GBP{:>6.2f} Yield: {:>6.2f}%'.
            format(league, ts, tw, y * 100))
Example #3
0
def makeFootyHistory(resultsURLTmpl, opts=sys.argv):
    """Build the per-league history, summary and regression CSV files.

    For every league in the range map three files are written under
    ``analysisDir/<league>``:

    * ``History.<model>.csv`` - one row per marked match with its result;
    * ``Summary.<model>.csv`` - per mark (-15..15) the frequency, the
      percentage of home/draw/away results and the matching odds;
    * ``Stats.<model>.csv`` - a linear regression of each percentage
      against the mark.

    Args:
        resultsURLTmpl: format string taking ``(season, league)`` and giving
            the location of a results CSV.
        opts: command-line style options passed on to ``getFootyOptions``.
    """
    log = Logger()
    getFootyOptions(log, opts)

    (algoCfg, _mailCfg) = getFootyConfig()
    rangeMap = algoCfg['rangeMap']
    seasons = algoCfg['seasons']
    # Looks like if you go back too far with the historical data it starts
    # to mess up the results, I suspect this is because the league
    # composition has changed enough to mean that the newer and older season
    # data don't play well together...
    modelName = model.__class__.__name__
    log.info(__name__ + ' : ' + modelName)
    for league in rangeMap:
        log.info('League : {}...'.format(league))
        os.makedirs('{}/{}'.format(analysisDir, league), exist_ok=True)
        summaryData = {}
        with newCSVFile('{}/{}/History.{}.csv'.format(analysisDir, league,
                                                      modelName),
                        ['Date', 'HomeTeam', 'AwayTeam', 'Mark', 'Result']) \
                as historyWriter:
            for season in seasons:
                resultsURL = resultsURLTmpl.format(season, league)
                log.debug('Processing...{}'.format(resultsURL))
                try:
                    with readCSVFileAsDict(resultsURL) as resultsReader:
                        # The rows are needed twice: once to build the
                        # model data and once to mark each match.
                        res = list(resultsReader)
                        data = model.processMatches(res)
                        for row in res:
                            try:
                                date, ht, at, mark, _hForm, _aForm = \
                                    model.markMatch(data,
                                                    row['Date'],
                                                    row['HomeTeam'],
                                                    row['AwayTeam'])
                            except KeyError:
                                continue
                            if mark is None:
                                continue
                            matchResult = row['FTR'].strip()
                            if matchResult not in ('H', 'D', 'A'):
                                # Blank or unknown result: nothing to count.
                                continue
                            mark = int(mark)
                            historyWriter.writerow(
                                [date, ht, at, mark, matchResult])

                            if mark not in summaryData:
                                summaryData[mark] = {'A': 0, 'D': 0, 'H': 0}
                            summaryData[mark][matchResult] += 1
                except Exception as err:
                    # A broken season is skipped; Ctrl-C still gets through.
                    log.error('{} : {}: {}'.format(resultsURL,
                                                   type(err).__name__, err))
                    continue

        log.info('Writing summary data...')

        with newCSVFile('{}/{}/Summary.{}.csv'.format(analysisDir, league,
                                                      modelName),
                        ['Mark', 'Frequency', '%H', 'HO', '%D', 'DO', '%A',
                         'AO']) \
                as summaryWriter:
            x = []
            hY = []
            dY = []
            aY = []
            hist = {}
            for mark, counts in summaryData.items():
                if mark > 15 or mark < -15:
                    continue
                awayF = counts['A']
                drawF = counts['D']
                homeF = counts['H']

                totalF = awayF + drawF + homeF
                awayP = awayF / totalF * 100
                drawP = drawF / totalF * 100
                homeP = homeF / totalF * 100

                x.append(mark)
                hY.append(homeP)
                dY.append(drawP)
                aY.append(awayP)

                # Odds are 100 / percentage; 0 stands in for "never seen".
                awayO = awayP if awayP == 0 else 100 / awayP
                drawO = drawP if drawP == 0 else 100 / drawP
                homeO = homeP if homeP == 0 else 100 / homeP

                hist[mark] = (homeF, homeP)
                summaryWriter.writerow([
                    mark, totalF, '{:>4.2f}'.format(homeP),
                    '{:>4.2f}'.format(homeO), '{:>4.2f}'.format(drawP),
                    '{:>4.2f}'.format(drawO), '{:>4.2f}'.format(awayP),
                    '{:>4.2f}'.format(awayO)
                ])

        # Marks ordered by how often they produced a home win.
        byHomeWins = sorted(hist.items(),
                            key=lambda item: item[1][0],
                            reverse=True)
        log.info(''.join(
            '{:d} ({:d} {:>5.2f}) '.format(mark, homeF, homeP)
            for mark, (homeF, homeP) in byHomeWins))

        with newCSVFile('{}/{}/Stats.{}.csv'.format(analysisDir, league,
                                                    modelName),
                        ['Result', 'Slope', 'Intercept', 'P', 'R', 'R^2',
                         'Err']) \
                as statsWriter:
            if len(x) < 2:
                # linregress needs at least two points; leave the file with
                # its header only rather than aborting the other leagues.
                log.error('Not enough data for regression : {}'.format(
                    league))
                continue
            for code, label, ys in (('H', 'Home', hY), ('D', 'Draw', dY),
                                    ('A', 'Away', aY)):
                slope, intercept, r, p, stderr = stats.linregress(x, ys)
                r2 = r**2
                log.info(
                    (label +
                     ': {:>4.2f} {:>4.2f} {:>4.2} {:>4.2f} {:>4.2f} {:>4.2}'
                     ).format(slope, intercept, p, r, r2, stderr))
                statsWriter.writerow([code] + [
                    '{:>4.2f}'.format(value)
                    for value in (slope, intercept, p, r, r2, stderr)
                ])