Beispiel #1
0
def cache_html(url, name, attempts=1):
    """Download ``url`` and store the raw response body as ``CACHED_FOLDER/name``.

    If ``is_captcha`` flags the response, the function sleeps for
    ``TIMEOUT_SEC * attempts`` seconds and retries with the attempt counter
    increased by one.

    Args:
        url: Address fetched with ``requests.get``.
        name: File name, relative to ``CACHED_FOLDER``, for the cached copy.
        attempts: 1-based number of the current attempt; callers normally
            leave the default.

    Returns:
        The response body as bytes (``site.content``).

    Raises:
        TimeoutError: If ``attempts`` exceeds ``MAX_GET_ATTEMPTS``.
    """
    if attempts > MAX_GET_ATTEMPTS:
        logger.critical(f'Tried {MAX_GET_ATTEMPTS} times to get URL {url}')
        raise TimeoutError(f'Tried {MAX_GET_ATTEMPTS} times to get URL {url}')
    logger.info(f'GET: {url}')
    if attempts > 1:
        logger.info(f'attempt: {attempts}')

    site = requests.get(url, headers=HEADERS())
    site.encoding = 'utf-8'

    if is_captcha(site.content):
        # Back off for longer on every retry before asking again.
        logger.warning(f'Captcha received for url: {url}')
        logger.warning(f'sleeping for {TIMEOUT_SEC * attempts}s...')
        sleep(TIMEOUT_SEC * attempts)
        return cache_html(url, name, attempts=attempts + 1)

    target = Path(CACHED_FOLDER, name)
    # Create the destination directory up front. ``parents=True`` covers a
    # nested CACHED_FOLDER or a ``name`` containing sub-directories, and
    # ``exist_ok=True`` makes the call a no-op when the directory is already
    # there (a plain ``os.mkdir`` would fail in both situations).
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, 'wb') as out:
        out.write(site.content)
    logger.info(f'Cache name: {name}')
    return site.content
Beispiel #2
0
def get_fixtures_for_date(d=None, overwrite=False):
    """Request the matches feed for one day and decode its payload.

    The LiveScores page is fetched first to read the ``Model-Last-Mode``
    token from its markup. That token is then sent, together with
    ``Referer`` and ``X-Requested-With`` headers, on the ``/matchesfeed/``
    request.

    Args:
        d: Day to query. ``None`` selects the current UTC date, a
            ``datetime`` is formatted as ``YYYYMMDD``, and a ``str`` or
            ``int`` is passed through unchanged.
        overwrite: Not used by the code visible in this block.

    Returns:
        ``False`` when ``d`` has an unsupported type or when either request
        does not answer with status 200. The success path shown here ends
        without an explicit return value.
    """
    if d is None:
        params = {'d': datetime.strftime(datetime.utcnow(), '%Y%m%d')}
    elif isinstance(d, datetime):
        params = {'d': datetime.strftime(d, '%Y%m%d')}
    elif type(d) in (str, int):
        # Exact type match: bool is an int subclass and stays rejected.
        params = {'d': d}
    else:
        print('Unknown date type')
        return False

    page = SITE+'/LiveScores/'
    r = requests.get(page, headers=HEADERS)
    print(r.url)

    if r.status_code != 200:
        wait()
        return False

    model_last_mode = re.findall("'Model-Last-Mode': '([^']+)'", r.text)[0]
    headers = HEADERS.copy()
    headers['Model-Last-Mode'] = model_last_mode
    headers['Referer'] = r.url
    headers['X-Requested-With'] = 'XMLHttpRequest'
    print(model_last_mode)
    wait()

    page = SITE+'/matchesfeed/'
    # Send the headers prepared above; passing the bare HEADERS here would
    # drop the Model-Last-Mode / Referer / X-Requested-With values.
    r = requests.get(page, params=params, headers=headers, allow_redirects=False)
    print(r.url, r.status_code)
    print(r.text)

    if r.status_code != 200:
        wait()
        return False

    # Put ``null`` into empty slots (a ``,`` or ``[`` directly followed by
    # ``,`` or ``]``) and switch to double quotes so json.loads accepts the
    # payload.
    matchData = re.sub(r'([,[])(?=[,\]])', r'\1null', r.text)
    data = json.loads(matchData.replace("'", '"'))
    print(data)

    stageData = data[1]
    matchData = data[2]
Beispiel #3
0
def _store_fixture_rows(rows, stage, overwrite):
    """Turn raw fixture rows into match documents and upsert them into ``matchheaders``."""
    for row in rows:
        match = {'matchId': row[0], 'statusCode': row[1], 'startDate': row[2], 'startTime': row[3],
                 'home': {'teamId': row[4], 'name': row[5], 'field': 'home'},
                 'away': {'teamId': row[7], 'name': row[8], 'field': 'away'},
                 'score': row[10], 'elapsed': row[14], 'result': row[15], 'international': row[16],
                 'hasKeyEvents': row[12], 'hasPreview': row[13], 'isOpta': row[17], 'isOtherOpta': row[19],
                 }

        if matchheaders.find_one({'matchId': match['matchId']}) and not overwrite:
            print('Match already exists')
            continue

        start_date = datetime.strptime(match['startDate'], '%A, %b %d %Y')
        start_clock = datetime.strptime(match['startTime'], '%H:%M')
        match['startDate'] = start_date
        # startTime ends up as a full datetime: the match date plus the parsed clock time.
        match['startTime'] = datetime.combine(start_date.date(), start_clock.time())

        # Copy every stage field whose key contains 'Id' onto the match document.
        for k, v in stage.items():
            if 'Id' in k:
                match[k] = v

        matchheaders.replace_one({'matchId': match['matchId']}, match, upsert=True)


def get_fixtures(stage_id, overwrite=False):
    """Fetch the fixtures of one stage and store them in ``matchheaders``.

    The stage's Fixtures page is requested first. If it exposes a month
    index, the tournaments feed is queried once per listed year/month;
    otherwise the fixture rows embedded in the page itself are used.

    Args:
        stage_id: Value of ``stageId`` identifying the stage document.
        overwrite: When true, matches that are already stored are parsed and
            replaced instead of being skipped.

    Returns:
        ``True`` if matches for the stage are already stored and
        ``overwrite`` is false; ``False`` if a request does not answer with
        status 200 or the month index is empty; ``None`` otherwise.
    """
    if matchheaders.find_one({'stageId': stage_id}) and not overwrite:
        print('Matches already exist')
        return True

    stage = stages.find_one({'stageId': stage_id})
    page = SITE+'/Regions/{regionId}/Tournaments/{tournamentId}/Seasons/{seasonId}/Stages/{stageId}/Fixtures'.format(**stage)
    r = requests.get(page, headers=HEADERS)
    print(r.url)

    if r.status_code != 200:
        wait()
        return False

    model_last_mode = re.findall("'Model-Last-Mode': '([^']+)'", r.text)[0]
    headers = HEADERS.copy()
    headers['Model-Last-Mode'] = model_last_mode
    headers['Referer'] = r.url
    headers['X-Requested-With'] = 'XMLHttpRequest'

    dates = re.findall("'Month', ([^ ]+), min, max", r.text)
    if dates:
        # Quote the bare numeric keys (digits followed by ':') so the month
        # index parses as JSON.
        d = json.loads(re.sub(r'(\d+)(?=:)', r'"\1"', dates[0]))

        if not d:
            print('No matches')
            wait()
            return False

        # Index keys '0'..'11' map to two-digit month numbers '01'..'12'.
        months = {str(i): format(i + 1, '02') for i in range(12)}
        params = {'isAggregate': 'false'}
        feed_url = SITE+'/tournamentsfeed/{0}/Fixtures/'.format(stage_id)

        for y in d:
            for m in d[y]:
                params['d'] = '{0}{1}'.format(y, months[m])
                wait()

                r = requests.get(feed_url, params=params, headers=headers, allow_redirects=False)
                print(r.url, r.status_code)

                if r.status_code != 200:
                    wait()
                    return False

                # Fill empty slots between consecutive commas with null.
                matchData = re.sub(r',(?=,)', r',null', r.text)
                data = json.loads(matchData.replace("'", '"'))
                _store_fixture_rows(data, stage, overwrite)
    else:
        found = re.findall(r"calendarParameter\), ([^;]*)\);", r.text)
        # Check for a hit before indexing: an empty findall result must lead
        # to "no rows", not an IndexError.
        matchData = re.sub(r',(?=,)', r',null', found[0]) if found else ''
        data = json.loads(matchData.replace("'", '"')) if matchData else []
        _store_fixture_rows(data, stage, overwrite)
    wait()
Beispiel #4
0
async def fetch(session, url):
    """GET ``url`` through ``session`` and return the response body as text.

    The request carries the headers produced by ``HEADERS()``.
    """
    pending = session.get(url, headers=HEADERS())
    async with pending as reply:
        body = await reply.text()
        return body