def cache_html(url, name, attempts=1):
    """GET *url* and cache the raw response body as CACHED_FOLDER/name.

    If the response looks like a captcha page, sleeps for
    TIMEOUT_SEC * attempts seconds and retries, giving up after
    MAX_GET_ATTEMPTS tries.

    :param url: URL to fetch.
    :param name: file name for the cached copy inside CACHED_FOLDER.
    :param attempts: current attempt number (used by the retry recursion).
    :returns: the raw response bytes.
    :raises TimeoutError: after MAX_GET_ATTEMPTS failed attempts.
    """
    if attempts > MAX_GET_ATTEMPTS:
        logger.critical(f'Tried {MAX_GET_ATTEMPTS} times to get URL {url}')
        raise TimeoutError(f'Tried {MAX_GET_ATTEMPTS} times to get URL {url}')
    logger.info(f'GET: {url}')
    if attempts > 1:
        logger.info(f'attempt: {attempts}')
    site = requests.get(url, headers=HEADERS())
    site.encoding = 'utf-8'
    if is_captcha(site.content):
        # Back off linearly with the attempt number before retrying.
        logger.warning(f'Captcha received for url: {url}')
        logger.warning(f'sleeping for {TIMEOUT_SEC * attempts}s...')
        sleep(TIMEOUT_SEC * attempts)
        return cache_html(url, name, attempts=attempts + 1)
    # Ensure the cache directory exists up front (race-safe, creates parent
    # directories too) instead of catching FileNotFoundError on the write and
    # duplicating the write code — os.mkdir also failed when parents were
    # missing and raced with concurrent callers.
    Path(CACHED_FOLDER).mkdir(parents=True, exist_ok=True)
    with open(Path(CACHED_FOLDER, name), 'wb') as out:
        out.write(site.content)
    logger.info(f'Cache name: {name}')
    return site.content
def get_fixtures_for_date(d=None, overwrite=False):
    """Fetch the live-scores match feed for a single date.

    :param d: the date — None (today, UTC), a ``datetime``, or a
        preformatted ``YYYYMMDD`` str/int.
    :param overwrite: accepted for signature parity with the other
        fixture fetchers; not used in the visible body.
    :returns: False on bad input or a failed request; otherwise no
        explicit return (function appears unfinished — the parsed
        ``stageData``/``matchData`` are currently unused).
    """
    if d is None:
        params = {'d': datetime.strftime(datetime.utcnow(), '%Y%m%d')}
    elif isinstance(d, datetime):
        params = {'d': datetime.strftime(d, '%Y%m%d')}
    elif isinstance(d, (str, int)):
        params = {'d': d}
    else:
        print('Unknown date type')
        return False
    page = SITE+'/LiveScores/'
    r = requests.get(page, headers=HEADERS)
    print(r.url)
    if r.status_code != 200:
        wait()
        return False
    # The feed endpoint requires the anti-scraping token embedded in the
    # HTML page, sent back as the Model-Last-Mode header.
    model_last_mode = re.findall("'Model-Last-Mode': '([^']+)'", r.text)[0]
    headers = HEADERS.copy()
    headers['Model-Last-Mode'] = model_last_mode
    headers['Referer'] = r.url
    headers['X-Requested-With'] = 'XMLHttpRequest'
    print(model_last_mode)
    wait()
    page = SITE+'/matchesfeed/'
    # BUG FIX: the prepared per-request `headers` (Model-Last-Mode, Referer,
    # X-Requested-With) were built but never sent — the request previously
    # passed the bare global HEADERS, defeating the token exchange above.
    # get_fixtures() sends the prepared headers; this now matches it.
    r = requests.get(page, params=params, headers=headers, allow_redirects=False)
    print(r.url, r.status_code)
    print(r.text)
    if r.status_code != 200:
        wait()
        return False
    # The feed is almost-JSON with empty slots (",,") — patch them to nulls
    # and swap single quotes so json.loads accepts it.
    matchData = re.sub(r'([,[])(?=[,\]])', r'\1null', r.text)
    data = json.loads(matchData.replace("'", '"'))
    print(data)
    stageData = data[1]
    matchData = data[2]
def _match_from_row(row):
    """Map one raw positional feed row onto a match-header dict."""
    return {'matchId': row[0],
            'statusCode': row[1],
            'startDate': row[2],
            'startTime': row[3],
            'home': {'teamId': row[4], 'name': row[5], 'field': 'home'},
            'away': {'teamId': row[7], 'name': row[8], 'field': 'away'},
            'score': row[10],
            'elapsed': row[14],
            'result': row[15],
            'international': row[16],
            'hasKeyEvents': row[12],
            'hasPreview': row[13],
            'isOpta': row[17],
            'isOtherOpta': row[19],
            }


def _store_matches(data, stage, overwrite):
    """Upsert every feed row into `matchheaders`, tagged with the stage's
    *Id fields; skip rows that already exist unless `overwrite`."""
    for row in data:
        match = _match_from_row(row)
        if matchheaders.find_one({'matchId': match['matchId']}) and not overwrite:
            print('Match already exists')
            continue
        # 'Saturday, Aug 13 2016' + '15:00' -> one combined datetime.
        match['startDate'] = datetime.strptime(match['startDate'], '%A, %b %d %Y')
        match['startTime'] = datetime.strptime(match['startTime'], '%H:%M')
        match['startTime'] = datetime.combine(match['startDate'].date(),
                                              match['startTime'].time())
        # Carry over regionId/tournamentId/seasonId/stageId from the stage doc.
        for k, v in stage.items():
            if 'Id' in k:
                match[k] = v
        matchheaders.replace_one({'matchId': match['matchId']}, match, upsert=True)


def get_fixtures(stage_id, overwrite=False):
    """Scrape and store all fixtures for one stage.

    :param stage_id: stage identifier; must exist in the `stages` collection.
    :param overwrite: refetch and replace matches that are already stored.
    :returns: True when fixtures already exist (and not overwrite),
        False on a failed request or an empty calendar, None otherwise.
    """
    if matchheaders.find_one({'stageId': stage_id}) and not overwrite:
        print('Matches already exist')
        return True
    stage = stages.find_one({'stageId': stage_id})
    page = SITE+'/Regions/{regionId}/Tournaments/{tournamentId}/Seasons/{seasonId}/Stages/{stageId}/Fixtures'.format(**stage)
    r = requests.get(page, headers=HEADERS)
    print(r.url)
    if r.status_code != 200:
        wait()
        return False
    # Anti-scraping token embedded in the HTML, required by the feed endpoint.
    model_last_mode = re.findall("'Model-Last-Mode': '([^']+)'", r.text)[0]
    headers = HEADERS.copy()
    headers['Model-Last-Mode'] = model_last_mode
    headers['Referer'] = r.url
    headers['X-Requested-With'] = 'XMLHttpRequest'
    # Multi-month stages embed a calendar as "'Month', {year: [months]}, min, max".
    dates = re.findall("'Month', ([^ ]+), min, max", r.text)
    if dates:
        # Quote the bare integer keys so json.loads accepts the object.
        dates = re.sub(r'(\d+)(?=:)', r'"\1"', dates[0])
        d = json.loads(dates)
        if len(d) == 0:
            print('No matches')
            wait()
            return False
        # Calendar month indices are 0-based strings; the feed wants '01'..'12'.
        # (Loop variable renamed from `d`, which shadowed the dates dict.)
        months = {format(i): format(i + 1, '02') for i in range(0, 12)}
        params = {'isAggregate': 'false'}
        for y in d:
            for m in d[y]:
                params['d'] = '{0}{1}'.format(y, months[m])
                wait()
                page = SITE+'/tournamentsfeed/{0}/Fixtures/'.format(stage_id)
                r = requests.get(page, params=params, headers=headers, allow_redirects=False)
                print(r.url, r.status_code)
                if r.status_code != 200:
                    wait()
                    return False
                # Almost-JSON: fill empty slots (",,") with nulls, fix quotes.
                matchData = re.sub(r',(?=,)', r',null', r.text)
                data = json.loads(matchData.replace("'", '"'))
                _store_matches(data, stage, overwrite)
    else:
        # Single-page calendar: match data is inlined in the HTML.
        found = re.findall(r"calendarParameter\), ([^;]*)\);", r.text)
        # BUG FIX: check for an empty findall BEFORE indexing [0] — the old
        # code did `re.sub(..., matchData[0])` first, so the `if matchData`
        # guard could never prevent the IndexError it was meant to avoid.
        if found:
            matchData = re.sub(r',(?=,)', r',null', found[0])
            data = json.loads(matchData.replace("'", '"'))
        else:
            data = []
        _store_matches(data, stage, overwrite)
    wait()
async def fetch(session, url):
    """Fetch *url* through the given aiohttp-style session and return the
    decoded response body as text."""
    request_headers = HEADERS()
    async with session.get(url, headers=request_headers) as response:
        body = await response.text()
    return body