def _parse_file(file_path):
    """Parse one grabbed betcity HTML page and emit one JSON file per match.

    The file name must end in ``YYYY-MM-DD.html``; that date is the "grab
    date".  Only matches dated on the grab date or the following day are
    kept (betcity pages list far-future matches too).  Unknown tournaments
    and teams are recorded in the module-level ``_unknown_tournaments`` /
    ``_unknown_teams`` sets for later inspection.

    :param file_path: path to the grabbed HTML file.
    :raises RuntimeError: if the file name does not carry a date suffix.
    """
    m = re.search(r'(\d{4}-\d{2}-\d{2})\.html$', file_path)
    if m is None:
        # Consistent with the other day-file parsers: fail loudly instead of
        # crashing with AttributeError on m.group(1).
        raise RuntimeError('invalid file name')
    grab_date_str = m.group(1)
    grab_date = datetime.datetime.strptime(grab_date_str, '%Y-%m-%d').date()
    tomorrow_grab_date = grab_date + datetime.timedelta(days=1)
    with open(file_path, 'rt', encoding='utf-8') as f_in:
        for raw_match_data in handle(f_in):
            tournament = raw_match_data['tournament']
            if not _is_betcity_tournament_name_valid(tournament):
                # Only football tournaments are worth reporting as unknown.
                if tournament.startswith('Футбол.'):
                    _unknown_tournaments.add(tournament)
                continue
            # Pseudo-tournaments with day/tournament statistics carry no odds.
            if ('Статистика игрового дня' in tournament
                    or 'Статистика турнира' in tournament):
                continue
            # Unknown teams are recorded but the match is still processed.
            if not is_value_valid(teams_data, 'betcityName',
                                  raw_match_data['home']):
                _unknown_teams.add(raw_match_data['home'])
            if not is_value_valid(teams_data, 'betcityName',
                                  raw_match_data['away']):
                _unknown_teams.add(raw_match_data['away'])
            match_date = datetime.datetime.strptime(raw_match_data['date'],
                                                    '%d.%m.%Y').date()
            # Keep only today's and tomorrow's matches relative to grab date.
            if match_date != grab_date and match_date != tomorrow_grab_date:
                continue
            match_date_str = match_date.strftime('%Y-%m-%d')
            betcity_match_uuid = get_identifier()
            match_data = {
                'uuid': betcity_match_uuid,
                'tournament': raw_match_data['tournament'],
                'date': match_date_str,
                'time': raw_match_data['time'],
                'home': raw_match_data['home'],
                'away': raw_match_data['away'],
                'specialWord': raw_match_data['special_word'],
                'bets': raw_match_data['bets']
            }
            out_dir_path = os.path.join('tmp', 'update', 'betcity',
                                        'matchesJson', match_date_str)
            os.makedirs(out_dir_path, exist_ok=True)
            out_file_path = os.path.join(out_dir_path,
                                         '%s.json' % (betcity_match_uuid, ))
            with open(out_file_path, 'wt', encoding='utf-8') as f_out:
                json.dump(match_data, f_out, ensure_ascii=False)
def _parse_file(file_path):
    """Scan an intelbet listing file and run automatic name matching.

    Rows whose country or tournament is not present in the reference data
    are skipped; for every remaining row the match URL is printed and fed
    to ``match_names_automatically``.

    :param file_path: path to the grabbed intelbet listing file.
    """
    with open(file_path, 'rt', encoding='utf-8') as listing:
        for row in handle_date(listing):
            (country, tournament, home_name, away_name, url, kickoff) = row
            # Guard clauses preserve the original short-circuit order:
            # country is checked first, tournament only if country is known.
            if not is_value_valid(countries_data, 'intelbetCountryName',
                                  country):
                continue
            if not is_value_valid(tournaments_data, 'intelbetTournamentName',
                                  tournament):
                continue
            print(url)
            match_names_automatically(url)
def _parse_file(file_path):
    """Download intelbet match pages referenced by one grabbed daily file.

    The file name must end in ``YYYY-MM-DD.html``.  For every row whose
    country and tournament are known, a match header JSON and the fetched
    match HTML are written under
    ``tmp/update/intelbet/matchesHtml/<date>/<uuid>.{json,html}``.

    :param file_path: path to the grabbed intelbet listing file.
    :raises RuntimeError: if the file name does not carry a date suffix.
    """
    m = re.search(r'(\d{4}-\d{2}-\d{2})\.html$', file_path)
    if m is None:
        # Consistent with the other day-file parsers: fail loudly instead of
        # crashing with AttributeError on m.group(1).
        raise RuntimeError('invalid file name')
    date_str = m.group(1)
    with open(file_path, 'rt', encoding='utf-8') as f:
        for item in handle_date(f):
            (intelbet_country, intelbet_tournament, intelbet_home,
             intelbet_away, url, match_time_str) = item
            # Country first, tournament second — same short-circuit order as
            # the original combined condition.
            if not is_value_valid(countries_data, 'intelbetCountryName',
                                  intelbet_country):
                continue
            if not is_value_valid(tournaments_data, 'intelbetTournamentName',
                                  intelbet_tournament):
                continue
            intelbet_match_uuid = get_identifier()
            intelbet_match_header = {
                'uuid': intelbet_match_uuid,
                'date': date_str,
                'home': intelbet_home,
                'away': intelbet_away,
                # NOTE(review): 'contry' is a typo, but the key is kept
                # byte-identical because downstream consumers of the JSON
                # may read it — rename only in lockstep with readers.
                'contry': intelbet_country,
                'tournament': intelbet_tournament,
                'url': url,
                'time': match_time_str
            }
            print(url)
            # Throttle requests to be polite to the remote site.
            match_html = intelbet_get(url, delay=0.5)
            out_dir_path = os.path.join('tmp', 'update', 'intelbet',
                                        'matchesHtml', date_str)
            os.makedirs(out_dir_path, exist_ok=True)
            header_out_file_path = os.path.join(
                out_dir_path, '%s.json' % (intelbet_match_uuid, ))
            with open(header_out_file_path, 'wt',
                      encoding='utf-8') as header_f_out:
                json.dump(intelbet_match_header, header_f_out,
                          ensure_ascii=False)
            out_file_path = os.path.join(out_dir_path,
                                         '%s.html' % (intelbet_match_uuid, ))
            with open(out_file_path, 'wt', encoding='utf-8') as f_out:
                f_out.write(match_html)
def _is_betcity_tournament_name_valid(betcity_tournament_name):
    """Return True if any candidate spelling of the tournament name is known.

    ``_get_possible_tournament_names`` expands the raw betcity name into
    candidate variants; the name is considered valid as soon as one of them
    appears under 'betcityTournamentName' in the reference data.
    """
    candidates = _get_possible_tournament_names(betcity_tournament_name)
    # any() short-circuits on the first known candidate, exactly like the
    # original early-return loop.
    return any(
        is_value_valid(tournaments_data, 'betcityTournamentName', candidate)
        for candidate in candidates)
def _parse_file(file_path):
    """Download whoscored match pages for one grabbed daily JSON file.

    The file name must end in ``YYYY-MM-DD.json`` and contain a 3-tuple of
    ``(main_data, raw_tournaments_data, raw_matches_data)``.  Stages whose
    tournament id is known are indexed first; then, for every match in a
    known stage, a header JSON and the fetched match HTML are written under
    ``tmp/update/whoscored/matchesHtml/<date>/<uuid>.{json,html}``.

    :param file_path: path to the grabbed whoscored JSON file.
    :raises RuntimeError: if the file name does not carry a date suffix.
    """
    with open(file_path, 'rt', encoding='utf-8') as f:
        data = json.load(f)
    # Raw string: '\d' in a plain string literal is an invalid escape
    # sequence (warns on modern CPython); all sibling parsers use r'...'.
    m = re.search(r'(\d{4}-\d{2}-\d{2})\.json$', file_path)
    if m is None:
        raise RuntimeError('invalid file name')
    match_date_str = m.group(1)
    (main_data, raw_tournaments_data, raw_matches_data) = data
    # Index known stages by stage id; matches in unknown stages are skipped.
    stages_data = {}
    for raw_stage_data in raw_tournaments_data:
        if not is_value_valid(tournaments_data, 'whoscoredTournamentId',
                              raw_stage_data[4]):
            continue
        stage_id = raw_stage_data[0]
        stages_data[stage_id] = {
            'region_id': int(raw_stage_data[1]),
            'region_name': raw_stage_data[3],
            'tournament_id': raw_stage_data[4],
            'tournament_name': raw_stage_data[7],
            'season_id': raw_stage_data[6],
            'stage_id': raw_stage_data[0],
        }
    for raw_match_data in raw_matches_data:
        stage_id = raw_match_data[0]
        if stage_id not in stages_data:
            continue
        stage_data = stages_data[stage_id]
        # Unknown teams are recorded but the match is still processed,
        # mirroring the betcity parser's behavior.
        if not is_value_valid(teams_data, 'whoscoredId',
                              raw_match_data[4]) or not is_value_valid(
                                  teams_data, 'whoscoredName',
                                  raw_match_data[5]):
            _unknown_teams.add(raw_match_data[4])
        whoscored_match_uuid = get_identifier()
        whoscored_match_id = raw_match_data[1]
        whoscored_header = {
            'uuid': whoscored_match_uuid,
            'matchId': whoscored_match_id,
            'date': match_date_str,
            'home': raw_match_data[5],
            'homeId': raw_match_data[4],
            'away': raw_match_data[9],
            'awayId': raw_match_data[8],
            'regionId': stage_data['region_id'],
            'tournamentId': stage_data['tournament_id'],
            'seasonId': stage_data['season_id'],
            'stageId': stage_data['stage_id']
        }
        # WARNING: other page variants besides /Live exist for some matches.
        url = 'https://www.whoscored.com/Matches/%d/Live' % (
            whoscored_match_id, )
        print(url)
        # Throttle requests to be polite to the remote site.
        match_html = whoscored_get(url, delay=0.5)
        out_dir_path = os.path.join('tmp', 'update', 'whoscored',
                                    'matchesHtml', match_date_str)
        os.makedirs(out_dir_path, exist_ok=True)
        whoscored_header_out_file_path = os.path.join(
            out_dir_path, '%s.json' % (whoscored_match_uuid, ))
        with open(whoscored_header_out_file_path, 'wt',
                  encoding='utf-8') as whoscored_header_f_out:
            json.dump(whoscored_header, whoscored_header_f_out,
                      ensure_ascii=False)
        html_out_file_path = os.path.join(
            out_dir_path, '%s.html' % (whoscored_match_uuid, ))
        with open(html_out_file_path, 'wt', encoding='utf-8') as html_f_out:
            html_f_out.write(match_html)