def get_bets_from_table(element, home, away):
    """Collect bets from a table whose rows sit in one direct ``<tbody>``.

    The first row may be a heading row: when its first cell carries the CSS
    class ``n``, that cell's text (passed through ``remove_colon_and_dash``)
    becomes the bet type and the column names are read from the next row.
    Otherwise the first row already holds the column names and the bet type
    is ``None``.  Every later non-empty row produces one
    ``handle_table_data`` call: its first cell is the subtype and the
    remaining cells are paired positionally with the column names.

    Args:
        element: Table element exposing ``find`` / ``find_all``.
        home: Forwarded unchanged to ``handle_table_data``.
        away: Forwarded unchanged to ``handle_table_data``.

    Returns:
        A list with the concatenated results of all ``handle_table_data``
        calls; empty when the table body contains no rows.

    Raises:
        AttributeError: If the table has no direct ``<tbody>`` or the first
            row has no direct ``<td>``.
        IndexError: If a heading row is not followed by a names row, or a
            data row has more value cells than there are column names.
    """
    bets = []
    trs = element.find('tbody', recursive=False).find_all('tr', recursive=False)
    if not trs:
        # Nothing to parse; indexing trs[0] below would raise IndexError.
        return bets

    first_td = trs[0].find('td', recursive=False)
    # No default is passed to ``get``, so a cell without a ``class``
    # attribute may yield None; treat that as "no classes" so such tables
    # take the header-less branch instead of failing on ``'n' in None``.
    first_td_classes = first_td.get('class') or []
    if 'n' in first_td_classes:
        type_ = remove_colon_and_dash(get_tag_text(first_td))
        s = 1
    else:
        type_ = None
        s = 0

    # The first cell of the names row sits above the subtype column: skip it.
    name_tds = trs[s].find_all('td', recursive=False)[1:]
    names = [get_tag_text(name_td) for name_td in name_tds]

    for tr in trs[s + 1:]:
        tds = tr.find_all('td', recursive=False)
        if not tds:
            continue
        subtype = remove_colon_and_dash(get_tag_text(tds[0]))
        # Indexing (rather than zip) keeps the IndexError on rows that are
        # wider than the header instead of silently dropping cells.
        table_data = [
            (names[pos], get_tag_text(td))
            for pos, td in enumerate(tds[1:])
        ]
        bets += handle_table_data(table_data, type_=type_, subtype=subtype,
                                  home=home, away=away)
    return bets
def get_bets_from_line(element, home, away):
    """Parse bets written as inline text around ``<b>`` labels.

    The first descendant of ``element`` is expected to be a ``<b>`` whose
    text (via ``remove_colon_and_dash``) is the bet type.  Scanning starts
    at descendant index 2 (the first ``<b>``'s next sibling).  Every further
    ``<b>`` starts a new block: its cleaned text, after
    ``get_and_remove_special_word`` splits off a special word, becomes the
    block prefix.  Text nodes outside ``<b>`` tags are appended to the
    current block's text.

    Each block text is then parsed as::

        [-|:] [( handicap ) ] name value ; name value ; ...

    where the parenthesised handicap is optional and must be followed by
    whitespace, and ``name`` (optionally ending in ``:`` or ``-``) may be
    absent, in which case it is reported as ``''``.

    Args:
        element: Element exposing ``descendants``.
        home: Not used by this function.
        away: Not used by this function.

    Returns:
        A list of ``[special_word, type_, prefix, name, handicap, value]``
        lists, where ``handicap`` and ``value`` come from ``float_safe``
        (``handicap`` is ``None`` when no parenthesised part is present).

    Raises:
        IndexError: If ``element`` has no descendants.
    """
    bet_elements = list(element.descendants)
    type_ = remove_colon_and_dash(get_tag_text(bet_elements[0]))

    # Each block is (prefix, accumulated text, special word).  The first
    # block has an empty prefix: it gathers the text after the leading <b>.
    bet_blocks = []
    prefix, text, special_word = '', '', None

    i = 2  # 0: <b>, 1: <b>.contents[0], 2: <b>.next_sibling
    while i < len(bet_elements):
        current = bet_elements[i]
        if current.name == 'b':
            # A new label closes the block collected so far.
            bet_blocks.append((prefix, text, special_word))
            text = ''
            prefix = remove_colon_and_dash(get_tag_text(current))
            special_word, prefix = get_and_remove_special_word(prefix)
            # Skip the <b> and everything inside it.  Compare by identity:
            # ``!=`` on bs4 nodes compares markup/text, so a string inside
            # the <b> that merely equals the sibling's text would end the
            # skip early and leak the label into the bet text.
            end = current.next_sibling
            while i < len(bet_elements) and bet_elements[i] is not end:
                i += 1
        elif isinstance(current, bs4.element.NavigableString):
            text += ' ' + str(current)
            i += 1
        else:
            i += 1
    bet_blocks.append((prefix, text, special_word))

    bets = []
    for block_prefix, bet_str, bet_special_word in bet_blocks:
        bet_str = bet_str.strip()
        if not bet_str:
            continue
        m1 = re.search(r'^(?:-|:)?\s*(?:\(\s*(.+?)\s*\)\s+)?(.+)$', bet_str)
        if m1 is None:
            continue
        raw_handicap, unhandicaped = m1.groups()
        if raw_handicap is None:
            handicap = None
        else:
            # Drop inner whitespace (e.g. "- 1.5") before converting.
            handicap = float_safe(re.sub(r'\s*', '', raw_handicap))

        for part in re.split(r'\s*;', unhandicaped):
            part = part.strip()
            if not part:
                continue
            m2 = re.search(r'^(?:(.+?)(?:\s*[:-])?\s+)?(\S+)$', part)
            if m2 is None:
                continue
            name = m2.group(1) if m2.group(1) is not None else ''
            value = float_safe(m2.group(2))
            bets.append([bet_special_word, type_, block_prefix, name,
                         handicap, value])
    return bets
def handle_bets(elements, home, away):
    """Gather bets from a sequence of ``div`` / ``table`` elements.

    * ``table`` elements go to ``get_bets_from_table``.
    * ``div`` elements whose third child is a ``table`` have that table
      passed to ``get_bets_from_table``; any other ``div`` goes to
      ``get_bets_from_line``.
    * Everything else is ignored.

    Any exception raised while processing one element discards that element
    only.  Bets whose item at index 5 is ``None`` are dropped from the
    result.

    Args:
        elements: Iterable of elements exposing ``name`` and ``contents``.
        home: Forwarded to the bet extractors.
        away: Forwarded to the bet extractors.

    Returns:
        A list of the collected bets with a non-``None`` item at index 5.
    """
    collected = []
    for node in elements:
        try:
            tag_name = node.name
            if tag_name == 'table':
                collected.extend(
                    get_bets_from_table(node, home=home, away=away))
            elif tag_name == 'div':
                children = node.contents
                wraps_table = (len(children) >= 3
                               and children[2].name == 'table')
                if not wraps_table:
                    collected.extend(
                        get_bets_from_line(node, home=home, away=away))
                else:
                    # The result is not used afterwards; the call is kept
                    # because an exception raised here skips this element.
                    type_ = remove_colon_and_dash(get_tag_text(children[1]))
                    collected.extend(
                        get_bets_from_table(children[2], home=home, away=away))
        except Exception:
            # Best effort: one unparsable element must not lose the rest.
            continue
    return [bet for bet in collected if bet[5] is not None]
def get_bets_from_table(element, home, away):
    """Collect bets from a table laid out as ``<thead>`` plus several ``<tbody>``.

    The bet type is the text of the first ``<thead>`` row (passed through
    ``remove_colon_and_dash``).  Column names come from the third ``<thead>``
    row, skipping its first cell.  Data rows are the direct rows of the
    second direct ``<tbody>``: the first cell of each is the subtype, the
    other cells are paired positionally with the column names and handed to
    ``handle_table_data``.

    Args:
        element: Table element exposing ``find`` / ``find_all``.
        home: Forwarded unchanged to ``handle_table_data``.
        away: Forwarded unchanged to ``handle_table_data``.

    Returns:
        A list with the concatenated results of all ``handle_table_data``
        calls.
    """
    head_rows = element.find('thead', recursive=False).find_all(
        'tr', recursive=False)
    type_ = remove_colon_and_dash(get_tag_text(head_rows[0]))
    names = [
        get_tag_text(cell)
        for cell in head_rows[2].find_all('td', recursive=False)[1:]
    ]

    data_tbody = element.find_all('tbody', recursive=False)[1]
    collected = []
    for row in data_tbody.find_all('tr', recursive=False):
        cells = row.find_all('td', recursive=False)
        if not cells:
            continue
        subtype = remove_colon_and_dash(get_tag_text(cells[0]))
        table_data = [
            (names[pos], get_tag_text(cell))
            for pos, cell in enumerate(cells[1:])
        ]
        collected.extend(
            handle_table_data(table_data, type_=type_, subtype=subtype,
                              home=home, away=away))
    return collected
def _extract_player_names(table_tag):
    """Return the text of the second cell of each row of the table body.

    Rows are the direct ``<tr>`` children of the element returned by
    ``table_tag.find('tbody')``; cells are the direct ``<td>`` children of
    each row.

    Args:
        table_tag: Table element exposing ``find``.

    Returns:
        A list with one ``get_tag_text`` result per row, in row order.

    Raises:
        IndexError: If a row has fewer than two direct ``<td>`` cells.
    """
    body = table_tag.find('tbody')
    return [
        get_tag_text(row.find_all('td', recursive=False)[1])
        for row in body.find_all('tr', recursive=False)
    ]
def handle_date(html_or_file):
    """Extract one tuple per match row from every ``meeting-odds`` table.

    For each ``<table class="meeting-odds">`` the links inside its
    ``<th class="tournament-name">`` give the country and tournament: with
    more than one link the first is the country and the second the
    tournament; otherwise the country is ``None`` and the single link is the
    tournament.  For every direct row of the table body, the home and away
    names are the first and third ``<span>`` nested in the match link, the
    URL is the link's ``href`` prefixed with ``http:``, and the time is the
    text of the ``tiles-bet-time`` cell.

    Args:
        html_or_file: Markup (or file object) passed to
            ``bs4.BeautifulSoup`` with the ``lxml`` parser.

    Returns:
        A list of ``(country, tournament, home, away, url, match_time_str)``
        tuples in document order.
    """
    soup = bs4.BeautifulSoup(html_or_file, 'lxml')

    matches = []
    for odds_table in soup.find_all('table', class_='meeting-odds'):
        header_cell = odds_table.find('th', class_='tournament-name',
                                      recursive=True)
        header_links = header_cell.find_all('a', recursive=True)
        if len(header_links) <= 1:
            intelbet_country = None
            intelbet_tournament = get_tag_text(header_links[0])
        else:
            intelbet_country = get_tag_text(header_links[0])
            intelbet_tournament = get_tag_text(header_links[1])

        for row in odds_table.find('tbody').find_all('tr', recursive=False):
            name_cell = row.find('td', class_='name-with-icon',
                                 recursive=False)
            match_link = name_cell.find('a', recursive=False)

            team_spans = match_link.find('span', recursive=False).find_all(
                'span', recursive=False)
            intelbet_home = get_tag_text(team_spans[0])
            # Index 1 is skipped: only the first and third spans are read.
            intelbet_away = get_tag_text(team_spans[2])

            url = 'http:%s' % (match_link['href'],)

            time_cell = row.find('td', class_='tiles-bet-time',
                                 recursive=False)
            match_time_str = get_tag_text(time_cell)

            matches.append((intelbet_country, intelbet_tournament,
                            intelbet_home, intelbet_away, url,
                            match_time_str))
    return matches
def handle_tournament(tournament_table):
    """Parse one tournament table into a list of raw match records.

    The first direct ``<tbody>`` supplies the tournament name; the second
    holds the rows, which are classified as follows:

    * rows with class ``ng-hide`` and rows whose first cell has class
      ``t_comment`` are skipped;
    * a row whose first cell has ``colspan="13"`` is a date row: it closes
      the match being collected and sets the date for following matches;
    * a row with class ``th`` supplies the column names;
    * a row with class ``tc`` / ``tc1`` closes the match being collected
      and starts a new one, parsed by ``handle_main_data``;
    * a row with class ``tcd`` / ``tcd1`` adds extra bets (via
      ``handle_bets``) to the match being collected.

    If ``handle_main_data`` fails for a match row, parsing stops and the
    matches collected so far are returned.

    Args:
        tournament_table: Table element exposing ``find_all``.

    Returns:
        A list of dicts with keys ``tournament``, ``date``, ``time``,
        ``home``, ``away``, ``special_word`` and ``bets``.

    Raises:
        IndexError: If the table has fewer than two direct ``<tbody>``
            elements, or a match row has fewer cells than column names.
        UnboundLocalError: If a match row precedes the first ``th`` row, or
            a match is closed before any date row was seen.
    """
    raw_matches_data = []

    tournament_table_tbodies = tournament_table.find_all('tbody',
                                                         recursive=False)
    tournament_name = get_tag_text(
        tournament_table_tbodies[0]
        .find('tr', recursive=False)
        .find('td', recursive=False))
    tournament_main_tbody = tournament_table_tbodies[1]

    bets = None  # None means "no match is currently being collected".
    for tr in tournament_main_tbody.find_all('tr', recursive=False):
        row_classes = tr.get('class', [])
        if 'ng-hide' in row_classes:
            continue
        first_td = tr.find('td', recursive=False)
        if 't_comment' in first_td.get('class', []):
            continue

        if first_td.get('colspan') == '13':
            if bets is not None:
                raw_matches_data.append(_build_raw_match(
                    tournament_name, match_date_str, match_time, home, away,
                    special_word, bets))
                # The match is closed now.  Without this reset the next
                # match row would append the same match a second time,
                # stamped with the new date.
                bets = None
            match_date_str = get_tag_text(first_td)

        if 'th' in row_classes:
            main_data_names = [
                get_tag_text(td)
                for td in tr.find_all('td', recursive=False)
            ]

        if 'tc' in row_classes or 'tc1' in row_classes:
            if bets is not None:
                raw_matches_data.append(_build_raw_match(
                    tournament_name, match_date_str, match_time, home, away,
                    special_word, bets))
            main_data_tds = tr.find_all('td', recursive=False)
            main_data = [
                (name, get_tag_text(main_data_tds[i]))
                for i, name in enumerate(main_data_names)
            ]
            try:
                (match_time, home, away, special_word,
                 _additional, main_data_bets) = handle_main_data(main_data)
            except Exception:
                # The previous match has already been appended above.
                return raw_matches_data
            bets = list(main_data_bets)
        elif 'tcd' in row_classes or 'tcd1' in row_classes:
            divs_and_tables = (tr.find('td')
                               .find('div', class_='extTbl')
                               .find('div')
                               .contents)
            try:
                for element in divs_and_tables:
                    if element.name == 'div':
                        bets += handle_bets(element.contents,
                                            home=home, away=away)
                    elif element.name == 'table':
                        bets += handle_bets([element], home=home, away=away)
            except Exception:
                # Best effort: keep the bets gathered so far for this match.
                # This also covers an extra-bets row with no open match
                # (``bets`` is None).
                pass

    if bets is not None:
        raw_matches_data.append(_build_raw_match(
            tournament_name, match_date_str, match_time, home, away,
            special_word, bets))
    return raw_matches_data


def _build_raw_match(tournament, date, time, home, away, special_word, bets):
    """Build the raw match record emitted by ``handle_tournament``."""
    return {
        'tournament': tournament,
        'date': date,
        'time': time,
        'home': home,
        'away': away,
        'special_word': special_word,
        'bets': bets,
    }
def handle_tournament_day(tournament_table):
    """Parse one tournament-day table into a list of raw match records.

    The tournament name is read from the first cell of the ``<thead>``, the
    date from the ``<tbody class="date">`` and the column names from the
    ``th`` row of the ``<tbody class="chead">``.  Every direct
    ``<tbody id="line">`` is one match: its first row is parsed by
    ``handle_main_data`` and, unless the first cell of its second row has
    class ``t_comment``, the contents of that cell are parsed by
    ``handle_bets`` for additional bets.

    A match whose main row cannot be parsed is skipped; a failure while
    parsing the additional bets keeps the match with its main bets only.

    Args:
        tournament_table: Table element exposing ``find`` / ``find_all``.

    Returns:
        A list of dicts with keys ``tournament``, ``date``, ``time``,
        ``home``, ``away``, ``special_word`` and ``bets``.

    Raises:
        AttributeError: If the ``thead``, ``date`` or ``chead`` parts of the
            table are missing.
        IndexError: If a match row has fewer cells than column names.
    """
    raw_matches_data = []

    tournament_day_thead = tournament_table.find('thead', recursive=False)
    tournament_name = get_tag_text(
        tournament_day_thead.find('tr', recursive=False)
        .find('td', recursive=False))

    tournament_date_tbody = tournament_table.find('tbody', class_='date',
                                                  recursive=False)
    tournament_date = get_tag_text(
        tournament_date_tbody.find('tr', recursive=False)
        .find('td', recursive=False))

    header_tds = (tournament_table
                  .find('tbody', class_='chead', recursive=False)
                  .find('tr', class_='th', recursive=False)
                  .find_all('td', recursive=False))
    main_data_names = [get_tag_text(td) for td in header_tds]

    match_tbodies = tournament_table.find_all('tbody', recursive=False,
                                              id='line')
    for match_tbody in match_tbodies:
        match_trs = match_tbody.find_all('tr', recursive=False)
        if not match_trs:
            continue

        main_data_tds = match_trs[0].find_all('td', recursive=False)
        main_data = [
            (name, get_tag_text(main_data_tds[i]))
            for i, name in enumerate(main_data_names)
        ]
        try:
            (match_time, home, away, special_word,
             _additional, main_data_bets) = handle_main_data(main_data)
        except Exception:
            continue

        bets = list(main_data_bets)

        if len(match_trs) >= 2:
            extra_td = match_trs[1].find('td', recursive=False)
            # No default is passed to ``get``, so a cell without a ``class``
            # attribute may yield None; ``'t_comment' not in None`` would
            # raise TypeError and abort the whole table.
            extra_classes = extra_td.get('class') or []
            if 't_comment' not in extra_classes:
                elements = extra_td.contents
                try:
                    bets += handle_bets(elements, home=home, away=away)
                except Exception:
                    # Best effort: keep the match with its main bets only.
                    pass
        # WARNING: there is sometimes (at least) one more row.

        raw_matches_data.append({
            'tournament': tournament_name,
            'date': tournament_date,
            'time': match_time,
            'home': home,
            'away': away,
            'special_word': special_word,
            'bets': bets,
        })
    return raw_matches_data