def main():
    """Scrape the all-time club table and print it as one JSON document.

    For each table row, extract the club id (taken from the team link's
    href), the club name, and the stadium name, then print the collected
    records as ``{"clubs": [...]}``.
    """
    import json  # local import: keeps this chunk self-contained

    html = grab_html_by_class(init_driver(), class_name="team", url=URL)
    soup = BeautifulSoup(html, "html.parser")
    club_list = soup.find('tbody', attrs={'class': 'allTimeDataContainer'})

    clubs = []
    for row in club_list.findAll('tr'):
        stadium = row.find('td', attrs={'class': 'venue'}).get_text().strip()
        club_column = row.find('td', attrs={'class': 'team'})
        club = club_column.get_text().strip()
        # The club id is the 5th path segment of the team link's href.
        club_id = club_column.find('a')['href'].split('/')[4].strip()
        clubs.append({"club_id": club_id, "club": club, "stadium": stadium})

    # json.dumps guarantees well-formed output; the previous print-based
    # concatenation emitted a trailing comma before ']}' and stray spaces
    # inside the quoted values, i.e. invalid JSON.
    print(json.dumps({"clubs": clubs}, ensure_ascii=False, indent=4))
def main():
    """Scrape the managers table and print managers with club names as JSON.

    Each table row yields the manager's name (column 0) and the long and
    short club names (column 1); the result is printed as
    ``{"managers": [...]}``.
    """
    import json  # local import: keeps this chunk self-contained

    html = grab_html_by_class(init_driver(), class_name="managerName", url=URL)
    soup = BeautifulSoup(html, "html.parser")
    managers_list = soup.find('tbody', attrs={'class': 'dataContainer'})

    managers = []
    for row in managers_list.findAll('tr'):
        manager_name, club_name_long, club_name_short = '', '', ''
        for column_index, column in enumerate(row.findAll('td')):
            if column_index == 0:
                # Manager column
                manager_name = column.get_text()
            elif column_index == 1:
                # Team column: long and short club-name variants
                club_name_long = column.find(
                    'span', attrs={'class': 'long'}).get_text()
                club_name_short = column.find(
                    'span', attrs={'class': 'short'}).get_text()
            else:
                break
        managers.append({
            "manager": manager_name,
            "club_short": club_name_short,
            "club_long": club_name_long,
        })

    # json.dumps guarantees well-formed output; the previous print-based
    # concatenation emitted a trailing comma before ']}' and stray spaces
    # inside the quoted values, i.e. invalid JSON.
    print(json.dumps({"managers": managers}, ensure_ascii=False, indent=4))
get_link = True ####&&&& #get_link = False get_data = (not get_link) # either get_link or get_data if get_link or get_data: # 3- initialize website website = "https://www.glassdoor.com/index.htm" # Initialize the webdriver browser = init_driver() #browser = webdriver.Chrome() # 4- Scrape for links and brief data if get_link : iter_num = 0 while iter_num <3: # default 1 ####&&&& print('Starting iteration number {}'.format(iter_num)) sleep(get_pause()) browser.get(website) # Initialize cities and jobs jobName_lst = ['Data Scientist', 'Data Analyst','Data Engineer']
def setUpClass(cls):
    """Announce the run, then create the shared WebDriver for this class.

    The driver is stored on the class so every test method reuses one
    browser session; the sleep gives the browser time to come up.
    """
    for banner in ("test starting", __doc__):
        print(banner)
    cls.driver = helper.init_driver()
    time.sleep(4)
def initializaion():
    """Run the startup sequence: banner, OS info, connectivity check, driver.

    Calls the project helpers in order: print a centered banner, report OS
    information, verify internet access, and start the webdriver.

    NOTE(review): the function name itself is misspelled ("initializaion");
    it is kept unchanged because callers elsewhere may reference it.
    """
    # Fixed user-facing typo: banner read 'INIZIALIZATION'.
    printo('INITIALIZATION', 'center', True)
    info_OS()
    check_internet()
    init_driver()
    # Fixed user-facing typo: banner read 'FINISH INIZIALIZATION'.
    printo('FINISH INITIALIZATION', 'center', True)
def setUpClass(cls):
    """Create the shared WebDriver once for the whole test class.

    Stored on the class so all test methods reuse the same session;
    the short sleep lets the browser finish starting up.
    """
    cls.driver = helper.init_driver()
    time.sleep(2)
def main():
    """Scrape every club's player roster and dump the data to JSON.

    First resolves the club -> API-id mapping (scraped and cached to the
    CLUBS file on first run, loaded from it afterwards), then for each
    club scrapes the player list and each player's detail page.
    """
    driver = init_driver()
    clubs = {}
    my_file = Path(CLUBS)
    if not my_file.is_file():
        # No cached club list yet: scrape the club dropdown from the site.
        class_name = "dropDown"
        html = grab_html_by_class(driver, class_name, URL, leave_open=True)
        soup = BeautifulSoup(html, "html.parser")
        # Filter available clubs and their API ids
        club_dropdown = soup.find('ul', attrs={'data-dropdown-list': 'clubs'})
        # First <li> is skipped — presumably an "All clubs" placeholder
        # entry; TODO confirm against the live page.
        ignore_first_line = True
        clubs = {}
        for row in club_dropdown.findAll('li'):
            if ignore_first_line:
                ignore_first_line = False
            else:
                clubs[row.get_text()] = int(row['data-option-id'])
        # Cache the mapping so later runs take the else-branch below.
        with open(CLUBS, 'w') as outfile:
            values = [{"club": k, "api": v} for k, v in clubs.items()]
            json.dump(values, outfile, indent=4)
    else:
        clubs = get_clubs()
    for club_name, api_id in clubs.items():
        club_players = {}
        players = []
        # se=79 pins a season; cl= selects the club by its API id.
        url = URL + '?se=79&cl={}'.format(api_id)
        class_name = "playerName"
        html = grab_html_by_class(driver, class_name, url, leave_open=True)
        soup = BeautifulSoup(html, "html.parser")
        # NOTE(review): despite the name, this tbody holds player rows.
        managers_list = soup.find('tbody', attrs={'class': 'dataContainer'})
        for row in managers_list.findAll('tr'):
            player = {}
            column_index = 0
            player_name, position = '', ''
            for column in row.findAll('td'):
                if column_index == 0:
                    # Name cell also carries the link to the detail page.
                    player_name = column.get_text()
                    player_link = column.find('a')['href']
                    # href is protocol-relative ("//..."): prefix scheme.
                    player_link = 'https:{}'.format(player_link)
                elif column_index == 1:
                    position = column.get_text()
                else:
                    break
                column_index += 1
            # end for
            # Fresh driver per player page; closed after the grab.
            player_html = grab_html_by_class(
                init_driver(), 'number', player_link,
                leave_open=False, scroll=True)
            playersoup = BeautifulSoup(player_html, "html.parser")
            # -1 marks "shirt number not found on the page".
            player_number = -1
            try:
                player_number = playersoup.find(
                    'div', attrs={'class': 'number'}).get_text()
            except AttributeError:
                pass
            column_index = 0
            personal_lists = playersoup.find(
                'div', attrs={'class': 'personalLists'})
            nationality, age, dob, height, weight = '', '', '', '', ''
            # AttributeError is swallowed: personal_lists may be None when
            # the detail page lacks the personal-info section.
            try:
                for detail in personal_lists.findAll(
                        'div', attrs={'class': 'info'}):
                    # Positional mapping of the info cells — assumes the
                    # site renders them in this fixed order; TODO confirm.
                    if column_index == 0:
                        nationality = detail.get_text().replace('\n', '')
                    elif column_index == 1:
                        age = detail.get_text()
                    elif column_index == 2:
                        dob = detail.get_text()
                    elif column_index == 3:
                        height = detail.get_text()
                    elif column_index == 4:
                        weight = detail.get_text()
                    else:
                        break
                    column_index += 1
            except AttributeError:
                pass
            print("Visit complete")
            player["name"] = player_name
            player["position"] = position
            player["shirt_number"] = player_number
            player["nationality"] = nationality
            player["age"] = age
            player["dob"] = dob
            player["height"] = height
            player["weight"] = weight
            players.append(player)
        # end for
        club_players[club_name] = players
        # NOTE(review): `path` is computed but never used — the write below
        # targets PLAYERS, so each club iteration overwrites the same file
        # and only the last club survives. Looks like open(path, 'w') was
        # intended; confirm before changing output behavior.
        path = '{}.json'.format(club_name.replace(' ', '_'))
        with open(PLAYERS, 'w') as outfile:
            values = [{"team": k, "players": v}
                      for k, v in club_players.items()]
            json.dump(values, outfile, ensure_ascii=False, indent=4)
        print('Writing to JSON complete')
    # end for
    driver.quit()