Beispiel #1
0
def main():
    """Scrape the club table and print it to stdout as a JSON document.

    The HTML returned by ``grab_html_by_class`` for ``URL`` is parsed, and
    every ``<tr>`` of the ``allTimeDataContainer`` table body becomes a
    ``{"club_id", "club", "stadium"}`` record under a top-level ``"clubs"``
    key.

    The document is serialized with ``json.dumps`` instead of being
    assembled from ``print`` fragments: the hand-built version left a
    trailing comma after the last record, padded every value with the
    spaces ``print`` inserts between its arguments, and did not escape
    quotes, so its output was not valid JSON.
    """
    # Local import so the function does not rely on the file's import header.
    import json

    html = grab_html_by_class(init_driver(), class_name="team", url=URL)
    soup = BeautifulSoup(html, "html.parser")

    club_list = soup.find('tbody', attrs={'class': 'allTimeDataContainer'})

    clubs = []
    for row in club_list.find_all('tr'):
        stadium = row.find('td', attrs={'class': 'venue'}).get_text().strip()
        club_column = row.find('td', attrs={'class': 'team'})
        club = club_column.get_text().strip()
        # The club id is the fifth '/'-separated segment of the link target.
        club_id = club_column.find('a')['href'].split('/')[4].strip()

        clubs.append({"club_id": club_id, "club": club, "stadium": stadium})

    # ensure_ascii=False keeps accented names readable, as the raw prints did.
    print(json.dumps({"clubs": clubs}, ensure_ascii=False, indent=4))
Beispiel #2
0
def main():
    """Scrape the manager table and print it to stdout as a JSON document.

    The HTML returned by ``grab_html_by_class`` for ``URL`` is parsed, and
    every ``<tr>`` of the ``dataContainer`` table body becomes a
    ``{"manager", "club_short", "club_long"}`` record under a top-level
    ``"managers"`` key. Only the first two cells of a row are read: the
    first holds the manager name, the second holds the club name in a
    ``long`` and a ``short`` ``<span>``. A field whose cell is missing stays
    an empty string.

    The document is serialized with ``json.dumps`` instead of being
    assembled from ``print`` fragments: the hand-built version left a
    trailing comma after the last record, padded every value with the
    spaces ``print`` inserts between its arguments, and did not escape
    quotes, so its output was not valid JSON. Values are stripped of
    surrounding whitespace so the records hold clean strings.
    """
    # Local import so the function does not rely on the file's import header.
    import json

    html = grab_html_by_class(init_driver(), class_name="managerName", url=URL)
    soup = BeautifulSoup(html, "html.parser")

    managers_list = soup.find('tbody', attrs={'class': 'dataContainer'})

    managers = []
    for row in managers_list.find_all('tr'):
        manager_name, club_name_long, club_name_short = '', '', ''

        cells = row.find_all('td')
        if cells:
            # Manager column
            manager_name = cells[0].get_text().strip()
        if len(cells) > 1:
            # Team name column: long and short spellings live in two spans.
            club_cell = cells[1]
            club_name_long = club_cell.find(
                'span', attrs={'class': 'long'}).get_text().strip()
            club_name_short = club_cell.find(
                'span', attrs={'class': 'short'}).get_text().strip()

        managers.append({
            "manager": manager_name,
            "club_short": club_name_short,
            "club_long": club_name_long,
        })

    # ensure_ascii=False keeps accented names readable, as the raw prints did.
    print(json.dumps({"managers": managers}, ensure_ascii=False, indent=4))
Beispiel #3
0
# Mode switch for this script: True runs the link-scraping pass below.
# The ####&&&& marker appears to flag settings meant to be edited by hand.
get_link = True ####&&&&
#get_link = False


# get_data is always the negation of get_link, so exactly one mode is active.
get_data = (not get_link) # either get_link or get_data

# Because get_data == (not get_link), this condition is always true; the
# set-up below therefore runs in both modes.
if get_link or get_data:

	# 3- initialize website

	# Start page that the browser loads at the top of every iteration below.
	website = "https://www.glassdoor.com/index.htm"

	# Initialize the webdriver

	# init_driver() is defined outside this fragment; presumably it returns a
	# Selenium-style driver (browser.get is called on it below) -- confirm.
	browser = init_driver()
	#browser = webdriver.Chrome()

# 4- Scrape for links and brief data


if get_link :
	# Iteration counter; it is not incremented within the visible fragment,
	# so the increment presumably happens further down the loop body.
	iter_num = 0
	while iter_num <3: # runs while iter_num is below 3; "default 1" per the author ####&&&&
		print('Starting iteration number {}'.format(iter_num))
		# Pause for the duration returned by get_pause() before loading the page.
		sleep(get_pause())
		browser.get(website)

		# Initialize cities and jobs

		# Job titles used by this pass (the list is not consumed within this fragment).
		jobName_lst = ['Data Scientist', 'Data Analyst','Data Engineer']
Beispiel #4
0
 def setUpClass(cls):
     """Announce the test run and create the driver stored on the class.

     Prints a start message followed by the module docstring, stores the
     result of ``helper.init_driver()`` on ``cls.driver`` and then sleeps
     for four seconds.
     """
     for message in ("test starting", __doc__):
         print(message)

     driver = helper.init_driver()
     cls.driver = driver

     # Fixed pause after the driver has been created.
     time.sleep(4)
Beispiel #5
0
def initializaion():
    """Run the start-up steps between an opening and a closing message.

    Calls ``info_OS``, ``check_internet`` and ``init_driver`` in that order,
    emitting a message through ``printo`` before the first step and after
    the last one. The value returned by ``init_driver`` is not kept.
    """
    # Both messages are passed to printo with the same trailing arguments.
    message_options = ('center', True)

    printo('INIZIALIZATION', *message_options)

    info_OS()
    check_internet()
    init_driver()

    printo('FINISH INIZIALIZATION', *message_options)
Beispiel #6
0
 def setUpClass(cls):
     """Create the driver stored on the class, then pause briefly.

     Stores the result of ``helper.init_driver()`` on ``cls.driver`` and
     sleeps for two seconds afterwards.
     """
     driver = helper.init_driver()
     cls.driver = driver

     # Fixed pause after the driver has been created.
     time.sleep(2)
Beispiel #7
0
def _load_or_scrape_clubs(driver):
    """Return the ``{club name: API id}`` mapping, scraping it when not cached.

    If the file at ``CLUBS`` exists, the mapping comes from ``get_clubs()``.
    Otherwise the ``clubs`` dropdown on ``URL`` is parsed, the result is
    written to ``CLUBS`` as a list of ``{"club", "api"}`` objects, and the
    mapping is returned.
    """
    if Path(CLUBS).is_file():
        return get_clubs()

    html = grab_html_by_class(driver, "dropDown", URL, leave_open=True)
    soup = BeautifulSoup(html, "html.parser")

    # Filter available clubs and their API ids. The first <li> is skipped,
    # as before; presumably it is a placeholder entry rather than a club.
    club_dropdown = soup.find('ul', attrs={'data-dropdown-list': 'clubs'})
    clubs = {
        item.get_text(): int(item['data-option-id'])
        for item in club_dropdown.find_all('li')[1:]
    }

    with open(CLUBS, 'w') as outfile:
        values = [{"club": k, "api": v} for k, v in clubs.items()]
        json.dump(values, outfile, indent=4)

    return clubs


def _scrape_player_profile(player_link):
    """Fetch one player's profile page and return the details found on it.

    Returns a dict with the keys ``shirt_number``, ``nationality``, ``age``,
    ``dob``, ``height`` and ``weight``. ``shirt_number`` is ``-1`` when the
    page has no ``number`` element; the other fields stay empty strings when
    the ``personalLists`` section or an individual entry is missing.
    """
    # A separate driver is created for every profile and handed over with
    # leave_open=False; presumably grab_html_by_class closes it -- confirm.
    player_html = grab_html_by_class(
        init_driver(), 'number', player_link, leave_open=False, scroll=True)
    playersoup = BeautifulSoup(player_html, "html.parser")

    number_tag = playersoup.find('div', attrs={'class': 'number'})
    profile = {
        "shirt_number": number_tag.get_text() if number_tag is not None else -1,
        "nationality": '',
        "age": '',
        "dob": '',
        "height": '',
        "weight": '',
    }

    personal_lists = playersoup.find('div', attrs={'class': 'personalLists'})
    if personal_lists is not None:
        # The first five "info" blocks are read positionally, in this order.
        fields = ("nationality", "age", "dob", "height", "weight")
        info_blocks = personal_lists.find_all('div', attrs={'class': 'info'})
        for field, detail in zip(fields, info_blocks):
            profile[field] = detail.get_text()
        profile["nationality"] = profile["nationality"].replace('\n', '')

    return profile


def main():
    """Scrape every club's squad and write all of them to ``PLAYERS`` as JSON.

    For each club from ``_load_or_scrape_clubs`` the squad table is read,
    each player's profile page is visited, and the collected records are
    written to ``PLAYERS`` as a list of ``{"team", "players"}`` objects.

    Changes compared with the earlier implementation:

    * ``club_players`` now accumulates across clubs. It used to be reset for
      every club while ``PLAYERS`` was reopened in ``'w'`` mode, so the file
      only ever contained the last club processed.
    * The file is still rewritten after each club, so an interrupted run
      keeps everything collected so far.
    * ``driver.quit()`` runs in a ``finally`` block so the browser is
      released even when scraping fails part-way.
    * Table rows without any ``<td>`` are skipped; previously they raised
      ``NameError`` or silently reused the previous player's link.
    * ``PLAYERS`` is opened as UTF-8 because ``ensure_ascii=False`` writes
      names unescaped and the platform default encoding may not cover them.
    * The unused per-club ``path`` variable is gone.
    """
    driver = init_driver()
    try:
        clubs = _load_or_scrape_clubs(driver)

        club_players = {}
        for club_name, api_id in clubs.items():
            # 'se' is hard-coded to 79 (presumably a season id -- confirm);
            # 'cl' selects the club by its API id.
            url = URL + '?se=79&cl={}'.format(api_id)
            html = grab_html_by_class(driver, "playerName", url, leave_open=True)
            soup = BeautifulSoup(html, "html.parser")

            players = []
            squad_rows = soup.find('tbody', attrs={'class': 'dataContainer'})
            for row in squad_rows.find_all('tr'):
                cells = row.find_all('td')
                if not cells:
                    # No cells means no player link to follow.
                    continue

                # First cell: player name and profile link; second: position.
                player_name = cells[0].get_text()
                # The href is prefixed with the scheme; presumably the page
                # uses protocol-relative links ("//host/...") -- confirm.
                player_link = 'https:{}'.format(cells[0].find('a')['href'])
                position = cells[1].get_text() if len(cells) > 1 else ''

                player = {"name": player_name, "position": position}
                player.update(_scrape_player_profile(player_link))
                print("Visit complete")

                players.append(player)

            club_players[club_name] = players

            with open(PLAYERS, 'w', encoding='utf-8') as outfile:
                values = [{"team": k, "players": v} for k, v in club_players.items()]
                json.dump(values, outfile, ensure_ascii=False, indent=4)
                print('Writing to JSON complete')
    finally:
        driver.quit()