-
Notifications
You must be signed in to change notification settings - Fork 0
/
results_scrapper.py
110 lines (88 loc) · 3.7 KB
/
results_scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from game import Game
import traceback
import time
def getChromeDriver(driver_location='/Users/aruki/code/chromedriver', download_folder='./pdf/'):
    """Return a Chrome WebDriver configured to download PDFs without prompting.

    Args:
        driver_location: Path to the chromedriver executable.
        download_folder: Directory where Chrome should save downloads. It may
            be relative; it is resolved against the current working directory.

    Returns:
        The configured ``webdriver.Chrome`` instance.
    """
    # Function-scope import so the file's import block does not need to change.
    import os

    # Chrome expects an absolute path for "download.default_directory". The
    # relative default ('./pdf/') was passed through unchanged before, and
    # downloads ended up in Chrome's default folder instead.
    download_dir = os.path.abspath(download_folder)

    # Chrome preferences controlling the download behaviour.
    prefs = {
        "download.default_directory": download_dir,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        # Save PDFs to disk instead of opening them in the built-in viewer.
        "plugins.always_open_pdf_externally": True,
    }
    chrome_options = Options()
    chrome_options.add_experimental_option('prefs', prefs)
    return webdriver.Chrome(driver_location, options=chrome_options)
def getMatchLinks(webdriver):
    """Walk the paginated results list and collect one Game per match link.

    The driver must already be showing the results page. On every page the
    function waits for the "next" pagination button, records every link
    matching ``//li/h3/a`` as a ``Game(text, href)``, and clicks "next" until
    the button carries the ``pagination-disabled`` class.

    Args:
        webdriver: A Selenium driver positioned on the results page.

    Returns:
        A list of ``Game`` objects, in the order the links were found.
    """
    partidos = []
    # One explicit wait object (10 s timeout) is reused for every page.
    wait = WebDriverWait(webdriver, 10)
    cont = 1
    while True:
        next_button = wait.until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'next-posts')))
        # Read the button state before scraping: the current page's results
        # must still be collected even when this is the last page.
        is_last_page = 'pagination-disabled' in next_button.get_attribute('class')

        # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which
        # newer Selenium releases no longer provide.
        results = webdriver.find_elements(By.XPATH, "//li/h3/a")
        print(' {} results in page {} !'.format(str(len(results)), cont))
        for result in results:
            partidos.append(Game(result.text, result.get_attribute('href')))

        if is_last_page:
            break

        next_button.click()
        # Wait for the transition animation to finish loading the next results.
        time.sleep(6)
        # Kept from the original: this sets the driver-wide implicit wait
        # timeout for later element lookups; it does not pause by itself.
        webdriver.implicitly_wait(6)
        cont += 1
    return partidos
def downloadGameResults(webdriver, partidos):
    """Open every game page and trigger the download of its results PDF.

    For each game the driver loads ``partido.link``, looks up the element with
    class ``s_pdf_download_link``, stores its ``href`` in
    ``partido.download_link`` and navigates to it. ``partido.downloaded`` is
    set to True on success and False when any step fails; a failure is
    reported on stdout and the loop continues with the next game.

    Args:
        webdriver: A Selenium driver used to load the pages.
        partidos: Iterable-with-length of game objects exposing ``titulo`` and
            ``link``; they are updated in place.

    Returns:
        The same ``partidos`` object that was passed in.
    """
    total = len(partidos)
    print('Games to download: {}'.format(total))
    # Loop through the gathered game links, numbering them from 1 for the log.
    for cont, partido in enumerate(partidos, start=1):
        print('Downloading Game result {}/{} {}'.format(str(cont),
                                                        str(total), partido.titulo))
        try:
            webdriver.get(partido.link)
            # Give the page a moment to render the download link.
            time.sleep(2)
            download = webdriver.find_element(By.CLASS_NAME, 's_pdf_download_link')
            href = download.get_attribute('href')
            partido.download_link = href
            webdriver.get(href)
            partido.downloaded = True
        except Exception:
            # Best effort per game: record the failure and keep going.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit still stop the run.
            partido.downloaded = False
            print('Encountered an issue downloading results for {}'.format(
                partido.titulo))
    return partidos
def make_csv_file(results, file='results.csv', echo=False):
    """Dump the games and their download links to a CSV file.

    A fixed header row is written first, followed by one line per result
    obtained from ``result.getValuesCSV()``.

    Args:
        results: Iterable of objects exposing ``getValuesCSV()``, which must
            return the already comma-joined row as a string.
        file: Path of the CSV file to create (overwritten if it exists).
        echo: When True, every written line is also printed to stdout.
    """
    headers = 'match,result_link,download_link,downloaded\n'
    # Explicit UTF-8 so rows containing accented characters are written the
    # same way on every platform instead of depending on the locale default.
    with open(file, 'w', encoding='utf-8') as f:
        if echo:
            print(headers + '\n')
        f.write(headers)
        for result in results:
            line = result.getValuesCSV() + '\n'
            if echo:
                print(line)
            f.write(line)