def main():
    """Scrape every URL selected in the SportStats GUI, then shut down.

    ``GUI('SportStats')`` supplies the URL list and a settings mapping; the
    list and ``settings['Path']`` are stored in the module globals
    ``urlPages`` and ``outputFolder``.  Each URL is handed to
    ``PageScrapping`` and retried after any failure until more than 30
    seconds have passed since the first attempt for that URL.  When that
    limit is exceeded the error is reported through ``GUIChangeError``, the
    browser is closed and ``GUIKill`` is called.
    """
    from selenium import webdriver
    global urlPages, outputFolder

    urlPages, settings = GUI('SportStats')
    outputFolder = settings['Path']

    # len() is used on purpose: the container type returned by GUI is not
    # visible from this function.
    if len(urlPages) == 0:
        # Nothing to scrape, so no browser is started.
        GUIKill()
        return

    driver = webdriver.Chrome()
    for urlPage in urlPages:
        urlInfo = 'URL: ' + urlPage
        initialTime = int(time.time())
        while True:
            try:
                PageScrapping(driver, urlInfo, urlPage)
            except Exception:
                # ``Exception`` rather than a bare except so Ctrl-C and
                # SystemExit are not swallowed by the retry loop.
                if int(time.time()) - initialTime > 30:
                    GUIChangeError(urlInfo + '\n Runtime ALOK Error - 202')
                    driver.quit()
                    GUIKill()
                    # GUIKill is defined elsewhere; if it returns instead of
                    # ending the process, stop here rather than retrying
                    # forever against a closed browser.
                    return
            else:
                break

    driver.quit()
    GUIKill()
    return
def CollectInnerData(driver):
    """Click each row link and collect the popup table shown for it.

    Every link matched by ``//tr[@role='row']//td[4]//a`` is clicked through
    JavaScript.  After a successful click the function polls the element
    matched by ``//div[@id='athlete-popup']`` until its ``innerHTML`` or
    ``style`` differs from the previously seen values, converts the new HTML
    with ``TableToData`` and appends the transposed first table to the
    result.  A failed click contributes an empty row instead.

    Args:
        driver: Selenium WebDriver positioned on the results page.

    Returns:
        The accumulated pandas object, or ``None`` when no popup table was
        collected.
    """
    # NOTE(review): DataFrame.append was removed in pandas 2.0; this function
    # relies on an older pandas.  Left untouched to keep the output identical.
    import pandas as pd

    innerXpath = "//div[@id='athlete-popup']"
    linkXpath = "//tr[@role='row']//td[4]//a"
    data = None
    void = 0  # clicks that failed before the first row was collected

    # Baseline popup state; a click counts as "loaded" once either differs.
    innerTableStyleOld = CollectTable(driver, innerXpath)[0].get_attribute('style')
    innerTableHTMLOld = CollectTable(driver, innerXpath)[0].get_attribute('innerHTML')

    wait_clickability_element(driver, linkXpath)
    for nameHTML in driver.find_elements_by_xpath(linkXpath):
        try:
            driver.execute_script("arguments[0].click();", nameHTML)
        except Exception:
            # Keep row alignment: remember the gap until there is a frame to
            # append an empty row to.
            if data is None:
                void += 1
            else:
                data = data.append(pd.Series(), ignore_index=True)
            continue

        initialTime = int(time.time())
        while True:
            try:
                innerTableHTML = CollectTable(
                    driver, innerXpath)[0].get_attribute('innerHTML')
                innerTableStyle = CollectTable(
                    driver, innerXpath)[0].get_attribute('style')
            except Exception:
                # The popup may be mid-update.  Treat it as "not changed yet"
                # but still fall through to the timeout check below, so a
                # lookup that keeps failing cannot spin forever.
                changed = False
            else:
                changed = (innerTableHTMLOld != innerTableHTML
                           or innerTableStyleOld != innerTableStyle)

            if changed:
                innerTableStyleOld = innerTableStyle
                innerTableHTMLOld = innerTableHTML
                innerDataRow = TableToData(innerTableHTML)[0].set_index(0).T
                if data is None:
                    data = innerDataRow
                    # Add the empty rows owed for clicks that failed earlier.
                    for _ in range(void):
                        data = data.append(pd.Series(), ignore_index=True)
                else:
                    data = data.append(innerDataRow)
                break

            if int(time.time()) - initialTime > 100:
                GUIChangeError('Runtime Error - 83')
                driver.quit()
                GUIKill()
                # GUIKill is defined elsewhere; if it returns, stop polling a
                # browser that has just been closed.
                return data

    return data
def PageScrapping(driver, urlInfo, urlPage):
    """Scrape every result page of one URL and write the collected data out.

    ``CollectData`` loads the URL and returns the driver to use.  While the
    page still shows row links (``//tr[@role='row']//td[4]//a``) the function
    reports progress, clicks every expanded row toggle, collects the page
    with ``CollectContentPage`` and, if a "next" arrow exists, clicks it and
    waits up to 10 seconds for the table wrapper HTML to change.  The
    accumulated data is then passed through ``ProcessData`` and
    ``CreateFile``.

    Args:
        driver: Selenium WebDriver instance.
        urlInfo: Text prefix used in GUI status messages.
        urlPage: URL of the results page to scrape.
    """
    rowLinkXpath = "//tr[@role='row']//td[4]//a"
    wrapperXpath = "//div[@class='ui-datatable-tablewrapper']"

    driver = CollectData(driver, urlPage)
    data = None
    pageNumber = 1

    # Keep paging for as long as the current page lists any row links.
    while len(driver.find_elements_by_xpath(rowLinkXpath)) > 0:
        GUIChangeStatus(urlInfo + ' Page: ' + str(pageNumber))

        viewbtnHTML = driver.find_elements_by_xpath(
            "//tr[@role='row']//div[contains(@aria-expanded, 'true')]")
        for viewbtn in viewbtnHTML:
            driver.execute_script("arguments[0].click();", viewbtn)
        # Single pause after the clicks, presumably to let the table
        # re-render before it is read.
        time.sleep(1)

        data = CollectContentPage(data, driver)

        # Snapshot of the table used to detect that the next page has loaded.
        firstLineHTMLOld = driver.find_elements_by_xpath(
            wrapperXpath)[0].get_attribute('innerHTML')
        nxtbtnHTML = driver.find_elements_by_xpath(
            "//div[@id='mainForm:pageNav']//a[contains(@class, 'fa-angle-right')]"
        )
        if len(nxtbtnHTML) == 0:
            break  # no "next" arrow: this was the last page

        driver.execute_script("arguments[0].click();", nxtbtnHTML[0])
        pageNumber += 1

        initialTime = int(time.time())
        while True:
            try:
                firstLineHTML = driver.find_elements_by_xpath(
                    wrapperXpath)[0].get_attribute('innerHTML')
            except Exception:
                # The wrapper can be missing mid-refresh.  Count that as
                # "unchanged" but still reach the timeout check below, so a
                # lookup that keeps failing cannot loop forever.
                changed = False
            else:
                changed = firstLineHTMLOld != firstLineHTML

            if changed:
                time.sleep(1)
                break

            if int(time.time()) - initialTime > 10:
                GUIChangeError('Runtime Error - 170')
                driver.quit()
                GUIKill()
                # GUIKill is defined elsewhere; if it returns, stop instead
                # of polling a browser that has just been closed.
                return

    data = ProcessData(data)
    CreateFile(urlPage, data, driver)
def CollectContent(data, driver):
    """Distribute the text of the result cells round-robin over the columns.

    The cells are the elements matched by
    ``//div[contains(@class,'link-to-irp')]/div``.  Cell ``i`` is appended to
    ``data[i % len(data)]``, so ``data`` is treated as an indexable
    collection of columns that support ``append``.

    Args:
        data: Existing columns, or ``None`` to start from ``CollectHeader``.
        driver: Selenium WebDriver positioned on the results page.

    Returns:
        The same ``data`` object (or the one built by ``CollectHeader``)
        with the cell texts appended in place.
    """
    results = driver.find_elements_by_xpath("//div[contains(@class,'link-to-irp')]/div")
    if data is None:
        data = CollectHeader(driver)

    num_columns = len(data)
    if num_columns == 0:
        # Without columns ``i % num_columns`` would divide by zero for every
        # cell; report the problem once instead of once per cell.
        if results:
            GUIChangeError('Runtime Error - 67')
        return data

    for i, result in enumerate(results):
        try:
            data[i % num_columns].append(result.text)
        except Exception:
            # Best effort: a cell that cannot be read is reported and
            # skipped.  ``Exception`` keeps Ctrl-C and SystemExit working.
            GUIChangeError('Runtime Error - 67')
    return data
def CreateFile(urlPage, data, driver):
    """Build the output file name and heading, then call ``CreateFinalFile``.

    The title is the text of the first ``h1`` under ``div#main``; the
    date/type line is the text of the first ``p`` under it.  When that line
    splits on ``•`` into exactly two parts, the file name is
    ``<title>_<date>_<type>``; otherwise only the title is used and
    "Procedure Error - 127" is reported.  ``/`` in the name is replaced by
    ``-``, and the name is appended directly to the module-level
    ``outputFolder``.

    Args:
        urlPage: URL that was scraped; written as the first heading line.
        data: Scraped data forwarded unchanged to ``CreateFinalFile``.
        driver: Selenium WebDriver still positioned on the results page.

    Raises:
        IndexError: If the page has no title element.
    """
    raceTitleHTML = driver.find_elements_by_xpath("//div[@id='main']//h1[1]")
    raceTitle = raceTitleHTML[0].text
    raceDateTypeHTML = driver.find_elements_by_xpath("//div[@id='main']//p[1]")

    # Read the paragraph once, so the heading below cannot fail with an
    # IndexError after the file-name fallback has already handled a missing
    # paragraph.
    raceDateTypeText = ''
    try:
        raceDateTypeText = raceDateTypeHTML[0].text
        [raceDate, raceType] = re.split(r"•", raceDateTypeText)
        fileString = raceTitle + "_" + raceDate + "_" + raceType
    except Exception:
        # Missing paragraph or not exactly two "•"-separated parts.
        fileString = raceTitle
        GUIChangeError("Procedure Error - 127")

    fileString = fileString.replace('/', '-')
    # NOTE(review): plain concatenation assumes outputFolder already ends
    # with a path separator; confirm against the GUI settings code.
    fileName = outputFolder + fileString + ".txt"
    heading = (urlPage + '\n' + raceTitle + '\n' + raceDateTypeText
               + '\n\nline 1\nline 2\nline 3\nline 4\n')
    CreateFinalFile(fileName, data, heading)
def main():
    """Scrape every URL selected in the AthLinks GUI, then shut down.

    ``GUI('AthLinks')`` supplies the URL list and a settings mapping; the
    list and ``settings['Path']`` are stored in the module globals
    ``urlPages`` and ``outputFolder``.  Chrome is started maximized with
    log level 3 and then minimized.  Each URL is handed to ``PageScrapping``
    and retried after any failure until more than 30 seconds have passed
    since the first attempt for that URL.  When that limit is exceeded the
    error is reported through ``GUIChangeError``, the browser is closed and
    ``GUIKill`` is called.
    """
    global urlPages, outputFolder

    urlPages, settings = GUI('AthLinks')
    outputFolder = settings['Path']

    # len() is used on purpose: the container type returned by GUI is not
    # visible from this function.
    if len(urlPages) == 0:
        # Nothing to scrape, so no browser is started.
        GUIKill()
        return

    chrome_options = webdriver.ChromeOptions()
    #chrome_options.add_argument("headless")
    #chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--start-maximized")
    chrome_options.add_argument('--log-level=3')
    driver = webdriver.Chrome(options=chrome_options)
    driver.minimize_window()

    for urlPage in urlPages:
        urlInfo = 'URL: ' + urlPage
        initialTime = int(time.time())
        while True:
            try:
                PageScrapping(driver, urlInfo, urlPage)
            except Exception:
                # ``Exception`` rather than a bare except so Ctrl-C and
                # SystemExit are not swallowed by the retry loop.
                if int(time.time()) - initialTime > 30:
                    GUIChangeError(urlInfo + '\n Runtime Error - 186')
                    driver.quit()
                    GUIKill()
                    # GUIKill is defined elsewhere; if it returns instead of
                    # ending the process, stop here rather than retrying
                    # forever against a closed browser.
                    return
            else:
                break

    driver.quit()
    GUIKill()
    return