# Marker that starts each magnet URI. Some sites (the original comment names
# "torrentao") glue several magnet URIs together inside a single href.
_MAGNET_MARKER = 'magnet:?xt=urn'
# Attachment extensions looked for in a download label, in priority order.
_ATTACHMENT_EXTENSIONS = ('.torrent', '.smi', '.srt', '.ass')
# Characters replaced by a space when a label is turned into a file name.
_FILENAME_FORBIDDEN_CHARS = '\\/:*?"<>|'


def _split_magnet_links(href):
    """Return every magnet URI contained in *href*, in order of appearance.

    When the marker does not occur at all (e.g. ``magnet:?dn=...&xt=urn...``)
    the whole href is returned as a single link instead of a bogus slice.
    """
    first = href.find(_MAGNET_MARKER)
    if first == -1:
        return [href]
    # The text starts with the marker, so the first split piece is empty.
    return [_MAGNET_MARKER + tail
            for tail in href[first:].split(_MAGNET_MARKER)[1:]]


def _subtitle_filename(label):
    """Return the sanitised subtitle file name found in *label*, or ''.

    The first extension of ``_ATTACHMENT_EXTENSIONS`` present in the label
    decides the result; a ``.torrent`` hit yields '' (not downloaded).
    """
    for ext in _ATTACHMENT_EXTENSIONS:
        pos = label.find(ext)
        if pos == -1:
            continue
        if ext == '.torrent':
            return ''
        name = label[:pos + len(ext)]
        for char in _FILENAME_FORBIDDEN_CHARS:
            name = name.replace(char, ' ')
        return name
    return ''


def _report_entity(entity):
    """Print a collected entry; a console that cannot encode it is ignored."""
    try:
        print('TITLE : %s\nLINK : %s' % (entity['title'], entity['link']))
    except (UnicodeError, IOError):
        # Best effort only: logging must not abort link collection.
        pass


def GetList(driver, site, cate):
    """Collect magnet links for one board category of a site.

    Phase 1 walks the listing pages and records the title and detail URL of
    every entry whose href contains ``cate``.  Phase 2 opens each detail page,
    extracts magnet links and, when enabled, downloads subtitle attachments.

    Args:
        driver: Selenium-style driver; used through ``get``,
            ``find_element(s)_by_xpath``, ``switch_to_frame`` and
            ``page_source``.
        site: Mapping of site settings.  Keys read here:
            ``TORRENT_SITE_URL``, ``TORRENT_SITE_TITLE``, ``XPATH_LIST_TAG``
            (must contain ``[%s]``), and optionally ``MAX_PAGE`` (default 1),
            ``SITE_TYPE`` + ``BO_TABLE_URL``, ``QUERY``, ``STEP`` (default 1),
            ``START_INDEX`` (default 1), ``HOW``, ``MAGNET_REGAX`` +
            ``MAGNET_MAKE_URL``, ``DOWNLOAD_FILE`` (``'ON'`` enables
            downloads), ``DOWNLOAD_REGEX`` (needs named groups ``filename``
            and ``url``), ``DOWNLOAD_PATH`` and ``SLEEP`` (seconds).
        cate: Board/category name substituted into the listing URL.

    Returns:
        A list of ``{'title': ..., 'link': ...}`` dicts.

    Raises:
        Whatever ``WebDriverWait.until`` raises when the listing rows cannot
        be located; that wait is intentionally not guarded.  Errors on
        individual entries and detail pages are printed and skipped.
    """
    # --- Phase 1: build the index of detail pages -------------------------
    index_items = []
    for page in range(1, site.get('MAX_PAGE', 1) + 1):
        print('PAGE : %s' % page)
        if 'SITE_TYPE' not in site:
            url = '%s/bbs/board.php?bo_table=%s&page=%s' % (
                site['TORRENT_SITE_URL'], cate, page)
        else:
            url = site['BO_TABLE_URL'] % cate
        if 'QUERY' in site:
            url += site['QUERY']
        print('URL : %s' % url)
        driver.get(url)

        entry_xpath = site['XPATH_LIST_TAG']
        # Dropping the '[%s]' index selects all rows at once.
        rows_xpath = entry_xpath[:entry_xpath.find('[%s]')]
        rows = WebDriverWait(driver, 3).until(
            lambda d: d.find_elements_by_xpath(rows_xpath))
        step = site.get('STEP', 1)
        start = site.get('START_INDEX', 1)
        for i in range(start, len(rows) + 1, step):
            try:
                anchor = WebDriverWait(driver, 3).until(
                    lambda d: d.find_element_by_xpath(entry_xpath % i))
                href = anchor.get_attribute('href')
                if cate not in href:
                    continue
                index_items.append({
                    'title': anchor.text.strip(),
                    'detail_url': href,
                })
            except Exception:
                print('NOT BBS : %s' % i)
                traceback.print_exc()

    # --- Phase 2: extract links from every detail page --------------------
    how = site.get('HOW')
    entries = []
    # Links already emitted by the href path.  `how` is fixed for the whole
    # call, so the other extraction paths never feed the duplicate check.
    seen_links = set()
    for item in index_items:
        print('URL : %s' % item['detail_url'])
        driver.get(item['detail_url'])

        if how != 'USING_MAGNET_REGAX':
            try:
                # TODO: frame handling is hard-coded for the 'tfreeca' site.
                if site['TORRENT_SITE_TITLE'] == 'tfreeca':
                    driver.switch_to_frame("external-frame")
                if how == 'INCLUDE_MAGNET_IN_INPUT':
                    magnet_xpath = "//input[starts-with(@value,'magnet')]"
                else:
                    magnet_xpath = "//a[starts-with(@href,'magnet')]"
                elements = WebDriverWait(driver, 10).until(
                    lambda d: d.find_elements_by_xpath(magnet_xpath))
                for element in elements:
                    if (how == 'INCLUDE_MAGNET_IN_LIST_AND_INCLUDE_LIST_ON_VIEW'
                            and not element.text.startswith('magnet')):
                        break
                    if how == 'INCLUDE_MAGNET_IN_INPUT':
                        entity = {
                            'title': item['title'],
                            'link': element.get_attribute('value'),
                        }
                        print(entity['link'])
                        entries.append(entity)
                        _report_entity(entity)
                        continue
                    # One WebDriver round trip per element instead of one per
                    # string operation.
                    for link in _split_magnet_links(
                            element.get_attribute('href')):
                        if link in seen_links:
                            continue
                        seen_links.add(link)
                        entity = {'title': item['title'], 'link': link}
                        entries.append(entity)
                        _report_entity(entity)
            except Exception:
                traceback.print_exc()
        else:
            try:
                pattern = re.compile(site['MAGNET_REGAX'], re.IGNORECASE)
                for found in pattern.findall(driver.page_source):
                    entity = {
                        'title': item['title'],
                        'link': site['MAGNET_MAKE_URL'] % found,
                    }
                    entries.append(entity)
                    _report_entity(entity)
            except Exception:
                traceback.print_exc()

        # --- Attachment download ------------------------------------------
        if site.get('DOWNLOAD_FILE') == 'ON':
            try:
                use_regex = 'DOWNLOAD_REGEX' in site
                if use_regex:
                    download_xpath = "//a[contains(@href,'bbs/download.php')]"
                else:
                    prefix = '%s/bbs/download.php' % site['TORRENT_SITE_URL']
                    download_xpath = "//a[starts-with(@href,'%s')]" % prefix
                anchors = WebDriverWait(driver, 5).until(
                    lambda d: d.find_elements_by_xpath(download_xpath))
                download_re = None
                if use_regex:
                    download_re = re.compile(site['DOWNLOAD_REGEX'],
                                             re.IGNORECASE)
                for a_tag in anchors:
                    if download_re is None:
                        label = a_tag.text.replace('\n', ' ').replace('\r', '')
                        url = a_tag.get_attribute('href')
                    else:
                        match = download_re.search(a_tag.get_attribute('href'))
                        if not match:
                            continue
                        label = match.group('filename')
                        url = match.group('url')
                        # Rebuild an absolute URL from the download.php part.
                        url = (site['TORRENT_SITE_URL'] + '/' +
                               url[url.find('bbs/download.php'):])
                    filename = _subtitle_filename(label)
                    if filename:
                        print('DOWNLOAD : %s' % filename)
                        download(driver, url, filename,
                                 site.get('DOWNLOAD_PATH'))
            except Exception:
                traceback.print_exc()

        if 'SLEEP' in site:
            time.sleep(site['SLEEP'])
    return entries
def GetList(driver, site, cate):
    """Collect magnet links for one board category of a site.

    First the listing pages are walked and the title/detail URL of every
    entry whose href contains ``cate`` is recorded; then each detail page is
    opened to extract magnet links and, when enabled, subtitle attachments
    are downloaded.

    Args:
        driver: Selenium-style driver; used through ``get``,
            ``find_element(s)_by_xpath`` and ``page_source``.
        site: Mapping of site settings.  Keys read here:
            ``TORRENT_SITE_URL``, ``XPATH_LIST_TAG`` (must contain ``[%s]``),
            and optionally ``MAX_PAGE`` (defaults to 1 when absent),
            ``STEP`` (default 1), ``HOW`` (only ``'USING_MAGNET_REGAX'`` is
            handled, together with ``MAGNET_REGAX`` and ``MAGNET_MAKE_URL``)
            and ``DOWNLOAD_FILE`` (``'ON'`` enables downloads).
        cate: Board/category name substituted into the listing URL.

    Returns:
        A list of ``{'title': ..., 'link': ...}`` dicts.

    Raises:
        Whatever ``WebDriverWait.until`` raises when the listing rows cannot
        be located; that wait is intentionally not guarded.  Errors on
        individual entries and detail pages are printed and skipped.
    """
    magnet_marker = 'magnet:?xt=urn'

    # --- Build the index of detail pages ----------------------------------
    index_items = []
    for page in range(1, site.get('MAX_PAGE', 1) + 1):
        print('PAGE : %s' % page)
        url = '%s/bbs/board.php?bo_table=%s&page=%s' % (
            site['TORRENT_SITE_URL'], cate, page)
        print('URL : %s' % url)
        driver.get(url)

        entry_xpath = site['XPATH_LIST_TAG']
        # Dropping the '[%s]' index selects all rows at once.
        rows_xpath = entry_xpath[:entry_xpath.find('[%s]')]
        rows = WebDriverWait(driver, 3).until(
            lambda d: d.find_elements_by_xpath(rows_xpath))
        step = site.get('STEP', 1)
        for i in range(1, len(rows) + 1, step):
            try:
                anchor = WebDriverWait(driver, 3).until(
                    lambda d: d.find_element_by_xpath(entry_xpath % i))
                href = anchor.get_attribute('href')
                if cate not in href:
                    continue
                index_items.append({'title': anchor.text, 'detail_url': href})
            except Exception:
                print('NOT BBS : %s' % i)
                traceback.print_exc()

    # --- Extract links from every detail page -----------------------------
    entries = []
    seen_links = set()  # links already emitted by the href path
    for item in index_items:
        print('URL : %s' % item['detail_url'])
        driver.get(item['detail_url'])

        if 'HOW' not in site:
            try:
                anchors = WebDriverWait(driver, 10).until(
                    lambda d: d.find_elements_by_xpath(
                        "//a[starts-with(@href,'magnet')]"))
                for anchor in anchors:
                    # Fetch the attribute once; each call is a WebDriver
                    # round trip.
                    href = anchor.get_attribute('href')
                    print('HREF : %s' % href)
                    # Some sites (the original comment names "torrentao")
                    # glue several magnet URIs together in a single href.
                    first = href.find(magnet_marker)
                    if first == -1:
                        # No marker at all: keep the href as one link rather
                        # than slicing from index -1.
                        links = [href]
                    else:
                        links = [magnet_marker + tail for tail in
                                 href[first:].split(magnet_marker)[1:]]
                    for link in links:
                        if link in seen_links:
                            continue
                        seen_links.add(link)
                        entity = {'title': item['title'], 'link': link}
                        entries.append(entity)
                        print('TITLE : %s\nLINK : %s' % (entity['title'],
                                                         entity['link']))
            except Exception:
                traceback.print_exc()
        elif site['HOW'] == 'USING_MAGNET_REGAX':
            try:
                pattern = re.compile(site['MAGNET_REGAX'], re.IGNORECASE)
                for found in pattern.findall(driver.page_source):
                    entity = {
                        'title': item['title'],
                        'link': site['MAGNET_MAKE_URL'] % found,
                    }
                    entries.append(entity)
                    print('TITLE : %s\nLINK : %s' % (entity['title'],
                                                     entity['link']))
            except Exception:
                traceback.print_exc()

        # --- Attachment download ------------------------------------------
        if site.get('DOWNLOAD_FILE') == 'ON':
            try:
                prefix = '%s/bbs/download.php' % site['TORRENT_SITE_URL']
                anchors = WebDriverWait(driver, 10).until(
                    lambda d: d.find_elements_by_xpath(
                        "//a[starts-with(@href,'%s')]" % prefix))
                for a_tag in anchors:
                    label = a_tag.text.replace('\n', ' ').replace('\r', '')
                    filename = ''
                    # The first matching extension decides; a '.torrent'
                    # label is recognised but never downloaded.
                    for ext in ('.torrent', '.smi', '.srt', '.ass'):
                        pos = label.find(ext)
                        if pos != -1:
                            if ext != '.torrent':
                                filename = label[:pos + len(ext)]
                            break
                    if filename:
                        print('DOWNLOAD : %s' % filename)
                        download(driver, a_tag.get_attribute('href'), filename)
            except Exception:
                traceback.print_exc()
    return entries
# Columns (0-based) left out of every scraped row.
SKIPPED_COLUMNS = (3, 5)

# Hover over the third header menu entry so that its sub-menu gets rendered.
mercato = WebDriverWait(driver, 5).until(
    EC.presence_of_element_located(
        (By.XPATH, "/html/body/div[2]/header/div/div[2]/ul/li[3]/a")))
ActionChains(driver).move_to_element(mercato).perform()

# Click the first sub-menu entry once it is clickable.  click() returns None,
# so `quotazioni` ends up holding None.
quotazioni_link = WebDriverWait(driver, 5).until(
    EC.element_to_be_clickable(
        (By.XPATH,
         "/html/body/div[2]/header/div/div[2]/ul/li[3]/div/ul/li[1]/a")))
quotazioni = quotazioni_link.click()

# Wait for the data table and read all of its rows.
table_element = WebDriverWait(driver, 5).until(
    EC.presence_of_element_located(
        (By.XPATH,
         "/html/body/div[2]/div[1]/div[3]/div/div[1]/div[2]/div[2]/table")))
rows = table_element.find_elements(By.TAG_NAME, "tr")

# The first row is skipped (presumably the header row -- confirm against the
# page); the remaining rows keep the text of every non-skipped cell.
# To be stored in a more appropriate structure.
records = [
    [cell.text
     for position, cell in enumerate(row.find_elements(By.TAG_NAME, "td"))
     if position not in SKIPPED_COLUMNS]
    for row in rows[1:]
]
table = pd.DataFrame(records)

# Save next to the script (sys.path[0]) under a timestamped file name.
ct = datetime.datetime.now().strftime("%d-%m-%y_%H-%M-%S")
csv_path = sys.path[0] + '/' + ct + ".csv"
table.to_csv(csv_path)

########################################################################################
# DATA ANALYSIS
#######################################################################################
livros = WebDriverWait(browser, 60).until( EC.presence_of_all_elements_located(( By.XPATH, "/html/body/div/div/div/div/div/div/div/div/div/div/div/div/div/div/div/div/div/a/section/div/div/h2" ))) precos = WebDriverWait(browser, 60).until( EC.presence_of_all_elements_located(( By.XPATH, "/html/body/div/div/div/div/div/div/div/div/div/div/div/div/div/div/div/div/div/a/section/div/div/div" ))) a = precos[2] precos.append(a) l = 2 for livro in livros: if precos[l].text == "": arquivo.write( str( str(nome_categoria.upper()) + "|" + str(nome_subcategoria.upper()) + "|" + str(livro.text) + "| Produto fora de estoque\n")) print( str(nome_categoria.upper()) + "|" +