def get_attraction_urls(search_results):
    """Collect relative attraction URLs from a search-results HTML document.

    The primary source is the ``onclick`` attribute found on the children of
    every ``.result`` element: the text between the first ``'/`` and the
    following ``')`` is taken as the URL (without its leading slash).  If that
    yields nothing, the ``href`` of the children of every ``.listing_title``
    element is used instead, again with its first character dropped.

    Elements whose attribute is missing or does not contain the expected
    markers are skipped instead of aborting the whole extraction.

    Args:
        search_results: HTML markup of the search-results page.

    Returns:
        A list of URL strings; empty when nothing could be extracted.
    """
    page = PyQuery(search_results, parser='html')
    urls = []

    for item in page(".result").items():
        # ``items()`` already yields PyQuery objects, no re-wrapping needed.
        onclick = item.children().attr['onclick']
        if not onclick:
            # pyquery returns None for a missing attribute.
            continue
        start = onclick.find("'/")
        if start == -1:
            continue
        start += 2
        # Look for the closing marker only after the opening one so an
        # earlier "')" in the handler cannot produce an empty/garbled slice.
        end = onclick.find("')", start)
        if end == -1:
            continue
        urls.append(onclick[start:end])

    if not urls:
        # Fallback markup: plain links inside ``.listing_title``.
        for item in page(".listing_title").items():
            href = item.children().attr['href']
            if href:
                urls.append(href[1:])  # drop the first character of the href

    return urls
def get_attraction_urls(search_results):
    """Return the relative attraction URLs found in a search-results page.

    NOTE(review): another definition of ``get_attraction_urls`` appears
    earlier in this file; being the later one, this definition is the one
    that stays bound to the name -- confirm the duplicate is intentional.

    Extraction strategy:

    1. For each ``.result`` element, read the ``onclick`` attribute of its
       children and take the text enclosed by ``'/`` ... ``')``.
    2. If step 1 produced nothing, read the ``href`` attribute of the
       children of each ``.listing_title`` element and drop its first
       character.

    Entries with a missing attribute or without both markers are ignored
    rather than raising, so one malformed result cannot abort the scrape.

    Args:
        search_results: HTML markup of the search-results page.

    Returns:
        A list of URL strings (possibly empty).
    """
    page = PyQuery(search_results, parser='html')

    urls = []
    for result in page(".result").items():
        handler = result.children().attr['onclick']
        if not handler:
            continue  # attribute absent (pyquery yields None) or empty
        # Split on the opening marker first, then on the closing marker in
        # the remainder, so the closing marker is always located after it.
        _, opened, remainder = handler.partition("'/")
        path, closed, _ = remainder.partition("')")
        if opened and closed:
            urls.append(path)

    if urls:
        return urls

    # Fallback markup: plain links inside ``.listing_title``.
    for title in page(".listing_title").items():
        link = title.children().attr['href']
        if link:
            urls.append(link[1:])  # drop the first character of the href
    return urls
def get_info(detail_url, timeout=30):
    """Download a chapter page and scrape the values embedded in it.

    Args:
        detail_url: URL of the page to fetch.
        timeout: Seconds to wait for the HTTP response before giving up
            (passed to ``requests.get``).  Without a timeout a stalled
            server would block the caller indefinitely.

    Returns:
        A tuple ``(nonce, data, chapter, chapters, name)`` where

        * ``nonce`` is the right-hand side of the *second*
          ``window["n...e"] = ...;`` assignment found in the page,
        * ``data`` is the quoted value following ``var DATA``,
        * ``chapter`` is the current chapter heading (spaces removed)
          prefixed with its 1-based, zero-padded position in the catalogue,
        * ``chapters`` is every catalogue title prefixed the same way,
        * ``name`` is the text inside ``《...》`` in the page ``<title>``.

        ``None`` is returned when anything goes wrong (network error,
        unexpected markup, ...); the URL and the error are printed.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    }
    try:
        res = requests.get(detail_url, headers=headers, timeout=timeout).text

        # Raw strings keep the regex escapes (\[ \] \s) out of Python's own
        # string-escape processing; the patterns themselves are unchanged.
        nonce = re.findall(r'window\["n.*?e"\]\s=\s(.*?);', res)[1]
        data = re.findall(r"var DATA.*?'(.*?)'", res)[0]
        heading = re.findall(r'title-comicHeading">(.*?)<', res)[0].replace(' ', '')

        titles = PyQuery(res)(
            '#catalogueList li .tool_chapters_list_title').text().split()

        # Position of the current chapter inside the catalogue (1-based).
        chapter = str(titles.index(heading) + 1).rjust(3, '0') + heading

        # enumerate() numbers every entry by its real position; looking each
        # title up with list.index() was quadratic and gave duplicate titles
        # the same number.
        chapters = [
            str(position).rjust(3, '0') + title
            for position, title in enumerate(titles, start=1)
        ]

        name = re.findall('<title>《(.*?)》', res)[0]
        return (nonce, data, chapter, chapters, name)
    except Exception as er:
        # Best-effort scraper: report the failing URL and signal failure
        # to the caller with an explicit None.
        print(detail_url, er)
        return None
def verify_token(self):
    """Click the dataset entry listed after the "表名" label and return its id.

    The method waits (up to the module-level ``wait`` seconds) for an entry
    of the settings list to become clickable, reads the text of the
    ``.list-group-item-action`` items inside the first ``.card-control``
    element, takes the token that follows ``"表名"``, and clicks the element
    whose ``id`` is ``"dp_ads." + token``.  The whole sequence is attempted
    up to five times with a three-second pause after each failure.

    Returns:
        The element id, i.e. ``"dp_ads." + <token following "表名">``.  As in
        the original implementation the id is returned once it has been
        resolved, even if the click itself kept failing.

    Raises:
        RuntimeError: If the id could not be resolved in any attempt
            (chained from the last underlying error).
    """
    list_item_selector = "body > app-root > app-golden-main > app-golden-content > main > div > div > aside.block-setting > div > div.card-body.d-flex.flex-column > div.overflow-container.flex-grow-1 > ul > li"

    table_id = None
    last_error = None
    for _ in range(5):
        try:
            WebDriverWait(self.driver, wait).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, list_item_selector)))
            # find_element(s)(By..., ...) works on Selenium 3 and 4; the
            # find_element(s)_by_* helpers were removed in Selenium 4.3.
            html = self.driver.find_elements(By.CSS_SELECTOR, ".card-control")[
                0].get_attribute("innerHTML")
            tokens = PQ(html)('.list-group-item-action').text()
            tokens = tokens.replace(" ", "\n").split("\n")
            # The token right after the "表名" label is used to build the id.
            table_id = "dp_ads." + tokens[tokens.index("表名") + 1]
            self.driver.find_element(
                By.XPATH, '//*[@id="' + table_id + '"]').click()
            break
        except Exception as err:  # not a bare except: let Ctrl-C through
            last_error = err
            time.sleep(3)

    if table_id is None:
        # Previously this path ended in an UnboundLocalError on the return
        # statement; fail with an explicit, chained error instead.
        raise RuntimeError(
            'could not resolve the dataset listed after "表名"') from last_error
    return table_id
def click_dataset(self, lan):
    """Click the dataset entry listed after *lan* and return a heading's text.

    Args:
        lan: Label to look for among the ``.list-group-item-action`` texts
            of the first ``.card-control`` element; the token that follows
            it is used to build the id of the element to click.

    Returns:
        The ``innerText`` of the third-child ``h3`` inside the
        ``aside.block-fitler`` card body, read once it is visible.
        NOTE(review): the original comment suggests this heading is the
        "dimension conditions" title used to cross-check the page -- confirm.

    Raises:
        Whatever ``WebDriverWait.until`` raises when the heading does not
        become visible within the module-level ``wait`` seconds.
    """
    # PyQuery -> XPath: find the entry name in the parsed list text, then
    # click the element through an XPath on its id.  Up to five attempts,
    # pausing three seconds after each failure; failures are tolerated and
    # the method carries on to the visibility check below.
    for _ in range(5):
        try:
            # find_element(s)(By..., ...) works on Selenium 3 and 4; the
            # find_element(s)_by_* helpers were removed in Selenium 4.3.
            html = self.driver.find_elements(By.CSS_SELECTOR, ".card-control")[
                0].get_attribute("innerHTML")
            tokens = PQ(html)('.list-group-item-action').text()
            tokens = tokens.replace(" ", "\n").split("\n")
            target_xpath = '//*[@id="dp_ads.' + tokens[tokens.index(lan) + 1] + '"]'
            self.driver.find_element(By.XPATH, target_xpath).click()
            break
        except Exception:  # not a bare except: let Ctrl-C through
            time.sleep(3)

    # Cross-check against what the page shows: wait for the heading and
    # return its text.  (Selector text, including "fitler", is kept exactly
    # as in the original because it must match the page's markup.)
    heading_selector = "body > app-root > app-golden-main > app-golden-content > main > div > div > aside.block-fitler > div > div.card-body > h3:nth-child(3)"
    # until() returns the located element once the condition holds, so no
    # second lookup is needed.
    heading = WebDriverWait(self.driver, wait).until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, heading_selector)))
    return heading.get_attribute("innerText")