from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def fill_form(driver, search_argument):
    driver.find_element_by_class_name('one-way-option').click()
    print('success click one way')

    # value 11=PB, 12=SRG, 13=LMB, 8=GA, 10=GT, 9=TK. Worth separating this out into a class.
    destination_from = Select(driver.find_element_by_name('destination_from'))
    print('success click destination from')
    destination_from.select_by_value('12')  # worth making this an input
    print('success selecting destination value')

    # Worth separating this out into a class.
    destination_to = Select(driver.find_element_by_name('destination_to'))
    print('success selecting destination to')
    destination_to.select_by_value('10')  # worth making this an input
    print('success selecting destination value')

    # Maybe make a class so that it is easily customisable.
    driver.find_element_by_name('depart_date')
    print('success choosing depart date')
    driver.find_element_by_name('depart_date').click()

    # Wait until the datepicker is available.
    WebDriverWait(driver, timeout=3).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'ui-datepicker-calendar')))

    # Find the date picker and fill it in; worth a for loop or dependency injection?
    # Relevant classes: 'ui-datepicker-month', 'ui-datepicker-year', 'ui-datepicker-calendar'.
    # Example day cell:
    # <td class=" " data-handler="selectDay" data-event="click" data-month="7" data-year="2019">
    #     <a class="ui-state-default" href="#">5</a></td>
    # select.input[name=depart_date, value='20 November 2019']
    calendar = driver.find_element_by_class_name('ui-datepicker-calendar')

    # The adult count is a select2 widget; Selenium's Select has no select_by_title,
    # so click the container to open the dropdown instead.
    search_field_adult = driver.find_element_by_id('select2-adult-4a-container')
    search_field_adult.click()

    # Look for the search button and click it.
    driver.find_element_by_class_name('search-avaibility').click()
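# Hedged usage sketch for fill_form above: the booking-page URL is a placeholder
# (the original snippet never shows it), and search_argument is accepted but unused.
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://example.com/booking')  # placeholder URL
fill_form(driver, search_argument=None)
driver.quit()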
def wait_for_presence(self, selector='', **kwargs):
    '''
    Wait for an element to be present. (Does not need to be visible.)

    Parameters
    ----------
    selector: str
        A CSS selector to search for. This can be any valid CSS selector.

    kwargs:
        Passed on to _wait_for.
    '''
    self._wait_for(
        EC.presence_of_element_located((By.CSS_SELECTOR, selector)), **kwargs)
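# A plausible _wait_for helper that wait_for_presence above could delegate to -- a sketch
# only, assuming the wrapper stores its WebDriver as self.driver; the 10-second default
# timeout is illustrative, not taken from the original code.
from selenium.webdriver.support.wait import WebDriverWait

def _wait_for(self, condition, timeout=10):
    # Block until the expected condition holds, or raise TimeoutException.
    return WebDriverWait(self.driver, timeout).until(condition)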
def process_request(self, request, spider):
    self.logger.debug('PhantomJS is Starting')
    page = request.meta.get('page', 1)
    try:
        self.browser.get(request.url)
        if page > 1:
            input = self.wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
            submit = self.wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')))
            input.clear()
            input.send_keys(page)
            submit.click()
        self.wait.until(
            EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)))
        return HtmlResponse(url=request.url, body=self.browser.page_source,
                            request=request, encoding='utf-8', status=200)
    except TimeoutException:
        return HtmlResponse(url=request.url, status=500, request=request)
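# Usage sketch: a Selenium/PhantomJS downloader middleware like the one above is enabled
# from the Scrapy project's settings.py. The module path and priority are placeholders.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.SeleniumMiddleware': 543,  # hypothetical path to the class above
}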
def find_elements(self, locator, timeout=10):
    """Locate a group of elements. Takes a locator tuple, e.g. ('xpath', "//*[@class='one']")."""
    elements = WebDriverWait(self.driver, timeout, 1).until(
        EC.presence_of_all_elements_located(locator))
    return elements
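# Hedged usage sketch: find_elements above takes a (strategy, value) locator tuple, the
# same shape WebDriverWait/expected_conditions use. 'page' is a hypothetical instance of
# the class that owns find_elements, and the selector is illustrative.
from selenium.webdriver.common.by import By

links = page.find_elements((By.CSS_SELECTOR, 'ul.nav > li > a'), timeout=5)
print(len(links))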
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
try:
    browser.get("https://www.baidu.com")
    input = browser.find_element_by_id('kw')
    input.send_keys('python')
    input.send_keys(Keys.ENTER)
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
    print(browser.current_url)
    print(browser.get_cookies())
    print(browser.page_source)
finally:
    browser.close()
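# Note on the expected condition above: Selenium provides EC.presence_of_element_located
# for a single element and EC.presence_of_all_elements_located for a list of matches; both
# take a (By, value) locator tuple. Placed inside the try block above, the list variant
# would look like this (the '.result' selector is illustrative, not from the original script):
results = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#content_left .result')))
print('result blocks found:', len(results))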
def article_title(self, pages, related):
    for page_id in pages:
        if page_id == "jes":
            pass
            # print("in jes")
            page_content = requests.get(pages[page_id]).content
            self.soup = BeautifulSoup(page_content, 'html.parser')
            lists = [
                i for i in self.soup.find("form", attrs={
                    "action": "/gca"
                }).findAll("div") if i.attrs["class"].count("toc-level") > 0
            ]
            titles_jes = []
            titles_jes_html = []
            for i in lists:
                data_extracted = {}
                lists2 = [
                    j for j in i.findAll("div")
                    if j.attrs["class"].count("toc-level") > 0
                ]
                if lists2:
                    if i.find("h2"):
                        data_extracted.update({i.find("h2").text: {}})
                    # count_q = 0
                    for q in lists2:
                        if q.find("h3"):
                            data_extracted[i.find("h2").text].update(
                                {q.find("h3").text: []})
                            for l in q.findAll(
                                    "h4", attrs={"class": "cit-title-group"}):
                                data_extracted[i.find("h2").text][q.find(
                                    "h3").text].append(
                                        l.text.strip().replace("\n", ""))
                                titles_jes.append(
                                    l.text.strip().replace("\n", ""))
                                titles_jes_html.append(l)
                else:
                    # print("in h2 jes")
                    if i.find("h2"):
                        data_extracted.update({i.find("h2").text: []})
                        for l in i.findAll(
                                "h4", attrs={"class": "cit-title-group"}):
                            data_extracted[i.find("h2").text].append(
                                l.text.strip().replace("\n", ""))
                            titles_jes.append(
                                l.text.strip().replace("\n", ""))
                            titles_jes_html.append(l)
                # data.update(data_extracted)
            # print(data)
            self.save_as_csv(titles_jes, pages[page_id], related)
            self.save_as_csv(titles_jes_html, pages[page_id], related, other=True)
            # print(titles_jes)
            # print("end of jes")
        elif page_id == "iopscience":
            pass
            # print("in iopscience")
            page_content = requests.get(pages[page_id]).content
            self.soup = BeautifulSoup(page_content, 'html.parser')
            lists_article_title = [
                i.text.strip() for i in self.soup.findAll(
                    "a", attrs={"class": "art-list-item-title"})
            ]
            lists_article_title_html = [
                i for i in self.soup.findAll(
                    "a", attrs={"class": "art-list-item-title"})
            ]
            # lists_authors = [i.text.strip() for i in self.soup.findAll("p",attrs={"class","small art-list-item-meta"})]
            # abstract_text = [i.find("p").text for i in self.soup.findAll("div",attrs={"class","article-text wd-jnl-art-abstract cf"})]
            # pdfs_link = [i.findAll("a")[2].attrs["href"] for i in self.soup.findAll("div",attrs={"class","art-list-item-tools small"}) if i.findAll("a",attrs={"class","mr-2 nowrap"})]
            # oa_or_not = []
            # for i in self.soup.findAll("div",attrs={"class","eyebrow"}):
            #     if i.findAll("a",attrs={"class","mr-2 nowrap"}):
            #         oa_or_not.append({"OA": True})
            #     else:
            #         oa_or_not.append({"OA": False})
            # print(oa_or_not)
            # print(lists_article_title)
            self.save_as_csv(lists_article_title, pages[page_id], related)
            self.save_as_csv(lists_article_title_html, pages[page_id], related, other=True)
            # print("end of iopscience")
        elif page_id == "scrip":
            # driver_scrip = webdriver.Chrome(executable_path="/Users/dhaneesh.gk/Projects/own/web_import/extract_it/drivers/chromedriver")
            driver_scrip = self.chrome_driver
            driver_scrip.get(pages[page_id])
            time.sleep(5)
            driver_scrip.refresh()
            WebDriverWait(driver_scrip, 20).until(
                EC.presence_of_all_elements_located((
                    By.XPATH,
                    "//ul[div[contains(@id,'JournalInfor_Repeater_Papers')]]/p/a[@name]")),
                "DOM content is not accessible right now")
            article_titles_scrip = [
                i.text.strip() for i in driver_scrip.find_elements_by_xpath(
                    "//ul[div[contains(@id,'JournalInfor_Repeater_Papers')]]/p/a[@name]")
            ]
            if not article_titles_scrip:
                article_titles_scrip.append(
                    "Titles are not accessible from the website right now")
            self.save_as_csv(article_titles_scrip, pages[page_id], related)
        elif page_id == "sciencedirect":
            pass
            page_content = requests.get(pages[page_id]).content
            self.soup = BeautifulSoup(page_content, 'html.parser')
            article_titels = []
            article_titles_html = []
            for i in self.soup.findAll(
                    "h3", attrs={"class": "text-m u-display-inline"}):
                for j in i.findAll("span"):
                    if j.attrs:
                        if article_titels.count(j.text) == 0:
                            article_titels.append(j.text)
                            article_titles_html.append(j)
            # print(article_titels)
            self.save_as_csv(article_titels, pages[page_id], related)
            self.save_as_csv(article_titles_html, pages[page_id], related, other=True)
        elif page_id == "jsac":
            pass
            page_content = requests.get(pages[page_id]).content
            self.soup = BeautifulSoup(page_content, 'html.parser')
            articles = []
            article_titels = []
            article_titles_html = []
            for i in self.soup.findAll("div", attrs={"class": "article"}):
                data_extracted = {}
                title = i.find("div", attrs={"class": "title"}).text
                title_html = i.find("div", attrs={"class": "title"})
                authors = i.find("div", attrs={"class": "author"}).text
                journal = i.find("div", attrs={"class": "journal"}).text
                links = {
                    j.text: "http://www.jsac.or.jp" + j.attrs["href"]
                    for j in i.findAll("a") if "href" in j.attrs
                }
                image = "http://www.jsac.or.jp" + i.find("img").attrs["src"]
                data_extracted.update({
                    "title": title,
                    "authors": authors,
                    "journal": journal,
                    "links": links,
                    "image": image
                })
                article_titels.append(title)
                article_titles_html.append(title_html)
                articles.append(data_extracted)
            # print(article_titels)
            self.save_as_csv(article_titels, pages[page_id], related)
            self.save_as_csv(article_titles_html, pages[page_id], related, other=True)
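# Hedged invocation sketch: article_title above is a method of a scraper class that also
# supplies self.chrome_driver and self.save_as_csv. The class name, URLs and 'related'
# value below are placeholders, not taken from the original code.
scraper = ArticleScraper()  # hypothetical owning class
pages = {
    "iopscience": "https://iopscience.iop.org/issue/",  # placeholder listing URL
    "jsac": "http://www.jsac.or.jp/",                   # placeholder listing URL
}
scraper.article_title(pages, related="analytical-chemistry")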