from selenium.common.exceptions import NoSuchElementException


def extract_citation_for_publication(link):
    """
    This function crawls the list of citing articles from a given link.
    If there is a next page, it keeps following it until there is none.
    @param[in] link the 'cited by' link of the publication you want to crawl
    @return the citations as a dictionary, one entry per citing article
    """
    browser = Browser()
    citation = {}
    # Go to the citation view. As the page is rendered with JavaScript, we
    # cannot get its content via urllib2; instead we use Selenium to drive a
    # real browser to render the page.
    # req = urllib2.Request(publication[k]['link'], headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'})
    # p = urllib2.urlopen(req)
    # sub_soup = BeautifulSoup(p.readlines()[0], 'html.parser')
    # s = sub_soup.find(id='gs_ccl')
    browser.get(link)
    while True:
        citation_root = browser.find_element_by_id('gs_ccl')
        citation_list = citation_root.find_elements_by_class_name('gs_r')
        for citation_item in citation_list:
            # title
            title = citation_item.find_element_by_class_name('gs_rt').text
            # try to get the download link, if there is one (stored under a
            # new name so the 'link' parameter is not clobbered)
            try:
                block = citation_item.find_element_by_id('gs_ggsW2')
                download_link = block.find_element_by_link_text(block.text).get_attribute('href')
            except NoSuchElementException:
                download_link = None
            # authors
            author_line = citation_item.find_element_by_class_name('gs_a')
            author_name = author_line.text.split(', ')
            author = {}
            # for each author, find the profile link if one exists
            for a in author_name:
                try:
                    print('.', end=' ')
                    # there is a Google Scholar profile for this author
                    item = author_line.find_element_by_link_text(a)
                    author[a] = item.get_attribute('href')
                except NoSuchElementException:
                    # there is no such profile
                    author[a] = None
            # we could also press the "cite" button to get the detailed
            # citation information; skipped here
            citation[title] = {'link': download_link, 'author': author}
        # go to the next page, if there is one
        if not next_page(browser):
            break
    browser.close()
    return citation
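# Both crawlers in this module call a next_page() helper that is not shown
# here. The sketch below is a minimal stand-in, assuming the pagination
# control is a link with the text 'Next'; that selector is an assumption,
# not taken from the original code, so adjust it to the real markup.
def next_page(browser):
    """Click the 'next page' control if present; return False when there is none."""
    try:
        button = browser.find_element_by_link_text('Next')  # assumed selector
        button.click()
        return True
    except NoSuchElementException:
        return False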
def extract_publication(profile_url, verbose=verbose_citation_list):
    """
    This function crawls the publication list from a Google Scholar profile.
    @param[in] profile_url the link of the Google Scholar profile you want to crawl
    @param[in] verbose the level of detail you want to crawl; by default we
               also scrape the detailed citation list for each publication
    @return the publications as a dictionary, one entry per publication
    """
    # scholar's article list
    browser = Browser()
    browser.get(profile_url)
    publication = {}
    while True:
        publication_list = browser.find_elements_by_class_name('gsc_a_tr')
        for publication_item in publication_list:
            title = publication_item.find_element_by_class_name('gsc_a_at').text
            print(title, end=' ')
            author = publication_item.find_elements_by_class_name('gs_gray')[0].text.split(', ')
            vendor = publication_item.find_elements_by_class_name('gs_gray')[1].text
            try:
                citation = int(publication_item.find_element_by_class_name('gsc_a_ac').text)
                link = publication_item.find_element_by_class_name('gsc_a_ac').get_attribute('href')
            except (NoSuchElementException, ValueError):
                # uncited publications have no citation count or link
                citation = 0
                link = None
            try:
                year = int(publication_item.find_element_by_class_name('gsc_a_h').text)
            except (NoSuchElementException, ValueError):
                year = None
            if citation > 0 and verbose >= verbose_citation_list:
                print('and its citation list', end=' ')
                cited_by = extract_citation_for_publication(link)
            else:
                cited_by = None
            print('finished')
            publication[title] = {'link': link, 'author': author, 'vendor': vendor,
                                  'citation': citation, 'cited by': cited_by, 'year': year}
        if not next_page(browser):
            break
    browser.close()
    return publication
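# Illustrative usage: the profile URL below is a placeholder, and dumping to
# JSON is just one way to persist the crawled result.
if __name__ == '__main__':
    import json
    profile = 'https://scholar.google.com/citations?user=XXXXXXXXXXXX'  # placeholder id
    pubs = extract_publication(profile)
    with open('publications.json', 'w') as f:
        json.dump(pubs, f, indent=2)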
class BaseTestCase(unittest.TestCase):
    """
    Base test case which is inherited by all tests in order to provide the
    proper webdriver workflow to set up and tear down test case groups.
    """
    # some configuration defaults if the environment is started from PyCharm/terminal
    BASE_LINK = "https://"
    try:
        BASE_LINK = env_config.get('url')
    except SystemExit:
        pass
    try:
        browser_env = os.environ["BROWSER_ENV"]
    except KeyError:
        # BROWSER_ENV is not set when the suite is started from PyCharm
        # instead of the terminal, so Chrome is used as the default here
        browser_env = "chrome"

    def get_base_link(self):
        try:
            return env_config.get('url')
        except SystemExit:
            return self.BASE_LINK

    def setUp(self):
        if self.browser_env == 'chrome':
            # this is the setup for working remotely with Linux;
            # in house just call self.driver = Chrome()
            # Use these commands if you don't want Chrome in headless mode
            options = webdriver.ChromeOptions()
            options.add_argument('--user-agent=piinctest')
            self.driver = webdriver.Chrome(
                executable_path='/usr/local/bin/chromedriver', options=options)
        # Use these commands for Chrome headless
        elif self.browser_env == 'headless':
            options = webdriver.ChromeOptions()
            options.add_argument('headless')
            options.add_argument("--window-size=1920x1080")
            self.driver = webdriver.Chrome(
                executable_path='/usr/local/bin/chromedriver', options=options)
        elif self.browser_env == 'firefox':
            profile = webdriver.FirefoxProfile()
            profile.set_preference("general.useragent.override", "piinctest")
            self.driver = Firefox(profile)
        # Use these commands for Firefox headless
        elif self.browser_env == 'firefoxHeadless':
            options = webdriver.FirefoxOptions()
            options.add_argument('-headless')
            options.add_argument("--window-size=1920x1080")
            self.driver = webdriver.Firefox(
                executable_path='/usr/local/bin/geckodriver', options=options)
        elif self.browser_env == "iexplorer":
            caps = DesiredCapabilities.INTERNETEXPLORER.copy()
            caps["ensureCleanSession"] = True
            # this is the suggested default path; if your driver lives
            # somewhere else, change it accordingly
            self.driver = Ie(
                executable_path="C:/webdrivers/iedriverserver.exe",
                capabilities=caps)
        self.driver.delete_all_cookies()
        try:
            self.driver.maximize_window()
        except (AttributeError, WebDriverException):
            self.driver.set_window_size(1920, 1200)
        self.driver.get(self.BASE_LINK)

    def tearDown(self):
        global result
        if hasattr(self, '_outcome'):  # Python 3.4+
            result = self.defaultTestResult()
            self._feedErrorsToResult(result, self._outcome.errors)
        if len(result.errors) > 0 or len(result.failures) > 0:
            # on failure, save a screenshot of the page the test died on
            fail_url = self.driver.current_url
            print(fail_url)
            now = datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')
            fn = os.path.join(os.path.dirname(__file__), '..', '..',
                              'screenshots/Screenshot_%s.png' % now)
            self.driver.get_screenshot_as_file(fn)
            print("Screenshot added at path: " + fn)
        self.driver.close()
        self.driver.quit()
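# A minimal illustration of how a concrete test inherits the workflow above:
# setUp() opens BASE_LINK in the configured browser, and tearDown() saves a
# screenshot on failure. The assertion is only an example check.
class SmokeTest(BaseTestCase):

    def test_base_page_loads(self):
        # self.driver was created and pointed at BASE_LINK by setUp()
        self.assertTrue(self.driver.current_url.startswith('http'))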
# NOTE: this fragment comes from the body of spider(hotel); the enclosing
# loop headers were lost, so the two below (over room types i and room
# entries j) are a plausible reconstruction.
for i in range(len(room_type)):
    info = []
    for j in range(len(room_type_code[i])):
        # 编号 = "room code", e.g. "编号:A123"
        code = re.search(r'编号:\w+', room_type_code[i][j]).group()
        try:
            # 标准价 = "standard rate"
            types = re.search(r'标准价', room_type_code[i][j]).group()
        except AttributeError:
            # no standard rate found: 钟点房 = "hourly room (08:00~22:00)"
            types = '钟点房(08:00~22:00)'
        # ¥ (\u00A5) followed by digits is the price
        price = re.search(r'¥\d+', room_type_code[i][j]).group()
        info.append((code, types, price))
    room_info[room_type[i]] = info

# write a section header for this hotel, then one line per room entry
hotel_info_text.write('-' * 32 + hotel[1] + '-' * 32 + '\n')
for i in room_info:
    hotel_info_text.write(i + ':' + '\n')
    for j in room_info[i]:
        code, types, price = j
        hotel_info_text.write(' ' * 4 + code + ' ' + types + ' ' + price + '\n')

# close the hotel detail tab, switch back to the search window, and clear
# the keyword box for the next query
ie.close()
ie.switch_to.window(windows)
ie.find_element_by_id("txtKeyword").send_keys(Keys.CONTROL + 'a')
ie.find_element_by_id("txtKeyword").send_keys(Keys.DELETE)


for i in hotels:
    spider(i)
    time.sleep(5)
hotel_info_text.close()
ie.close()
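# Quick sanity check of the parsing regexes above against a synthetic row.
# The sample string is made up for illustration; real rows come from the
# hotel page.
import re

sample = '编号:A1024 标准价 ¥268'  # "code:A1024  standard rate  ¥268"
assert re.search(r'编号:\w+', sample).group() == '编号:A1024'
assert re.search(r'标准价', sample).group() == '标准价'
assert re.search(r'¥\d+', sample).group() == '¥268'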