def find_google_scholar_page(self, prof: Professor):
    """Return the URL of the professor's Google Scholar profile, if listed.

    Loads a Google Scholar search for ``author:"<simple name>" <school>``
    through ``self.get_page`` and looks on the resulting page for the first
    anchor matching ``h4.gs_rt2 a`` (the user-profile link).

    Args:
        prof: Professor to search for. ``prof.simple_name()`` and
            ``prof.school`` are used to build the query.

    Returns:
        The ``href`` attribute of the first matching anchor, or ``None``
        when the results page contains no such element.
    """
    # Percent-encode both query parts. Without this, a school name that
    # contains characters such as "&", "#" or "+" would terminate or alter
    # the query string. ``str()`` keeps the old "%s" formatting behaviour for
    # non-string values.
    search_url = 'https://scholar.google.com/scholar?q=author%%3A"%s"+%s' % (
        urllib.parse.quote(prof.simple_name()),
        urllib.parse.quote(str(prof.school)))

    # get search results page
    self.get_page(search_url)

    # look for a matching user profile; only the lookup itself is expected
    # to raise NoSuchElementException, so keep the try body minimal
    try:
        anchor = self.selenium_driver.find_element_by_css_selector(
            'h4.gs_rt2 a')
    except NoSuchElementException:
        return None
    return anchor.get_attribute('href')
def scrape_search_results(self, prof: Professor) -> List[Paper]:
    """Collect papers for a professor from Google Scholar search results.

    All matching articles are saved, even if we are not sure that they
    belong to the author. The search is limited to publications from
    ``STARTING_YEAR`` onward (originally described as the past ten years,
    2007 and later), to the first 100 pages of results, and to papers that
    have at least one citation in Google Scholar (to save some time).

    For every result page, the Google Scholar and Web of Science citation
    counts are read from the result rows. The full citation is then read
    from each article's "cite" page, using the Chicago format because it has
    full first names and lists more authors before resorting to "et al.".

    Args:
        prof: Professor to search for. ``prof.simple_name()`` and
            ``prof.school`` are used to build the query.

    Returns:
        A list of ``Paper`` objects. Entries are skipped when no year can be
        extracted, when the citation has no italic venue (probably a working
        paper), or when it has no quoted title (a book).
    """
    papers = []

    # Percent-encode both query parts once; an unescaped "&" or "#" in the
    # school name would otherwise corrupt the query string.
    quoted_name = urllib.parse.quote(prof.simple_name())
    quoted_school = urllib.parse.quote(str(prof.school))

    # parse each page of results, up to at most 1000 articles (100 pages)
    for start in range(0, 1000, 10):
        # get search results page
        self.get_page(
            'https://scholar.google.com/scholar?start=%d&as_ylo=%s&q=author%%3A"%s"+%s'
            % (start, STARTING_YEAR, quoted_name, quoted_school))

        # We get the GS and WoS citation counts from the search results page
        tree = tree_from_string(self.selenium_driver.page_source)
        result_row_info = []
        for row in css_select(tree, 'div.gs_r div.gs_ri'):
            scholar_citations = None
            wos_citations = None
            citation_id = None
            for link in css_select(row, 'div.gs_fl a'):
                # a link without direct text (e.g. icon-only) yields no match
                link_text = link.text or ''
                if 'Cited by' in link_text:
                    scholar_citations = link_text.split(' ')[-1]
                elif 'Web of Science:' in link_text:
                    wos_citations = link_text.split(': ')[-1]
                elif 'Related articles' in link_text:
                    # the id is the second ":"-separated piece of the href
                    href_parts = (link.get('href') or '').split(":")
                    if len(href_parts) > 1:
                        citation_id = href_parts[1]

            # ignore papers with no citations: skip only this row rather
            # than abandoning the remaining rows on the page
            if not scholar_citations:
                continue
            # without an id the "cite" page below cannot be requested
            if not citation_id:
                continue
            result_row_info.append({
                'scholar_citations': scholar_citations,
                'wos_citations': wos_citations,
                'citation_id': citation_id
            })

        # stop when we've gone past the end of results (or reached a page
        # that yields no usable rows)
        if not result_row_info:
            break

        # fetch each citation and pick out the Chicago format because it has
        # full firstnames and includes all the author names (or at least
        # more of them before using "et al.")
        # eg., https://scholar.google.com/scholar?q=info:J2Uvx00ui50J:scholar.google.com/&output=cite&scirp=1&hl=en
        for r in result_row_info:
            self.get_page(
                'https://scholar.google.com/scholar?q=info:%s:scholar.google.com/'
                '&output=cite&scirp=1&hl=en' % r['citation_id'])

            # the third cell in the table contains the Chicago-style citation
            cells = self.selenium_driver.find_elements_by_css_selector('td')
            if len(cells) < 3:
                # citation table missing or incomplete: nothing to parse
                continue
            chicago_cell = cells[2]
            citation = chicago_cell.text

            year = get_year(citation)
            if not year:
                continue

            # look for the first period that is not part of a middle initial;
            # otherwise, just take the first period as in:
            # Al-Najjar, Nabil I. "A bayesian framework for precautionary policies." (2013).
            match = (re.search(r"\w{2}\. ", citation)
                     or re.search(r"\. ", citation))
            if not match:
                # no author/title separator at all; cannot split the citation
                continue
            authors = citation[:match.end()]

            # venue is in italics
            try:
                venue = chicago_cell.find_element_by_css_selector('i').text
            except NoSuchElementException:
                # this is probably a working paper
                continue

            # article titles are inside quotes
            titles = re.findall(r"\"(.*)\"", citation)
            if not titles:
                # this is a book, which we don't record
                continue

            papers.append(
                Paper(title=titles[0],
                      authors=authors,
                      venue=venue,
                      year=year,
                      scholar_citations=r['scholar_citations'],
                      wos_citations=r['wos_citations'],
                      id=r['citation_id']))
    return papers