def scrape_profile(self, author_url) -> List[Paper]:
    """Scrape every paper row from a Google Scholar author profile page."""
    self.get_page(author_url)
    # Keep pressing "show more" until it is disabled or disappears.
    while True:
        try:
            more_button = self.selenium_driver.find_element_by_css_selector(
                'button#gsc_bpf_more:enabled')
            if not more_button:
                # no enabled button left, so every row is loaded
                break
            more_button.click()
            self.wait_for_captchas()
            wait()
        except (NoSuchElementException, ElementNotVisibleException,
                InvalidElementStateException):
            break
    # hand the fully expanded page to Beautiful Soup for easier parsing
    tree = tree_from_string(self.selenium_driver.page_source)
    papers = []
    for row in css_select(tree, 'tr.gsc_a_tr'):
        # a struck-out citation count marks the row as a duplicate; skip it
        if Selector('td.gsc_a_c a.gsc_a_acm')(row):
            continue
        gray_lines = css_select(row, 'div.gs_gray')
        papers.append(
            Paper(
                Selector('td.gsc_a_t a')(row),             # title
                gray_lines[0].text,                        # authors
                gray_lines[1].text,                        # venue
                Selector('td.gsc_a_y')(row),               # year
                Selector('td.gsc_a_c a.gsc_a_ac')(row)))   # citation count
    return papers
def get_title(tree):
    """Return the first string on the page that passes the job-title check, or None."""
    # The title's exact location varies, so gather every plausible element
    # and test each one in document order until something matches.
    elements = (css_select(tree, 'ul#ctl00_content_titles li i') +
                css_select(tree, 'ul#ctl00_content_areas li'))
    for element in elements:
        text = strip_whitespace(element.text)
        if text is not None and is_job_title(text):
            return text
    return None
def get_papers(url, tree):
    """Return (url, list of publication strings) or (None, None) if not found."""
    for heading in css_select(tree, '#center-col strong'):
        if 'Publications' not in heading.text:
            continue
        # walk forward through the document for the first <ul> after the heading
        node = heading
        while node is not None:
            if node.name == 'ul':
                citations = [
                    strip_whitespace(item.text.replace(' PDF.', ''))
                    for item in css_select(node, 'li')
                ]
                return url, citations
            node = node.next_element
    return None, None
def get_papers(faculty_url, tree):
    # Scrape article citations from the research entries of a faculty page.
    # Returns (faculty_url + '#research', [citations]) from the FIRST entries
    # div encountered, or (None, None) if there are no entries divs at all.
    for e in css_select(tree, 'div.leftResearch div.entries'):
        # check that we're in the right section
        # NOTE(review): BeautifulSoup returns multi-valued attributes such as
        # 'class' as a list, so comparing it with == 'tabSubheading' may never
        # be True -- confirm against the live markup. previous_sibling can also
        # be a NavigableString, which does not support ['class'] indexing.
        if e.previous_sibling[
                'class'] == 'tabSubheading' and e.previous_sibling.text != 'Articles':
            break
        paper_list = []
        for c in css_select(e, 'div.entry div.copy'):
            citation = c.text
            # some articles have an abstract, which I want to ignore
            # (remove the abstract's text from the citation text)
            for abstract in css_select(c, 'div'):
                citation = citation.replace(abstract.text, '')
            paper_list.append(strip_whitespace(citation))
        # returns after processing the first acceptable entries div
        return faculty_url + '#research', paper_list
    return None, None
def get_papers(url, tree):
    """Return (url, journal-article citations) or (None, None) if not found."""
    for heading in css_select(tree, 'div.view-gsb-publications-listing h2.title'):
        if 'Journal Articles' not in heading.text:
            continue
        # walk forward for the first <div class="view-content"> after the heading
        node = heading
        while node is not None:
            if node.name == 'div' and 'view-content' in node.get('class'):
                rows = css_select(node, 'div.views-row')
                return url, [strip_whitespace(row.get_text()) for row in rows]
            node = node.next_element
    return None, None
def get_faculty_urls(directory_url, main_directory_tree):
    """Collect faculty profile URLs from each department page.

    The main directory is ignored because it is paginated with ajax; the
    per-department listings contain the same links without pagination.
    """
    departments = (
        'accounting', 'economics', 'finance', 'marketing',
        'operations-information-technology', 'political-economy',
    )
    base = 'https://www.gsb.stanford.edu'
    faculty_urls = []
    for dept in departments:
        dept_tree = get_tree(
            'https://www.gsb.stanford.edu/faculty-research/faculty/academic-areas/'
            + dept)
        # the first section is for Faculty, the second is for lecturers
        faculty_div = css_select(
            dept_tree, 'div.pane-faculty-filters-faculty-by-criteria')[0]
        anchors = css_select(
            faculty_div, 'div.view-id-faculty_filters div.views-field-title a')
        faculty_urls.extend(base + a.get('href') for a in anchors)
    return faculty_urls
def get_faculty_urls(directory_url, tree):
    """Return Wharton faculty profile URLs from the "faculty by name" section.

    Non-Wharton URLs are dropped with a printed warning.
    """
    urls = []
    # find the "FACULTY BY NAME" heading div; the directory links live in the
    # div that immediately follows it
    directory_div = False
    for div in css_select(tree, 'div.vc_row-fluid'):
        if directory_div:
            # the previous iteration found the heading, so this div is the one
            directory_div = div
            break
        for h4 in css_select(div, 'h4'):
            if 'FACULTY BY NAME' in h4.text:
                directory_div = True
    for a in css_select(directory_div, 'div.wpb_wrapper a'):
        url = a.get('href')
        if url is not None:
            if "wharton.upenn.edu" in url:
                urls.append(url.strip())
            else:
                # BUG FIX: this was a bare string expression whose value was
                # discarded, so the warning was never actually emitted
                print("WARNING: dropping non-Wharton faculty: " + url)
    return urls
def get_papers(url, tree):
    """Return (publications-tab url, article citations) or (None, None)."""
    # adding "&facInfo=pub" to the faculty url yields the publications tab
    pub_list_url = url + "&facInfo=pub"
    wait()
    pub_tree = get_tree(pub_list_url)
    for heading in css_select(pub_tree, '.tab-content h3'):
        if 'Articles' not in heading.text:
            continue
        # walk forward through the document for the first <ol> after the heading
        node = heading
        while node is not None:
            if node.name == 'ol':
                cleaned = [
                    strip_whitespace(
                        item.text.replace('View Details', '').replace('Citation:', ''))
                    for item in css_select(node, 'div.citation')
                ]
                return pub_list_url, cleaned
            node = node.next_element
    return None, None
def get_kellogg_faculty_urls(directory_url, tree):
    """Parse a drop-down selection."""
    # each non-empty <option> value is a netid that keys a search-results page
    options = css_select(
        tree, 'select#plcprimarymaincontent_1_selBrowseByName option')
    return [
        'http://www.kellogg.northwestern.edu/Faculty/Faculty_Search_Results.aspx?netid='
        + option.get('value')
        for option in options
        if option.get('value') != ''
    ]
def get_papers(url, tree):
    """Return (url + '#research', journal-article paragraphs found under the
    'Journal articles' heading, stopping at the 'Awards and Honors' div)."""
    papers = []
    for heading in css_select(tree, 'h3'):
        if 'Journal articles' not in heading.text:
            continue
        # collect every <p> after the heading until the Awards and Honors div
        node = heading
        while node is not None:
            if node.name == 'p':
                papers.append(strip_whitespace(node.text))
            elif node.name == 'div' and 'Awards and Honors' in node.text:
                break
            node = node.next_element
    return url + '#research', papers
def get_papers(faculty_url, tree): name = get_name(tree) # download faculty directory, if not already downloaded global library_directory_tree if not library_directory_tree: library_directory_tree = get_tree(library_directory_url) # iterate through faculty names, looking for the best match anchors = css_select(library_directory_tree, 'table.table-striped a') closest_match = min( anchors, key=lambda x: editdistance.eval(name, strip_whitespace(x.text))) # require that closest match be pretty close to accept it if editdistance.eval( name, closest_match.text ) > 3: # 3 characters would allow for a missing initial and period return None, None # download bibliography page bib_url = closest_match.get('href') bib_tree = get_tree(bib_url) # find the "Published Works" section for heading in css_select(bib_tree, 'div.rich-text h2'): if 'Published' in heading.text: # keep collecting the publication divs until we get to the next <H2> papers = [] next = heading.next_element while next: if next.name == 'p': citation = strip_whitespace(next.text) # drop trailing link citation = citation.split('http://')[0] if len(citation) > 0: papers.append(citation) elif next.name == 'h2': break next = next.next_element return bib_url, papers return None, None
def get_papers(url, tree):
    """Follow the "More Publications" link and return (that url, formatted
    article citations), or (None, None) when the link is absent."""
    more_pubs_url = HrefSelector('a.right-arrow', 'More Publications')(url, tree)
    if more_pubs_url is None:
        return None, None
    papers = []
    p_tree = get_tree(more_pubs_url)
    for article in css_select(p_tree, 'article.publication--teaser'):
        # only keep teasers typed as articles
        if 'Article' not in Selector('div.publication--teaser-type')(article):
            continue
        authors = Selector('div.publication--teaser-authors')(article)
        title = Selector('h2 a')(article)
        journal = Selector('div.publication--teaser-journal')(article)
        year = Selector('div.publication--teaser-year')(article)
        papers.append('%s. "%s." %s (%s).' % (authors, title, journal, year))
    return more_pubs_url, papers
def get_papers(url, tree):
    """Fetch published-paper citations from the Wharton publications API.

    An ajax call returns JSON with publication info,
    e.g. see https://fnce.wharton.upenn.edu/profile/abel/#research

    Returns (canonical_url + '#research', [plain-text citations]), or
    (None, None) when the API response carries no 'data' key.
    """
    url = css_select(tree, 'link[rel="canonical"]')[0].get('href')
    if url.endswith('/'):
        # extract from https://statistics.wharton.upenn.edu/profile/bhaswar/
        user_id = url.split('/')[-2]
    else:
        # extract from https://www.wharton.upenn.edu/faculty/binsbergen.cfm
        user_id = url.replace('.cfm', '').split('/')[-1]
    # FIX: renamed local 'json' -> 'payload' so it no longer shadows the
    # stdlib json module name
    payload = get_json(
        'https://faculty.wharton.upenn.edu/wp-json/wfp/v2/publication/?author=%s&per_page=500&page=1'
        % user_id)
    if 'data' not in payload:
        return None, None
    citations = []
    for paper in payload['data']:
        if paper['type'] == 'wfp_pubpubpaper':  # published papers only
            # The 'citation' attribute contains an html-formatted citation.
            # We just convert it to plain text.
            citations.append(tree_from_string(paper['citation']).get_text())
    return url + '#research', citations
def get_title(tag):
    """Return the first non-empty text node among the first 'td p' element's
    children, or None (implicitly) if there is none."""
    for child in css_select(tag, 'td p')[0].children:
        if not isinstance(child, NavigableString):
            continue
        text = strip_whitespace(child)
        if len(text) > 0:
            return text
def get_photo_url(prof):
    """Return the absolute URL of the professor's profile photo, or None."""
    matches = css_select(get_tree(prof.faculty_directory_url), "img.profileImg")
    if matches:
        return "https://www.kellogg.northwestern.edu" + matches[0].get("src")
    return None
def get_name(tree):
    """Join the name fragments rendered as separate <span> elements."""
    spans = css_select(tree, 'div.group-wrapper-name span')
    return ' '.join(span.text for span in spans)
def scrape_search_results(self, prof: Professor) -> List[Paper]: """In this case, we are saving all articles, even if we are not sure that they match the author. We only search in the past ten years (2007 and later) and only include the first 100 pages of results, and only papers that have at least one citation in Google Scholar (to save us some time).""" # parse each page of results, up to at most 1000 articles (100 pages) papers = [] # for each page of results for start in range(0, 1000, 10): result_row_info = [] # get search results page self.get_page( 'https://scholar.google.com/scholar?start=%d&as_ylo=%s&q=author%%3A"%s"+%s' % (start, STARTING_YEAR, urllib.parse.quote( prof.simple_name()), prof.school)) # We get the GS and WoS citation counts from the search results page # We get the full citation information by virtually clicking the "cite" link for each article tree = tree_from_string(self.selenium_driver.page_source) for row in css_select(tree, 'div.gs_r div.gs_ri'): scholar_citations = None wos_citations = None citation_id = None for link in css_select(row, 'div.gs_fl a'): if 'Cited by' in link.text: scholar_citations = link.text.split(' ')[-1] elif 'Web of Science:' in link.text: wos_citations = link.text.split(': ')[-1] elif 'Related articles' in link.text: citation_id = link.get('href').split(":")[1] # ignore papers with no citations if not scholar_citations: break result_row_info.append({ 'scholar_citations': scholar_citations, 'wos_citations': wos_citations, 'citation_id': citation_id }) # stop when we've gone past the end of results if len(result_row_info) == 0: break # fetch each citation and pick out the Chicago format because it has full firstnames # and includes all the author names (or at least more of them before using "et al." 
# eg., https://scholar.google.com/scholar?q=info:J2Uvx00ui50J:scholar.google.com/&output=cite&scirp=1&hl=en for r in result_row_info: self.get_page( 'https://scholar.google.com/scholar?q=info:%s:scholar.google.com/' '&output=cite&scirp=1&hl=en' % r['citation_id']) # the third row in the table contains the Chicago-style citation citation = self.selenium_driver.find_elements_by_css_selector( 'td')[2].text year = get_year(citation) if not year: continue # look for the first period that is not part of a middle initial match = re.search(r"\w{2}\. ", citation) if not match: # otherwise, just take the first period as in: # Al-Najjar, Nabil I. "A bayesian framework for precautionary policies." (2013). match = re.search(r"\. ", citation) authors = citation[:match.end()] # venue is in italics try: venue = self.selenium_driver.find_elements_by_css_selector('td')[2]\ .find_element_by_css_selector('i').text except NoSuchElementException: # this is probably a working paper continue match = re.findall( r"\"(.*)\"", citation) # article titles are inside quotes if len(match) == 0: # this is a book, which we don't record continue title = match[0] papers.append( Paper(title=title, authors=authors, venue=venue, year=year, scholar_citations=r['scholar_citations'], wos_citations=r['wos_citations'], id=r['citation_id'])) return papers
def get_papers(url, tree):
    """Return (url, publications scraped from the first tab's bulleted list)."""
    items = css_select(tree, 'div#tabs-1 ul li')
    return url, [strip_whitespace(item.text) for item in items]