def scrape_professor(school_name,
                     faculty_url,
                     extracts_title,
                     extracts_name,
                     extracts_cv_url=None,
                     extracts_personal_url=None,
                     extracts_gscholar_url=None,
                     extracts_papers=None):
    """Scrape one faculty page into a Professor.

    :return: a Professor object or None if it's not a tenure track faculty
    """
    tree = get_tree(faculty_url)
    if tree is None:
        return None

    job_title = strip_whitespace(extracts_title(tree))
    if job_title is None:
        # assumes strip_whitespace passes None through — TODO confirm
        print("WARNING: job title not found on " + faculty_url)
        return None
    if not title_is_tenure_track(job_title):
        return None

    # run the optional extractors only when the caller supplied them,
    # in the same order as before (they may perform network fetches)
    name = extracts_name(tree)
    cv_link = extracts_cv_url(faculty_url, tree) if extracts_cv_url else None
    personal_url = (extracts_personal_url(faculty_url, tree)
                    if extracts_personal_url else None)
    google_scholar_url = (extracts_gscholar_url(faculty_url, tree)
                          if extracts_gscholar_url else None)

    prof = Professor(name=name,
                     title=job_title,
                     cv_url=cv_link,
                     school=school_name,
                     faculty_directory_url=faculty_url,
                     personal_url=personal_url,
                     google_scholar_url=google_scholar_url)

    if extracts_papers is not None:
        paper_list_url, papers = extracts_papers(faculty_url, tree)
        # save paper list to disk
        if paper_list_url and papers and len(papers) > 0:
            prof.paper_list_url = paper_list_url
            save_paper_list('paper_list', prof, papers)
    return prof
def get_papers(faculty_url, tree):
    """Find this professor in the library's faculty-bibliography directory.

    :return: (bibliography_url, [citation strings]) for the "Published"
        section of the matched bibliography page, or (None, None) when no
        sufficiently close name match (or no bibliography) is found.
    """
    name = get_name(tree)

    # download faculty directory, if not already downloaded
    global library_directory_tree
    if not library_directory_tree:
        library_directory_tree = get_tree(library_directory_url)

    # iterate through faculty names, looking for the best match
    anchors = css_select(library_directory_tree, 'table.table-striped a')
    if not anchors:
        # min() over an empty sequence would raise ValueError
        return None, None
    closest_match = min(
        anchors,
        key=lambda a: editdistance.eval(name, strip_whitespace(a.text)))

    # require that closest match be pretty close to accept it.
    # BUGFIX: compare against the whitespace-stripped text, the same form
    # used in the min() key above — the raw .text can carry whitespace that
    # inflates the distance and wrongly rejects the best match.
    if editdistance.eval(name, strip_whitespace(closest_match.text)) > 3:
        # 3 characters would allow for a missing initial and period
        return None, None

    # download bibliography page
    bib_url = closest_match.get('href')
    bib_tree = get_tree(bib_url)

    # find the "Published Works" section
    for heading in css_select(bib_tree, 'div.rich-text h2'):
        if 'Published' in heading.text:
            # keep collecting the publication divs until we get to the next <H2>
            papers = []
            node = heading.next_element  # renamed: `next` shadows the builtin
            while node:
                if node.name == 'p':
                    citation = strip_whitespace(node.text)
                    # drop trailing link
                    citation = citation.split('http://')[0]
                    if len(citation) > 0:
                        papers.append(citation)
                elif node.name == 'h2':
                    break
                node = node.next_element
            return bib_url, papers
    return None, None
def get_faculty_urls(directory_url, main_directory_tree):
    """Collect faculty profile URLs from each department listing page.

    The main directory is ignored because it's paginated with ajax.
    """
    base = 'https://www.gsb.stanford.edu'
    departments = ('accounting', 'economics', 'finance', 'marketing',
                   'operations-information-technology', 'political-economy')
    faculty_urls = []
    for dept in departments:
        dept_tree = get_tree(
            base + '/faculty-research/faculty/academic-areas/' + dept)
        # the first section is for Faculty, the second is for lecturers
        faculty_div = css_select(
            dept_tree, 'div.pane-faculty-filters-faculty-by-criteria')[0]
        faculty_urls.extend(
            base + a.get('href')
            for a in css_select(
                faculty_div,
                'div.view-id-faculty_filters div.views-field-title a'))
    return faculty_urls
def get_papers(url, tree):
    """Follow the "More Publications" link and build citation strings.

    :return: (publications_page_url, [citation strings]) for teasers typed
        as articles, or (None, None) when the link is absent.
    """
    # find the link to "More publications
    more_pubs_url = HrefSelector('a.right-arrow', 'More Publications')(url, tree)
    if more_pubs_url is None:
        return None, None

    p_tree = get_tree(more_pubs_url)
    papers = []
    for article in css_select(p_tree, 'article.publication--teaser'):
        # keep only teasers whose type mentions "Article"
        if 'Article' not in Selector('div.publication--teaser-type')(article):
            continue
        p_title = Selector('h2 a')(article)
        p_year = Selector('div.publication--teaser-year')(article)
        p_authors = Selector('div.publication--teaser-authors')(article)
        p_journal = Selector('div.publication--teaser-journal')(article)
        papers.append('%s. "%s." %s (%s).' %
                      (p_authors, p_title, p_journal, p_year))
    return more_pubs_url, papers
def get_papers(url, tree):
    """Fetch the publications tab for this faculty page.

    :return: (publications_tab_url, [cleaned citation strings]) from the
        first <OL> after an "Articles" heading, or (None, None).
    """
    # add "&facInfo=pub" to the faculty url to get the url for the publications tab
    pub_list_url = url + "&facInfo=pub"
    wait()
    pub_tree = get_tree(pub_list_url)

    # find the bulleted list for publications
    for heading in css_select(pub_tree, '.tab-content h3'):
        if 'Articles' not in heading.text:
            continue
        # look for the first <OL> under the publications header
        node = heading  # renamed: `next` shadows the builtin
        while node is not None:
            if node.name == 'ol':
                citations = []
                for li in css_select(node, 'div.citation'):
                    text = li.text.replace('View Details', '')
                    text = text.replace('Citation:', '')
                    citations.append(strip_whitespace(text))
                return pub_list_url, citations
            node = node.next_element
    return None, None
def scrape_professors(school_name: str,
                      directory_url: str,
                      extracts_faculty_urls: Callable[[str, Tag], List[str]],
                      extracts_title: Callable[[Tag], str],
                      extracts_name: Callable[[Tag], str],
                      extracts_cv_url: Optional[Callable[[str, Tag], str]] = None,
                      extracts_personal_url: Optional[Callable[[str, Tag], str]] = None,
                      extracts_gscholar_url: Optional[Callable[[str, Tag], str]] = None,
                      extracts_papers: Optional[Callable[[str, Tag], List[str]]] = None
                      ) -> List[Professor]:
    """Scrape every faculty page listed by the school's directory.

    As a side-effect, this function also writes the lists of publications
    to disk."""
    professors = []
    directory_tree = get_tree(directory_url)  # the faculty index page
    for faculty_url in extracts_faculty_urls(directory_url, directory_tree):
        sleep(2)  # be polite to the server between page fetches
        print("scraping " + faculty_url)
        prof = scrape_professor(school_name, faculty_url, extracts_title,
                                extracts_name, extracts_cv_url,
                                extracts_personal_url, extracts_gscholar_url,
                                extracts_papers)
        if prof is not None:
            print(prof)
            professors.append(prof)
    return professors
def get_photo_url(prof):
    """Return the absolute URL of the professor's profile photo.

    :param prof: a Professor whose faculty_directory_url page is fetched
    :return: photo URL string, or None if no img.profileImg is on the page
    """
    img_tags = css_select(get_tree(prof.faculty_directory_url),
                          "img.profileImg")
    if not img_tags:  # idiomatic emptiness test instead of len() == 0
        return None
    # src appears to be site-relative, so prefix the Kellogg host —
    # NOTE(review): confirm the directory never serves an absolute src
    return "https://www.kellogg.northwestern.edu" + img_tags[0].get("src")