def scrape_professor(school_name,
                     faculty_url,
                     extracts_title,
                     extracts_name,
                     extracts_cv_url=None,
                     extracts_personal_url=None,
                     extracts_gscholar_url=None,
                     extracts_papers=None):
    """:return: a Professor object, or None if the page is not for tenure-track faculty"""
    tree = get_tree(faculty_url)
    if tree is None:
        return None
    job_title = strip_whitespace(extracts_title(tree))
    if job_title is None:
        print("WARNING: job title not found on "+faculty_url)
        return None
    if not title_is_tenure_track(job_title):
        return None
    name = extracts_name(tree)
    cv_link = None if extracts_cv_url is None else extracts_cv_url(faculty_url, tree)
    personal_url = None if extracts_personal_url is None else extracts_personal_url(faculty_url, tree)
    google_scholar_url = None if extracts_gscholar_url is None else extracts_gscholar_url(faculty_url, tree)
    prof = Professor(name=name, title=job_title, cv_url=cv_link, school=school_name,
                     faculty_directory_url=faculty_url, personal_url=personal_url, google_scholar_url=google_scholar_url)
    if extracts_papers is not None:
        paper_list_url, papers = extracts_papers(faculty_url, tree)
        # save paper list to disk
        if paper_list_url and papers:
            prof.paper_list_url = paper_list_url
            save_paper_list('paper_list', prof, papers)
    return prof
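For context, a minimal usage sketch: the extractor arguments are plain callables over a bs4 Tag. Every selector and URL below is a hypothetical placeholder, not a real school's markup.
from bs4 import Tag

def extracts_title(tree: Tag):
    el = tree.select_one('span.job-title')  # hypothetical selector
    return None if el is None else el.text

def extracts_name(tree: Tag):
    return tree.select_one('h1.faculty-name').text  # hypothetical selector

prof = scrape_professor('Example University',
                        'https://www.example.edu/faculty/jane-doe',
                        extracts_title, extracts_name)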
Example #2
def get_papers(faculty_url, tree):
    name = get_name(tree)
    # download faculty directory, if not already downloaded
    global library_directory_tree
    if not library_directory_tree:
        library_directory_tree = get_tree(library_directory_url)
    # iterate through faculty names, looking for the best match
    anchors = css_select(library_directory_tree, 'table.table-striped a')
    closest_match = min(
        anchors,
        key=lambda x: editdistance.eval(name, strip_whitespace(x.text)))
    # require that the closest match be quite close before accepting it;
    # 3 characters would allow for a missing initial and period
    if editdistance.eval(name, strip_whitespace(closest_match.text)) > 3:
        return None, None
    # download bibliography page
    bib_url = closest_match.get('href')
    bib_tree = get_tree(bib_url)
    # find the "Published Works" section
    for heading in css_select(bib_tree, 'div.rich-text h2'):
        if 'Published' in heading.text:
            # keep collecting publication paragraphs until the next <h2>
            papers = []
            node = heading.next_element  # avoid shadowing the builtin 'next'
            while node:
                if node.name == 'p':
                    citation = strip_whitespace(node.text)
                    # drop the trailing link
                    citation = citation.split('http://')[0]
                    if len(citation) > 0:
                        papers.append(citation)
                elif node.name == 'h2':
                    break
                node = node.next_element
            return bib_url, papers
    return None, None
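The 3-edit threshold above is easy to sanity-check with the editdistance package: dropping a middle initial, its period, and the following space costs exactly three edits.
import editdistance

# "Q. " accounts for the three allowed edits
assert editdistance.eval('Jane Q. Doe', 'Jane Doe') == 3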
Example #3
def get_faculty_urls(directory_url, main_directory_tree):
    # ignore the main directory tree because it's paginated with AJAX;
    # fetch each department's listing page instead
    faculty_urls = []
    for dept in ['accounting', 'economics', 'finance', 'marketing',
                 'operations-information-technology', 'political-economy']:
        dept_tree = get_tree(
            'https://www.gsb.stanford.edu/faculty-research/faculty/academic-areas/'
            + dept)
        # the first section is for faculty, the second for lecturers
        faculty_div = css_select(
            dept_tree, 'div.pane-faculty-filters-faculty-by-criteria')[0]
        for a in css_select(faculty_div,
                            'div.view-id-faculty_filters div.views-field-title a'):
            faculty_urls.append('https://www.gsb.stanford.edu' + a.get('href'))
    return faculty_urls
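All of these examples lean on two shared helpers, get_tree and css_select, whose definitions are not shown here. A minimal sketch of what they could look like, assuming requests and BeautifulSoup; the real helpers may add retries, caching, or rate limiting.
import requests
from bs4 import BeautifulSoup

def get_tree(url):
    # fetch a page and parse it; return None on any network error,
    # matching the `tree is None` checks above
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException:
        return None
    return BeautifulSoup(response.text, 'html.parser')

def css_select(tree, selector):
    # thin wrapper over BeautifulSoup's CSS selector support
    return tree.select(selector)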
Example #4
def get_papers(url, tree):
    # find the link to "More Publications"
    more_pubs_url = HrefSelector('a.right-arrow', 'More Publications')(url, tree)
    if more_pubs_url is not None:
        papers = []
        p_tree = get_tree(more_pubs_url)
        for article in css_select(p_tree, 'article.publication--teaser'):
            if 'Article' in Selector('div.publication--teaser-type')(article):
                p_title = Selector('h2 a')(article)
                p_year = Selector('div.publication--teaser-year')(article)
                p_authors = Selector('div.publication--teaser-authors')(article)
                p_journal = Selector('div.publication--teaser-journal')(article)
                papers.append('%s. "%s." %s (%s).' %
                              (p_authors, p_title, p_journal, p_year))
        return more_pubs_url, papers
    return None, None
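Example #4 uses two small callable helpers, Selector and HrefSelector, whose implementations are not shown. A plausible sketch, assuming Selector returns the stripped text of the first CSS match and HrefSelector returns the absolute URL of the first matching link.
from urllib.parse import urljoin

class Selector:
    """Hypothetical helper: stripped text of the first CSS match, else ''."""
    def __init__(self, css):
        self.css = css

    def __call__(self, tag):
        el = tag.select_one(self.css)
        return '' if el is None else strip_whitespace(el.text)

class HrefSelector:
    """Hypothetical helper: absolute href of the first anchor matching
    `css` whose text contains `text`."""
    def __init__(self, css, text):
        self.css = css
        self.text = text

    def __call__(self, url, tag):
        for a in tag.select(self.css):
            if self.text in a.text:
                return urljoin(url, a.get('href'))
        return None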
Example #5
def get_papers(url, tree):
    # add "&facInfo=pub" to the faculty url to get the url for the publications tab
    pub_list_url = url + "&facInfo=pub"
    wait()
    pub_tree = get_tree(pub_list_url)
    # find the ordered list of publications
    for heading in css_select(pub_tree, '.tab-content h3'):
        if 'Articles' in heading.text:
            # look for the first <ol> under the publications header
            node = heading  # avoid shadowing the builtin 'next'
            while node is not None:
                if node.name == 'ol':
                    return pub_list_url, [
                        strip_whitespace(
                            div.text.replace('View Details', '')
                            .replace('Citation:', ''))
                        for div in css_select(node, 'div.citation')
                    ]
                node = node.next_element
    return None, None
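Example #5 calls a wait() helper before fetching; its body is not shown. A guess at its intent, consistent with the two-second sleep in scrape_professors below: a small randomized crawl delay.
import random
from time import sleep

def wait():
    # hypothetical: pause 2-3 seconds between requests to be polite
    sleep(2 + random.random())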
Example #6
def scrape_professors(school_name: str,
                      directory_url: str,
                      extracts_faculty_urls: Callable[[str, Tag], List[str]],
                      extracts_title: Callable[[Tag], str],
                      extracts_name: Callable[[Tag], str],
                      extracts_cv_url: Optional[Callable[[str, Tag], str]] = None,
                      extracts_personal_url: Optional[Callable[[str, Tag], str]] = None,
                      extracts_gscholar_url: Optional[Callable[[str, Tag], str]] = None,
                      extracts_papers: Optional[Callable[[str, Tag], Tuple[str, List[str]]]] = None) -> List[Professor]:
    """As a side effect, this function also writes each professor's publication list to disk."""
    profs = []
    # get the faculty index page
    tree = get_tree(directory_url)
    for faculty_url in extracts_faculty_urls(directory_url, tree):
        sleep(2)  # throttle requests so we don't hammer the directory server
        print("scraping " + faculty_url)
        p = scrape_professor(school_name, faculty_url,
                             extracts_title, extracts_name, extracts_cv_url,
                             extracts_personal_url, extracts_gscholar_url, extracts_papers)
        if p is not None:
            print(p)
            profs.append(p)
    return profs
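Putting it together: a hypothetical call wiring the Stanford GSB helpers from Examples #3 and #4 into scrape_professors; extracts_title and extracts_name here are the placeholder extractors sketched near the top, not real Stanford selectors.
profs = scrape_professors(
    school_name='Stanford',
    directory_url='https://www.gsb.stanford.edu/faculty-research/faculty',
    extracts_faculty_urls=get_faculty_urls,  # Example #3
    extracts_title=extracts_title,           # placeholder sketch above
    extracts_name=extracts_name,             # placeholder sketch above
    extracts_papers=get_papers)              # Example #4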
Example #7
def get_photo_url(prof):
    tree = get_tree(prof.faculty_directory_url)
    if tree is None:
        return None
    img_tags = css_select(tree, "img.profileImg")
    if len(img_tags) == 0:
        return None
    return "https://www.kellogg.northwestern.edu" + img_tags[0].get("src")