def scrape_upenn():
    """Scrape the Wharton (UPenn) faculty directory.

    Thin configuration wrapper: supplies the Wharton directory URL and the
    per-field extractors to ``scrape_professors`` and returns its result
    unchanged.
    """
    # The CV, personal-site and Google Scholar extractors share one CSS path
    # and differ only in the label handed to HrefSelector.
    research_links = 'div.wfp-header-research a'
    return scrape_professors(
        school_name='UPenn',
        directory_url='https://www.wharton.upenn.edu/faculty-directory/',
        extracts_faculty_urls=get_faculty_urls,
        extracts_title=Selector('ul.wfp-header-titles li:nth-of-type(1)'),
        extracts_name=Selector('div.wfp-header h1'),
        extracts_cv_url=HrefSelector(research_links, 'CV'),
        extracts_personal_url=HrefSelector(research_links, 'Personal Website'),
        extracts_gscholar_url=HrefSelector(research_links, 'Google Scholar'),
        extracts_papers=get_papers)
def scrape_berkeley():
    """Scrape the Berkeley Haas faculty directory.

    Thin configuration wrapper: supplies the Haas directory URL and the
    per-field extractors to ``scrape_professors`` and returns its result
    unchanged. No Google Scholar extractor is configured for this school.
    """
    # CV and personal-site extractors use the same CSS path with different
    # labels.
    profile_links = 'td p a'
    return scrape_professors(
        school_name='Berkeley',
        directory_url='http://facultybio.haas.berkeley.edu/faculty-photo/',
        extracts_faculty_urls=HrefListSelector('div.faculty-block p a[href]'),
        extracts_title=get_title,
        extracts_name=Selector('td p span strong'),
        extracts_cv_url=HrefSelector(profile_links, 'Curriculum Vitae'),
        extracts_personal_url=HrefSelector(profile_links, 'http'),
        extracts_gscholar_url=None,
        extracts_papers=get_papers)
def scrape_uchicago():
    """Scrape the Chicago Booth faculty directory.

    Thin configuration wrapper: builds the per-field extractors for Booth
    profile pages, hands them to ``scrape_professors`` and returns its result
    unchanged. No Google Scholar extractor is passed for this school.
    """
    # Extractors are built in the same order the call consumes them.
    faculty_links = HrefListSelector('div.faculty-listing-name a')
    job_title = Selector('div.faculty-bio-info h2')
    cv_link = HrefSelector('ul.resource-list a', 'Curriculum Vitae')
    website_link = HrefSelector('p.faculty-link-website a', 'Personal Website')
    return scrape_professors(
        school_name="Chicago",
        directory_url='https://www.chicagobooth.edu/faculty/directory',
        extracts_faculty_urls=faculty_links,
        extracts_title=job_title,
        extracts_name=get_name,
        extracts_cv_url=cv_link,
        extracts_personal_url=website_link,
        extracts_papers=get_papers)
def scrape_harvard():
    """Scrape the Harvard Business School faculty directory.

    Thin configuration wrapper: supplies the HBS directory URL and the
    per-field extractors to ``scrape_professors`` and returns its result
    unchanged. No Google Scholar extractor is passed for this school.
    """
    # Both link extractors read the same navigation block; each is given two
    # alternative labels.
    nav_links = 'div.faculty-navigation div.links a'
    return scrape_professors(
        school_name="Harvard",
        directory_url='http://www.hbs.edu/faculty/Pages/browse.aspx',
        extracts_faculty_urls=HrefListSelector('div.faculty-item a'),
        extracts_title=Selector('p.faculty-title'),
        extracts_name=Selector('h1.author'),
        extracts_cv_url=HrefSelector(nav_links, 'Curriculum Vitae', 'CV'),
        extracts_personal_url=HrefSelector(
            nav_links, 'Personal Website', 'Home Page'),
        extracts_papers=get_papers)
def scrape_stanford():
    """Scrape the Stanford GSB faculty directory.

    Thin configuration wrapper: supplies the GSB directory URL and the
    per-field extractors to ``scrape_professors`` and returns its result
    unchanged.
    """
    # The CV and Google Scholar extractors share this CSS path; the personal
    # website uses its own field.
    public_file_links = 'div.field-name-field-file-single-public a'
    return scrape_professors(
        school_name="Stanford",
        directory_url='https://www.gsb.stanford.edu/faculty-research/faculty',
        extracts_faculty_urls=get_faculty_urls,
        extracts_title=Selector('div.field-name-field-title-appointment'),
        extracts_name=get_name,
        extracts_cv_url=HrefSelector(public_file_links, 'CV'),
        extracts_personal_url=HrefSelector(
            'div.field-name-field-link-website a', 'Personal Website'),
        extracts_gscholar_url=HrefSelector(public_file_links, 'Google Scholar'),
        extracts_papers=get_papers)
def scrape_columbia():
    """Scrape the Columbia Business School faculty directory.

    Thin configuration wrapper: supplies the directory search URL and the
    per-field extractors to ``scrape_professors`` and returns its result
    unchanged. No Google Scholar extractor is configured for this school.
    """
    # Example page showing the CV and personal-website links:
    # http://www8.gsb.columbia.edu/cbs-directory/detail/ea1
    contact_links = 'div#contact_info a'
    return scrape_professors(
        school_name='Columbia',
        directory_url='http://www8.gsb.columbia.edu/faculty-research/faculty-directory?full_time=y&division=All&op=Search',
        extracts_faculty_urls=HrefListSelector('div.name a'),
        extracts_name=Selector('h1.primary-heading'),
        extracts_title=Selector('span.affiliation-title'),
        extracts_cv_url=HrefSelector(contact_links, 'Curriculum Vitae'),
        extracts_personal_url=HrefSelector(contact_links, 'Personal Website'),
        extracts_gscholar_url=None,
        extracts_papers=get_papers)
def scrape_mit():
    """Scrape the MIT Sloan faculty directory.

    Thin configuration wrapper: supplies the Sloan directory URL and the
    per-field extractors to ``scrape_professors`` and returns its result
    unchanged. No CV extractor is configured for this school.
    """
    # Personal-site and Google Scholar extractors read the same sidebar links
    # with different labels.
    sidebar_links = 'aside.faculty-side a'
    return scrape_professors(
        school_name="MIT",
        directory_url='http://mitsloan.mit.edu/faculty-and-research/faculty-directory/',
        extracts_faculty_urls=HrefListSelector('div.person-result a'),
        extracts_title=get_title,
        extracts_name=Selector('div.innerwrapper h3:nth-of-type(1)'),
        extracts_cv_url=None,
        extracts_personal_url=HrefSelector(sidebar_links, 'Personal Website'),
        extracts_gscholar_url=HrefSelector(sidebar_links, 'Google Scholar'),
        extracts_papers=get_papers)
def get_title(tree):
    """Return the job title for a profile page, diverting econ professors.

    If the page carries a 'Department of Economics website' link, that link
    is appended to the module-level ``econ_faculty_urls`` and an empty title
    is returned. Otherwise the text selected by ``h2.sub-title`` is returned.
    """
    # This hook doubles as the place where professors with a more detailed
    # econ-department page are recorded. The base URL passed here is a
    # placeholder, since only the tree is available at this point.
    find_econ_link = HrefSelector('a.lg-arrow-blue',
                                  'Department of Economics website')
    econ_url = find_econ_link('http://localhost', tree)
    if econ_url is None:
        return Selector('h2.sub-title')(tree)

    econ_faculty_urls.append(econ_url)
    # An empty job title makes the first scrape drop this professor.
    return ''
def scrape_kellogg():
    """Scrape the Kellogg (Northwestern) faculty directory.

    Thin configuration wrapper: builds the per-field extractors for Kellogg
    profile pages, hands them to ``scrape_professors`` and returns its result
    unchanged. Only a CV link extractor is passed; no personal-site or Google
    Scholar extractor is configured.
    """
    # Extractors are built in the same order the call consumes them.
    job_title = Selector('span#lblTitle')
    full_name = Selector('span#lblName')
    vita_link = HrefSelector('div#sideNav3 a', 'Download Vita')
    return scrape_professors(
        school_name="Northwestern",
        directory_url='http://www.kellogg.northwestern.edu/faculty/advanced_search.aspx',
        extracts_faculty_urls=get_kellogg_faculty_urls,
        extracts_title=job_title,
        extracts_name=full_name,
        extracts_cv_url=vita_link,
        extracts_papers=get_papers)
def scrape_yale():
    """Scrape Yale SOM faculty in two passes and return both result sets.

    Yale's econ professors have skeleton profiles in the school directory and
    more detailed ones on the econ department site, which uses a different
    page format, e.g.
    http://som.yale.edu/dirk-bergemann and
    http://economics.yale.edu/people/dirk-bergemann

    The first pass scrapes the school directory; as a side effect, its title
    extractor (``get_title``) collects the econ-department profile URLs into
    ``econ_faculty_urls``. The second pass scrapes those collected URLs.
    Econ professors are not wanted in the output, so every result of the
    second pass is marked ``hidden`` before the two results are concatenated.
    """
    directory = 'http://som.yale.edu/faculty-research/faculty-directory'
    som_info_links = 'ul.faculty--info-list li.url a'
    econ_info_links = 'div.group-right div.field-item a'

    def collected_econ_urls(url, tree):
        # The directory page itself is ignored; the URLs come from the list
        # filled in during the first pass.
        return econ_faculty_urls

    # NOTE(review): econ_faculty_urls is not reset here, so calling this
    # function more than once in a process would presumably re-scrape URLs
    # gathered by earlier calls -- confirm against its definition.
    som_profs = scrape_professors(
        school_name='Yale',
        directory_url=directory,
        extracts_faculty_urls=HrefListSelector('h4.faculty--teaser-name a'),
        extracts_name=Selector('h1.title'),
        extracts_title=get_title,
        # CV and Google Scholar example: http://som.yale.edu/victoria-l-brescoll
        extracts_cv_url=HrefSelector(som_info_links, 'CV'),
        # Website example: http://som.yale.edu/nicholas-c-barberis
        extracts_personal_url=HrefSelector(som_info_links, 'Website'),
        extracts_gscholar_url=HrefSelector(som_info_links, 'Google Scholar'),
        extracts_papers=get_papers)

    # Second pass: the econ department pages.
    dept_profs = scrape_professors(
        school_name='Yale',
        directory_url=directory,  # not used
        extracts_faculty_urls=collected_econ_urls,
        extracts_name=Selector('h1.title'),
        extracts_title=Selector('div.group-right div.field-item'),
        extracts_cv_url=HrefSelector(econ_info_links, 'CV'),
        extracts_personal_url=HrefSelector(econ_info_links, 'Website'))

    # Hide all the econ profs.
    for prof in dept_profs:
        prof.hidden = True

    return som_profs + dept_profs
def get_papers(url, tree):
    """Collect article citations from a profile's "More Publications" page.

    Args:
        url: URL of the profile page; passed to ``HrefSelector`` together
            with ``tree`` to locate the "More Publications" link.
        tree: Parsed profile page.

    Returns:
        A ``(more_pubs_url, papers)`` pair, where ``papers`` is a list of
        citation strings formatted as ``Authors. "Title." Journal (Year).``
        for every publication teaser whose type text contains 'Article'.
        Returns ``(None, None)`` when the profile has no "More Publications"
        link.
    """
    # Find the link to "More Publications".
    more_pubs_url = HrefSelector('a.right-arrow', 'More Publications')(url, tree)
    if more_pubs_url is None:
        return None, None

    # The selectors do not depend on the teaser being examined, so build them
    # once rather than on every iteration.
    select_type = Selector('div.publication--teaser-type')
    select_title = Selector('h2 a')
    select_year = Selector('div.publication--teaser-year')
    select_authors = Selector('div.publication--teaser-authors')
    select_journal = Selector('div.publication--teaser-journal')

    papers = []
    p_tree = get_tree(more_pubs_url)
    for article in css_select(p_tree, 'article.publication--teaser'):
        pub_type = select_type(article)
        # A teaser without a type element would make the membership test
        # raise TypeError if the selector yields None (presumed behavior of
        # Selector on a miss -- TODO confirm); skip such entries instead of
        # aborting the whole scrape.
        if not pub_type or 'Article' not in pub_type:
            continue
        p_title = select_title(article)
        p_year = select_year(article)
        p_authors = select_authors(article)
        p_journal = select_journal(article)
        papers.append('%s. "%s." %s (%s).' % (p_authors, p_title, p_journal,
                                              p_year))
    return more_pubs_url, papers