Ejemplo n.º 1
0
def scrape_upenn():
    """Scrape the Wharton (UPenn) faculty directory via ``scrape_professors``.

    Returns:
        Whatever ``scrape_professors`` returns for this configuration.
    """
    # The CV, personal-site and Google Scholar extractors share one CSS
    # selector and differ only in the second HrefSelector argument.
    research_links = 'div.wfp-header-research a'
    config = {
        'school_name': 'UPenn',
        'directory_url': 'https://www.wharton.upenn.edu/faculty-directory/',
        'extracts_faculty_urls': get_faculty_urls,
        'extracts_title': Selector('ul.wfp-header-titles li:nth-of-type(1)'),
        'extracts_name': Selector('div.wfp-header h1'),
        'extracts_cv_url': HrefSelector(research_links, 'CV'),
        'extracts_personal_url': HrefSelector(research_links,
                                              'Personal Website'),
        'extracts_gscholar_url': HrefSelector(research_links,
                                              'Google Scholar'),
        'extracts_papers': get_papers,
    }
    return scrape_professors(**config)
Ejemplo n.º 2
0
def scrape_kellogg():
    """Scrape the Kellogg (Northwestern) faculty search page.

    Only title, name and CV extractors are supplied; the personal-site and
    Google Scholar extractors are left to ``scrape_professors``' defaults.

    Returns:
        Whatever ``scrape_professors`` returns for this configuration.
    """
    search_page = (
        'http://www.kellogg.northwestern.edu/faculty/advanced_search.aspx')
    title_extractor = Selector('span#lblTitle')
    name_extractor = Selector('span#lblName')
    cv_extractor = HrefSelector('div#sideNav3 a', 'Download Vita')
    return scrape_professors(school_name="Northwestern",
                             directory_url=search_page,
                             extracts_faculty_urls=get_kellogg_faculty_urls,
                             extracts_title=title_extractor,
                             extracts_name=name_extractor,
                             extracts_cv_url=cv_extractor,
                             extracts_papers=get_papers)
Ejemplo n.º 3
0
def scrape_berkeley():
    """Scrape the Berkeley Haas faculty photo directory.

    No Google Scholar extractor is configured (it is passed as ``None``).

    Returns:
        Whatever ``scrape_professors`` returns for this configuration.
    """
    # Both the CV and the personal-site extractor look at the same anchors.
    profile_links = 'td p a'
    config = {
        'school_name': 'Berkeley',
        'directory_url': 'http://facultybio.haas.berkeley.edu/faculty-photo/',
        'extracts_faculty_urls': HrefListSelector(
            'div.faculty-block p a[href]'),
        'extracts_title': get_title,
        'extracts_name': Selector('td p span strong'),
        'extracts_cv_url': HrefSelector(profile_links, 'Curriculum Vitae'),
        'extracts_personal_url': HrefSelector(profile_links, 'http'),
        'extracts_gscholar_url': None,
        'extracts_papers': get_papers,
    }
    return scrape_professors(**config)
Ejemplo n.º 4
0
def scrape_uchicago():
    """Scrape the Chicago Booth faculty directory.

    No Google Scholar extractor is passed, so ``scrape_professors`` uses its
    default for that argument.

    Returns:
        Whatever ``scrape_professors`` returns for this configuration.
    """
    profile_urls = HrefListSelector('div.faculty-listing-name a')
    title_extractor = Selector('div.faculty-bio-info h2')
    cv_extractor = HrefSelector('ul.resource-list a', 'Curriculum Vitae')
    website_extractor = HrefSelector('p.faculty-link-website a',
                                     'Personal Website')
    return scrape_professors(
        school_name="Chicago",
        directory_url='https://www.chicagobooth.edu/faculty/directory',
        extracts_faculty_urls=profile_urls,
        extracts_title=title_extractor,
        extracts_name=get_name,
        extracts_cv_url=cv_extractor,
        extracts_personal_url=website_extractor,
        extracts_papers=get_papers,
    )
Ejemplo n.º 5
0
def scrape_harvard():
    """Scrape the Harvard Business School faculty browse page.

    The CV and personal-site extractors are each given two alternative
    strings after the CSS selector.

    Returns:
        Whatever ``scrape_professors`` returns for this configuration.
    """
    # Both link extractors search the same navigation anchors.
    nav_links = 'div.faculty-navigation div.links a'
    config = {
        'school_name': "Harvard",
        'directory_url': 'http://www.hbs.edu/faculty/Pages/browse.aspx',
        'extracts_faculty_urls': HrefListSelector('div.faculty-item a'),
        'extracts_title': Selector('p.faculty-title'),
        'extracts_name': Selector('h1.author'),
        'extracts_cv_url': HrefSelector(nav_links, 'Curriculum Vitae', 'CV'),
        'extracts_personal_url': HrefSelector(nav_links, 'Personal Website',
                                              'Home Page'),
        'extracts_papers': get_papers,
    }
    return scrape_professors(**config)
Ejemplo n.º 6
0
def scrape_stanford():
    """Scrape the Stanford GSB faculty directory.

    Returns:
        Whatever ``scrape_professors`` returns for this configuration.
    """
    # The CV and Google Scholar extractors share this selector; the personal
    # website lives under a different field.
    public_file_links = 'div.field-name-field-file-single-public a'
    title_extractor = Selector('div.field-name-field-title-appointment')
    cv_extractor = HrefSelector(public_file_links, 'CV')
    website_extractor = HrefSelector('div.field-name-field-link-website a',
                                     'Personal Website')
    scholar_extractor = HrefSelector(public_file_links, 'Google Scholar')
    return scrape_professors(
        school_name="Stanford",
        directory_url='https://www.gsb.stanford.edu/faculty-research/faculty',
        extracts_faculty_urls=get_faculty_urls,
        extracts_title=title_extractor,
        extracts_name=get_name,
        extracts_cv_url=cv_extractor,
        extracts_personal_url=website_extractor,
        extracts_gscholar_url=scholar_extractor,
        extracts_papers=get_papers,
    )
Ejemplo n.º 7
0
def scrape_columbia():
    """Scrape the Columbia Business School full-time faculty directory.

    No Google Scholar extractor is configured (it is passed as ``None``).

    Returns:
        Whatever ``scrape_professors`` returns for this configuration.
    """
    # Example page showing the CV and personal-website links:
    # http://www8.gsb.columbia.edu/cbs-directory/detail/ea1
    contact_links = 'div#contact_info a'
    config = {
        'school_name': 'Columbia',
        'directory_url': 'http://www8.gsb.columbia.edu/faculty-research/'
                         'faculty-directory?full_time=y&division=All&op=Search',
        'extracts_faculty_urls': HrefListSelector('div.name a'),
        'extracts_name': Selector('h1.primary-heading'),
        'extracts_title': Selector('span.affiliation-title'),
        'extracts_cv_url': HrefSelector(contact_links, 'Curriculum Vitae'),
        'extracts_personal_url': HrefSelector(contact_links,
                                              'Personal Website'),
        'extracts_gscholar_url': None,
        'extracts_papers': get_papers,
    }
    return scrape_professors(**config)
Ejemplo n.º 8
0
def scrape_mit():
    """Scrape the MIT Sloan faculty directory.

    No CV extractor is configured (it is passed as ``None``).

    Returns:
        Whatever ``scrape_professors`` returns for this configuration.
    """
    # Personal-site and Google Scholar links are both looked up in the
    # profile's side panel.
    side_panel_links = 'aside.faculty-side a'
    profile_urls = HrefListSelector('div.person-result a')
    name_extractor = Selector('div.innerwrapper h3:nth-of-type(1)')
    website_extractor = HrefSelector(side_panel_links, 'Personal Website')
    scholar_extractor = HrefSelector(side_panel_links, 'Google Scholar')
    return scrape_professors(
        school_name="MIT",
        directory_url=(
            'http://mitsloan.mit.edu/faculty-and-research/faculty-directory/'),
        extracts_faculty_urls=profile_urls,
        extracts_title=get_title,
        extracts_name=name_extractor,
        extracts_cv_url=None,
        extracts_personal_url=website_extractor,
        extracts_gscholar_url=scholar_extractor,
        extracts_papers=get_papers,
    )
Ejemplo n.º 9
0
def scrape_yale():
    """Scrape Yale SOM faculty, then the econ-department profiles.

    Two passes are made because Yale's econ department has its own pages in
    a different format: econ professors only have skeleton profiles in the
    school directory and more detailed ones in the department directory,
    e.g. http://som.yale.edu/dirk-bergemann
    and  http://economics.yale.edu/people/dirk-bergemann

    UPDATE: the econ professors should not be included after all, so every
    result of the second pass is marked ``hidden``.

    Returns:
        The first-pass results followed by the (hidden) econ results.
    """
    som_directory = 'http://som.yale.edu/faculty-research/faculty-directory'

    # First pass: the school directory. As a side effect this pass is meant
    # to collect the econ faculty URLs consumed by the second pass.
    # NOTE(review): the original comment names that collection
    # `econ_faculty_to_urls`, while the code below reads `econ_faculty_urls`;
    # neither is defined in this block -- confirm which name is correct.
    info_links = 'ul.faculty--info-list li.url a'
    som_profs = scrape_professors(
        school_name='Yale',
        directory_url=som_directory,
        extracts_faculty_urls=HrefListSelector('h4.faculty--teaser-name a'),
        extracts_name=Selector('h1.title'),
        extracts_title=get_title,
        # CV and Google Scholar example: http://som.yale.edu/victoria-l-brescoll
        extracts_cv_url=HrefSelector(info_links, 'CV'),
        # Website example: http://som.yale.edu/nicholas-c-barberis
        extracts_personal_url=HrefSelector(info_links, 'Website'),
        extracts_gscholar_url=HrefSelector(info_links, 'Google Scholar'),
        extracts_papers=get_papers,
    )

    # Second pass: the econ department site. The directory URL is not used
    # here because the faculty URLs come straight from the lambda.
    econ_field_items = 'div.group-right div.field-item'
    econ_profs = scrape_professors(
        school_name='Yale',
        directory_url=som_directory,  # not used
        extracts_faculty_urls=lambda url, tree: econ_faculty_urls,
        extracts_name=Selector('h1.title'),
        extracts_title=Selector(econ_field_items),
        extracts_cv_url=HrefSelector(econ_field_items + ' a', 'CV'),
        extracts_personal_url=HrefSelector(econ_field_items + ' a', 'Website'),
    )

    # Keep the econ professors in the output, but flag them as hidden.
    for econ_prof in econ_profs:
        econ_prof.hidden = True
    return som_profs + econ_profs