コード例 #1
0
ファイル: upenn.py プロジェクト: starzia/bibliometrics
def scrape_upenn():
    """Scrape the Wharton (UPenn) faculty directory.

    Configures ``scrape_professors`` with the page-specific extractors and
    returns its result unchanged.
    """
    # The CV, personal-website and Google Scholar links are all searched for
    # among the same anchors; HrefSelector's second argument picks which one.
    research_links = 'div.wfp-header-research a'
    return scrape_professors(
        school_name='UPenn',
        directory_url='https://www.wharton.upenn.edu/faculty-directory/',
        extracts_faculty_urls=get_faculty_urls,
        extracts_title=Selector('ul.wfp-header-titles li:nth-of-type(1)'),
        extracts_name=Selector('div.wfp-header h1'),
        extracts_cv_url=HrefSelector(research_links, 'CV'),
        extracts_personal_url=HrefSelector(research_links, 'Personal Website'),
        extracts_gscholar_url=HrefSelector(research_links, 'Google Scholar'),
        extracts_papers=get_papers,
    )
コード例 #2
0
def scrape_berkeley():
    """Scrape the Berkeley Haas faculty directory.

    Configures ``scrape_professors`` with the page-specific extractors and
    returns its result unchanged. No Google Scholar extractor is supplied.
    """
    profile_links = HrefListSelector('div.faculty-block p a[href]')
    name_selector = Selector('td p span strong')
    cv_selector = HrefSelector('td p a', 'Curriculum Vitae')
    personal_selector = HrefSelector('td p a', 'http')
    return scrape_professors(
        school_name='Berkeley',
        directory_url='http://facultybio.haas.berkeley.edu/faculty-photo/',
        extracts_faculty_urls=profile_links,
        extracts_title=get_title,
        extracts_name=name_selector,
        extracts_cv_url=cv_selector,
        extracts_personal_url=personal_selector,
        extracts_gscholar_url=None,
        extracts_papers=get_papers,
    )
コード例 #3
0
def scrape_uchicago():
    """Scrape the Chicago Booth faculty directory.

    Configures ``scrape_professors`` with the page-specific extractors and
    returns its result unchanged. No Google Scholar extractor is passed, so
    ``scrape_professors`` falls back to its own default for that argument.
    """
    profile_links = HrefListSelector('div.faculty-listing-name a')
    title_selector = Selector('div.faculty-bio-info h2')
    cv_selector = HrefSelector('ul.resource-list a', 'Curriculum Vitae')
    personal_selector = HrefSelector('p.faculty-link-website a',
                                     'Personal Website')
    return scrape_professors(
        school_name="Chicago",
        directory_url='https://www.chicagobooth.edu/faculty/directory',
        extracts_faculty_urls=profile_links,
        extracts_title=title_selector,
        extracts_name=get_name,
        extracts_cv_url=cv_selector,
        extracts_personal_url=personal_selector,
        extracts_papers=get_papers,
    )
コード例 #4
0
def scrape_harvard():
    """Scrape the Harvard Business School faculty directory.

    Configures ``scrape_professors`` with the page-specific extractors and
    returns its result unchanged.
    """
    # Both the CV and the personal-site link are looked up among the same
    # navigation anchors; each HrefSelector is given two alternative strings.
    nav_links = 'div.faculty-navigation div.links a'
    return scrape_professors(
        school_name="Harvard",
        directory_url='http://www.hbs.edu/faculty/Pages/browse.aspx',
        extracts_faculty_urls=HrefListSelector('div.faculty-item a'),
        extracts_title=Selector('p.faculty-title'),
        extracts_name=Selector('h1.author'),
        extracts_cv_url=HrefSelector(nav_links, 'Curriculum Vitae', 'CV'),
        extracts_personal_url=HrefSelector(nav_links, 'Personal Website',
                                           'Home Page'),
        extracts_papers=get_papers,
    )
コード例 #5
0
def scrape_stanford():
    """Scrape the Stanford GSB faculty directory.

    Configures ``scrape_professors`` with the page-specific extractors and
    returns its result unchanged.
    """
    # The CV and Google Scholar links are searched for in the same field;
    # the personal website lives in a separate one.
    public_file_links = 'div.field-name-field-file-single-public a'
    website_links = 'div.field-name-field-link-website a'
    return scrape_professors(
        school_name="Stanford",
        directory_url='https://www.gsb.stanford.edu/faculty-research/faculty',
        extracts_faculty_urls=get_faculty_urls,
        extracts_title=Selector('div.field-name-field-title-appointment'),
        extracts_name=get_name,
        extracts_cv_url=HrefSelector(public_file_links, 'CV'),
        extracts_personal_url=HrefSelector(website_links, 'Personal Website'),
        extracts_gscholar_url=HrefSelector(public_file_links,
                                           'Google Scholar'),
        extracts_papers=get_papers,
    )
コード例 #6
0
ファイル: columbia.py プロジェクト: starzia/bibliometrics
def scrape_columbia():
    """Scrape the Columbia Business School faculty directory.

    Configures ``scrape_professors`` with the page-specific extractors and
    returns its result unchanged. No Google Scholar extractor is supplied.
    """
    search_url = ('http://www8.gsb.columbia.edu/faculty-research/'
                  'faculty-directory?full_time=y&division=All&op=Search')
    # for CV and personal website, see http://www8.gsb.columbia.edu/cbs-directory/detail/ea1
    contact_links = 'div#contact_info a'
    return scrape_professors(
        school_name='Columbia',
        directory_url=search_url,
        extracts_faculty_urls=HrefListSelector('div.name a'),
        extracts_name=Selector('h1.primary-heading'),
        extracts_title=Selector('span.affiliation-title'),
        extracts_cv_url=HrefSelector(contact_links, 'Curriculum Vitae'),
        extracts_personal_url=HrefSelector(contact_links, 'Personal Website'),
        extracts_gscholar_url=None,
        extracts_papers=get_papers,
    )
コード例 #7
0
def scrape_mit():
    """Scrape the MIT Sloan faculty directory.

    Configures ``scrape_professors`` with the page-specific extractors and
    returns its result unchanged. No CV extractor is supplied.
    """
    # Personal-website and Google Scholar links are both looked up among the
    # sidebar anchors.
    sidebar_links = 'aside.faculty-side a'
    return scrape_professors(
        school_name="MIT",
        directory_url='http://mitsloan.mit.edu/faculty-and-research/faculty-directory/',
        extracts_faculty_urls=HrefListSelector('div.person-result a'),
        extracts_title=get_title,
        extracts_name=Selector('div.innerwrapper h3:nth-of-type(1)'),
        extracts_cv_url=None,
        extracts_personal_url=HrefSelector(sidebar_links, 'Personal Website'),
        extracts_gscholar_url=HrefSelector(sidebar_links, 'Google Scholar'),
        extracts_papers=get_papers,
    )
コード例 #8
0
ファイル: yale.py プロジェクト: starzia/bibliometrics
def get_title(tree):
    """Return the job title found in ``tree``, or '' for economics faculty.

    As a side effect, when the page links to a "Department of Economics
    website", that link is appended to the module-level ``econ_faculty_urls``
    list so those professors can be handled separately.
    """
    econ_link_selector = HrefSelector('a.lg-arrow-blue',
                                      'Department of Economics website')
    # Only the tree is available here, so a placeholder URL is passed as the
    # selector's first argument.
    econ_url = econ_link_selector('http://localhost', tree)
    if econ_url is None:
        return Selector('h2.sub-title')(tree)

    # We hook in here to keep track of econ professors, who have
    # more-detailed econ dept pages.
    econ_faculty_urls.append(econ_url)
    # An empty job title makes the first scrape drop this professor.
    return ''
コード例 #9
0
ファイル: kellogg.py プロジェクト: starzia/bibliometrics
def scrape_kellogg():
    """Scrape the Kellogg (Northwestern) faculty directory.

    Configures ``scrape_professors`` with the page-specific extractors and
    returns its result unchanged. Only a CV link extractor is supplied; the
    personal-website and Google Scholar arguments are left to
    ``scrape_professors``' defaults.
    """
    search_url = 'http://www.kellogg.northwestern.edu/faculty/advanced_search.aspx'
    title_selector = Selector('span#lblTitle')
    name_selector = Selector('span#lblName')
    cv_selector = HrefSelector('div#sideNav3 a', 'Download Vita')
    return scrape_professors(
        school_name="Northwestern",
        directory_url=search_url,
        extracts_faculty_urls=get_kellogg_faculty_urls,
        extracts_title=title_selector,
        extracts_name=name_selector,
        extracts_cv_url=cv_selector,
        extracts_papers=get_papers,
    )
コード例 #10
0
ファイル: yale.py プロジェクト: starzia/bibliometrics
def scrape_yale():
    """Scrape Yale SOM faculty, plus economics-department faculty as hidden.

    Two passes are made because Yale's econ dept has its own set of pages
    with a different format: econ profs have skeleton profiles in the school
    directory and more detailed ones in the dept directory, e.g.
    http://som.yale.edu/dirk-bergemann
    and http://economics.yale.edu/people/dirk-bergemann
    It turned out that the econ profs should not be included, so every
    professor from the second pass is marked ``hidden``.

    Returns the first-pass results concatenated with the (hidden) second-pass
    results.
    """
    som_directory = 'http://som.yale.edu/faculty-research/faculty-directory'
    # CV, website and Google Scholar links share one list on SOM profiles.
    # for CV and GS, see: http://som.yale.edu/victoria-l-brescoll
    # for website, see: http://som.yale.edu/nicholas-c-barberis
    info_links = 'ul.faculty--info-list li.url a'

    # Pass 1: the school directory. As a side effect (through get_title),
    # this populates the econ_faculty_urls list consumed by pass 2.
    som_profs = scrape_professors(
        school_name='Yale',
        directory_url=som_directory,
        extracts_faculty_urls=HrefListSelector('h4.faculty--teaser-name a'),
        extracts_name=Selector('h1.title'),
        extracts_title=get_title,
        extracts_cv_url=HrefSelector(info_links, 'CV'),
        extracts_personal_url=HrefSelector(info_links, 'Website'),
        extracts_gscholar_url=HrefSelector(info_links, 'Google Scholar'),
        extracts_papers=get_papers,
    )

    # Pass 2: the econ dept pages collected above. The faculty-URL extractor
    # ignores the directory page and simply returns the collected list.
    hidden_profs = scrape_professors(
        school_name='Yale',
        directory_url=som_directory,
        extracts_faculty_urls=lambda url, tree: econ_faculty_urls,
        extracts_name=Selector('h1.title'),
        extracts_title=Selector('div.group-right div.field-item'),
        extracts_cv_url=HrefSelector('div.group-right div.field-item a', 'CV'),
        extracts_personal_url=HrefSelector('div.group-right div.field-item a',
                                           'Website'),
    )

    # Hide all the econ profs.
    for prof in hidden_profs:
        prof.hidden = True
    return som_profs + hidden_profs
コード例 #11
0
ファイル: yale.py プロジェクト: starzia/bibliometrics
def get_papers(url, tree):
    """Collect article citations from a profile's "More Publications" page.

    Args:
        url: URL of the faculty profile page; passed to HrefSelector together
            with ``tree`` to locate the "More Publications" link.
        tree: Parsed faculty profile page.

    Returns:
        A ``(more_pubs_url, papers)`` tuple, where ``papers`` is a list of
        citation strings formatted as ``Authors. "Title." Journal (Year).``
        for every teaser whose type contains 'Article'. Returns
        ``(None, None)`` when the profile has no "More Publications" link.
    """
    # Find the link to "More Publications".
    more_pubs_url = HrefSelector('a.right-arrow', 'More Publications')(url,
                                                                       tree)
    if more_pubs_url is None:
        return None, None

    # Build the field extractors once rather than once per teaser.
    select_type = Selector('div.publication--teaser-type')
    select_title = Selector('h2 a')
    select_year = Selector('div.publication--teaser-year')
    select_authors = Selector('div.publication--teaser-authors')
    select_journal = Selector('div.publication--teaser-journal')

    papers = []
    pubs_tree = get_tree(more_pubs_url)
    for article in css_select(pubs_tree, 'article.publication--teaser'):
        pub_type = select_type(article)
        # Skip teasers without a type label instead of failing on the
        # membership test (assumes Selector yields None when nothing
        # matches -- TODO confirm against Selector's implementation).
        if not pub_type or 'Article' not in pub_type:
            continue
        p_title = select_title(article)
        p_year = select_year(article)
        p_authors = select_authors(article)
        p_journal = select_journal(article)
        papers.append('%s. "%s." %s (%s).' %
                      (p_authors, p_title, p_journal, p_year))
    return more_pubs_url, papers