Code example #1
def scrape_profile(self, author_url) -> List[Paper]:
    self.get_page(author_url)
    # click the "show more" button until it disappears
    while True:
        try:
            button = self.selenium_driver.find_element_by_css_selector(
                'button#gsc_bpf_more:enabled')
            if button:
                button.click()
                self.wait_for_captchas()
                wait()
            else:
                # if no enabled button is found, then we're done
                break
        except (NoSuchElementException, ElementNotVisibleException,
                InvalidElementStateException):
            break
    # load the page in Beautiful Soup for easier parsing
    tree = tree_from_string(self.selenium_driver.page_source)
    # scrape the list of papers
    papers = []
    for row in css_select(tree, 'tr.gsc_a_tr'):
        title = Selector('td.gsc_a_t a')(row)
        authors_and_venue = css_select(row, 'div.gs_gray')
        author = authors_and_venue[0].text
        venue = authors_and_venue[1].text
        year = Selector('td.gsc_a_y')(row)
        citation_count = Selector('td.gsc_a_c a.gsc_a_ac')(row)
        # a struck-out citation count marks the paper as a duplicate
        if Selector('td.gsc_a_c a.gsc_a_acm')(row):
            continue
        papers.append(Paper(title, author, venue, year, citation_count))
    return papers
Code example #2
def get_title(tree):
    # We don't know exactly where the title will be so we keep looking until we find one that matches.
    for candidate in [
            strip_whitespace(c.text)
            for c in css_select(tree, 'ul#ctl00_content_titles li i') +
            css_select(tree, 'ul#ctl00_content_areas li')
    ]:
        if candidate is not None and is_job_title(candidate):
            return candidate
    return None
Code example #3
def get_papers(url, tree):
    # find the bulleted list for publications
    for heading in css_select(tree, '#center-col strong'):
        if 'Publications' in heading.text:
            # look for the first <UL> under the publications header
            next = heading
            while next is not None:
                if next.name == 'ul':
                    return url, [
                        strip_whitespace(li.text.replace(' PDF.', ''))
                        for li in css_select(next, 'li')
                    ]
                next = next.next_element
    return None, None
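Several of the get_papers variants below repeat this same traversal: locate a heading, then follow the parse tree's next_element chain to the first tag of a given name. A generic version of the pattern might look like this (the helper name first_tag_after is hypothetical, not from the project):

def first_tag_after(heading, tag_name):
    # walk forward in document order from the heading and return
    # the first tag with the given name, or None if there is none
    node = heading.next_element
    while node is not None:
        if node.name == tag_name:
            return node
        node = node.next_element
    return None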
Code example #4
File: kellogg.py Project: starzia/bibliometrics
def get_papers(faculty_url, tree):
    for e in css_select(tree, 'div.leftResearch div.entries'):
        # check that we're in the right section
        if ('tabSubheading' in e.previous_sibling['class']
                and e.previous_sibling.text != 'Articles'):
            break
        paper_list = []
        for c in css_select(e, 'div.entry div.copy'):
            citation = c.text
            # some articles have an abstract, which I want to ignore
            for abstract in css_select(c, 'div'):
                citation = citation.replace(abstract.text, '')
            paper_list.append(strip_whitespace(citation))
        return faculty_url + '#research', paper_list
    return None, None
Code example #5
def get_papers(url, tree):
    # find the list of Journal Articles
    for heading in css_select(tree,
                              'div.view-gsb-publications-listing h2.title'):
        if 'Journal Articles' in heading.text:
            # look for the first <div class="view-content"> under the Journal Articles header
            candidate = heading
            while candidate is not None:
                if (candidate.name == 'div'
                        and 'view-content' in (candidate.get('class') or [])):
                    return url, [
                        strip_whitespace(row.get_text())
                        for row in css_select(candidate, 'div.views-row')
                    ]
                candidate = candidate.next_element
    return None, None
Code example #6
def get_faculty_urls(directory_url, main_directory_tree):
    # ignore the main directory because it's paginated with ajax
    faculty_urls = []
    for dept in [
            'accounting', 'economics', 'finance', 'marketing',
            'operations-information-technology', 'political-economy'
    ]:
        dept_tree = get_tree(
            'https://www.gsb.stanford.edu/faculty-research/faculty/academic-areas/'
            + dept)
        # the first section is for Faculty, the second is for lecturers
        faculty_div = css_select(
            dept_tree, 'div.pane-faculty-filters-faculty-by-criteria')[0]
        for a in css_select(
                faculty_div,
                'div.view-id-faculty_filters div.views-field-title a'):
            faculty_urls.append('https://www.gsb.stanford.edu' + a.get('href'))
    return faculty_urls
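get_tree and wait are also left undefined in these listings; presumably get_tree fetches and parses a page, and wait throttles requests between fetches. A sketch under those assumptions (the delay range is invented):

import random
import time
import requests

def wait():
    # rate-limit scraping with a randomized pause (assumed behavior)
    time.sleep(random.uniform(2, 5))

def get_tree(url):
    # fetch a page and parse it into a BeautifulSoup tree
    response = requests.get(url)
    response.raise_for_status()
    return tree_from_string(response.text)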
Code example #7
File: upenn.py Project: starzia/bibliometrics
def get_faculty_urls(directory_url, tree):
    urls = []
    # find the div that follows the "FACULTY BY NAME" heading, then look under it
    directory_div = False  # True once the heading is seen; then the next div
    for div in css_select(tree, 'div.vc_row-fluid'):
        if directory_div:
            directory_div = div
            break
        for h4 in css_select(div, 'h4'):
            if 'FACULTY BY NAME' in h4.text:
                directory_div = True
    for a in css_select(directory_div, 'div.wpb_wrapper a'):
        url = a.get('href')
        if url is not None:
            if "wharton.upenn.edu" in url:
                urls.append(url.strip())
            else:
                print("WARNING: dropping non-Wharton faculty: " + url)
    return urls
Code example #8
def get_papers(url, tree):
    # add "&facInfo=pub" to the faculty url to get the url for the publications tab
    pub_list_url = url + "&facInfo=pub"
    wait()
    pub_tree = get_tree(pub_list_url)
    # find the bulleted list for publications
    for heading in css_select(pub_tree, '.tab-content h3'):
        if 'Articles' in heading.text:
            # look for the first <OL> under the publications header
            next = heading
            while next is not None:
                if next.name == 'ol':
                    return pub_list_url, [
                        strip_whitespace(
                            li.text.replace('View Details',
                                            '').replace('Citation:', ''))
                        for li in css_select(next, 'div.citation')
                    ]
                next = next.next_element
    return None, None
Code example #9
File: kellogg.py Project: starzia/bibliometrics
def get_kellogg_faculty_urls(directory_url, tree):
    """Parse a drop-down selection."""
    urls = []
    for option in css_select(
            tree, 'select#plcprimarymaincontent_1_selBrowseByName option'):
        if option.get('value') != '':
            net_id = option.get('value')
            urls.append(
                'http://www.kellogg.northwestern.edu/Faculty/Faculty_Search_Results.aspx?netid='
                + net_id)
    return urls
Code example #10
File: columbia.py Project: starzia/bibliometrics
def get_papers(url, tree):
    papers = []
    # find the list of divs for journal publications
    for heading in css_select(tree, 'h3'):
        if 'Journal articles' in heading.text:
            # keep collecting publication paragraphs until the "Awards and Honors" section
            next = heading
            while next is not None:
                if next.name == 'p':
                    papers.append(strip_whitespace(next.text))
                elif next.name == 'div' and 'Awards and Honors' in next.text:
                    break
                next = next.next_element
    return url + '#research', papers
Code example #11
def get_papers(faculty_url, tree):
    name = get_name(tree)
    # download faculty directory, if not already downloaded
    global library_directory_tree
    if not library_directory_tree:
        library_directory_tree = get_tree(library_directory_url)
    # iterate through faculty names, looking for the best match
    anchors = css_select(library_directory_tree, 'table.table-striped a')
    closest_match = min(
        anchors,
        key=lambda x: editdistance.eval(name, strip_whitespace(x.text)))
    # require that the closest match be quite close before accepting it
    # (a distance of 3 allows for a missing initial, its period, and a space)
    if editdistance.eval(name, closest_match.text) > 3:
        return None, None
    # download bibliography page
    bib_url = closest_match.get('href')
    bib_tree = get_tree(bib_url)
    # find the "Published Works" section
    for heading in css_select(bib_tree, 'div.rich-text h2'):
        if 'Published' in heading.text:
            # keep collecting the publication divs until we get to the next <H2>
            papers = []
            next = heading.next_element
            while next:
                if next.name == 'p':
                    citation = strip_whitespace(next.text)
                    # drop trailing link
                    citation = citation.split('http://')[0]
                    if len(citation) > 0:
                        papers.append(citation)
                elif next.name == 'h2':
                    break
                next = next.next_element
            return bib_url, papers
    return None, None
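The fuzzy matching uses the editdistance package, whose eval function returns the Levenshtein distance between two strings. The threshold of 3 corresponds to dropping a middle initial:

import editdistance

# dropping "I. " costs three edits: the initial, the period, and the space
editdistance.eval('Nabil I. Al-Najjar', 'Nabil Al-Najjar')  # -> 3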
Code example #12
File: yale.py Project: starzia/bibliometrics
def get_papers(url, tree):
    # find the link to "More Publications"
    more_pubs_url = HrefSelector('a.right-arrow', 'More Publications')(url, tree)
    if more_pubs_url is not None:
        papers = []
        p_tree = get_tree(more_pubs_url)
        for article in css_select(p_tree, 'article.publication--teaser'):
            if 'Article' in Selector('div.publication--teaser-type')(article):
                p_title = Selector('h2 a')(article)
                p_year = Selector('div.publication--teaser-year')(article)
                p_authors = Selector('div.publication--teaser-authors')(
                    article)
                p_journal = Selector('div.publication--teaser-journal')(
                    article)
                papers.append('%s. "%s." %s (%s).' %
                              (p_authors, p_title, p_journal, p_year))
        return more_pubs_url, papers
    return None, None
Code example #13
File: upenn.py Project: starzia/bibliometrics
def get_papers(url, tree):
    # An ajax call returns JSON with publication info,
    # e.g., see https://fnce.wharton.upenn.edu/profile/abel/#research
    url = css_select(tree, 'link[rel="canonical"]')[0].get('href')
    if url.endswith('/'):
        # extract from https://statistics.wharton.upenn.edu/profile/bhaswar/
        user_id = url.split('/')[-2]
    else:
        # extract from https://www.wharton.upenn.edu/faculty/binsbergen.cfm
        user_id = url.replace('.cfm', '').split('/')[-1]
    json = get_json('https://faculty.wharton.upenn.edu/wp-json/wfp/v2/publication/?author=%s&per_page=500&page=1' % user_id)
    if 'data' not in json:
        return None, None
    citations = []
    for paper in json['data']:
        if paper['type'] == 'wfp_pubpubpaper':  # published papers only
            # The 'citation' attribute contains an html-formatted citation. We just convert it to plain text.
            citations.append(tree_from_string(paper['citation']).get_text())
    return url + '#research', citations
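The user_id extraction covers the two canonical-URL shapes named in the comments, and get_json is presumably a thin wrapper that fetches a URL and decodes the JSON body. For example:

import requests

def get_json(url):
    # assumed: fetch a URL and decode the JSON response
    return requests.get(url).json()

# trailing slash: take the second-to-last path segment
'https://statistics.wharton.upenn.edu/profile/bhaswar/'.split('/')[-2]
# -> 'bhaswar'

# .cfm page: strip the extension and take the last segment
'https://www.wharton.upenn.edu/faculty/binsbergen.cfm'.replace('.cfm', '').split('/')[-1]
# -> 'binsbergen'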
Code example #14
def get_title(tag):
    for e in css_select(tag, 'td p')[0].children:
        if isinstance(e, NavigableString):
            e = strip_whitespace(e)
            if len(e) > 0:
                return e
Code example #15
def get_photo_url(prof):
    img_tags = css_select(get_tree(prof.faculty_directory_url),
                          "img.profileImg")
    if len(img_tags) == 0:
        return None
    return "https://www.kellogg.northwestern.edu" + img_tags[0].get("src")
Code example #16
def get_name(tree):
    return ' '.join([
        span.text for span in css_select(tree, 'div.group-wrapper-name span')
    ])
Code example #17
    def scrape_search_results(self, prof: Professor) -> List[Paper]:
        """In this case, we save all articles, even those we are not sure match the author.
        We search only the past ten years (2007 and later), include only the first 100 pages of
        results, and keep only papers with at least one citation in Google Scholar (to save time)."""
        # parse each page of results, up to at most 1000 articles (100 pages)
        papers = []
        # for each page of results
        for start in range(0, 1000, 10):
            result_row_info = []
            # get search results page
            self.get_page(
                'https://scholar.google.com/scholar?start=%d&as_ylo=%s&q=author%%3A"%s"+%s'
                % (start, STARTING_YEAR, urllib.parse.quote(
                    prof.simple_name()), prof.school))

            # We get the GS and WoS citation counts from the search results page
            # We get the full citation information by virtually clicking the "cite" link for each article
            tree = tree_from_string(self.selenium_driver.page_source)
            for row in css_select(tree, 'div.gs_r div.gs_ri'):
                scholar_citations = None
                wos_citations = None
                citation_id = None
                for link in css_select(row, 'div.gs_fl a'):
                    if 'Cited by' in link.text:
                        scholar_citations = link.text.split(' ')[-1]
                    elif 'Web of Science:' in link.text:
                        wos_citations = link.text.split(': ')[-1]
                    elif 'Related articles' in link.text:
                        citation_id = link.get('href').split(":")[1]
                # stop at the first paper with no citations (we only keep cited papers)
                if not scholar_citations:
                    break
                result_row_info.append({
                    'scholar_citations': scholar_citations,
                    'wos_citations': wos_citations,
                    'citation_id': citation_id
                })
            # stop when we've gone past the end of results
            if len(result_row_info) == 0:
                break

            # fetch each citation and pick out the Chicago format because it has full first names
            # and includes all the author names (or at least more of them before using "et al.")
            # eg., https://scholar.google.com/scholar?q=info:J2Uvx00ui50J:scholar.google.com/&output=cite&scirp=1&hl=en
            for r in result_row_info:
                self.get_page(
                    'https://scholar.google.com/scholar?q=info:%s:scholar.google.com/'
                    '&output=cite&scirp=1&hl=en' % r['citation_id'])
                # the third <td> cell in the table contains the Chicago-style citation
                citation = self.selenium_driver.find_elements_by_css_selector(
                    'td')[2].text
                year = get_year(citation)
                if not year:
                    continue
                # look for the first period that is not part of a middle initial
                match = re.search(r"\w{2}\. ", citation)
                if not match:
                    # otherwise, just take the first period as in:
                    # Al-Najjar, Nabil I. "A bayesian framework for precautionary policies." (2013).
                    match = re.search(r"\. ", citation)
                authors = citation[:match.end()]
                # venue is in italics
                try:
                    venue = self.selenium_driver.find_elements_by_css_selector('td')[2]\
                                                .find_element_by_css_selector('i').text
                except NoSuchElementException:
                    # this is probably a working paper
                    continue
                match = re.findall(
                    r"\"(.*)\"", citation)  # article titles are inside quotes
                if len(match) == 0:
                    # this is a book, which we don't record
                    continue
                title = match[0]
                papers.append(
                    Paper(title=title,
                          authors=authors,
                          venue=venue,
                          year=year,
                          scholar_citations=r['scholar_citations'],
                          wos_citations=r['wos_citations'],
                          id=r['citation_id']))
        return papers
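The author/title parsing is easiest to follow on the sample citation from the comments above:

import re

citation = ('Al-Najjar, Nabil I. "A bayesian framework for '
            'precautionary policies." (2013).')
# r"\w{2}\. " finds nothing here (the only period followed by a space
# comes after the one-letter initial "I"), so the fallback r"\. "
# matches the first period instead:
match = re.search(r"\w{2}\. ", citation) or re.search(r"\. ", citation)
authors = citation[:match.end()]              # 'Al-Najjar, Nabil I. '
title = re.findall(r"\"(.*)\"", citation)[0]
# 'A bayesian framework for precautionary policies.'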
Code example #18
def get_papers(url, tree):
    # find the bulleted list for publications
    return url, [
        strip_whitespace(li.text)
        for li in css_select(tree, 'div#tabs-1 ul li')
    ]