Example 1
    def __init__(self, soup, verbose=False):

        # Get entry content information
        mainContent = soup.find('div', {'class' : 'ArticleHeader'})
        if mainContent is None:
            raise ParseException('Unable to find main content of page')


        # Metadata:
        # --------------
        self.title = findValue(mainContent, 'h1', 'ArticleTitle', 'class')
        if self.title is not None:
            self.title = self.title.title()

        self.publication = findValue(mainContent, 'span', 'JournalTitle', 'class')

        # Two dates are given: original publication date and
        # online publication date. This returns the original journal pub date.
        yearwrapper = mainContent.find('span', {'class' : 'ArticleCitation_Year'})
        self.date = yearwrapper.find('time').text
        self.year = self.date[-4:]

        self.volume = findValue(mainContent, 'span', 'ArticleCitation_Volume', 'class')

        # SpringerLink doesn't seem to list article pages
        self.pages = None

        # SpringerLink keeps keywords below the abstract, separate from header info
        keybox = soup.find('div', {'class' : 'KeywordGroup'})
        if keybox is None:
            raise ParseException('Unable to find keywords')
        wordlist = keybox.find_all('span', {'class' : 'Keyword'})
        self.keywords = [w.text for w in wordlist]


        # DOI Retrieval:
        # --------------
        # This might be more reliable than assuming we have the DOI in the title
        self.doi = findValue(mainContent, 'p', 'article-doi', 'class')
        # Slice from '10.' onward to drop the leading label text
        doi_startindex = self.doi.find('10.')
        if doi_startindex != -1:
            self.doi = self.doi[doi_startindex:]
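        # e.g. (hypothetical value) 'DOI  10.1007/s00000-000-0000-0'
        # would yield '10.1007/s00000-000-0000-0'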


        # Authors:
        # --------
        # Find list items within the unordered list with class 'authors'.
        # Need to find only classless li's so that this doesn't also retrieve
        # the child li's corresponding to author affiliations at this stage.
        authorList = mainContent.find('ul', {'class' : 'authors'}).find_all('li', {'class' : None})
        self.authors = [SpringerAuthor(x) for x in authorList]
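The helper findValue and the ParseException class are used throughout these examples but never defined in them. Below is a minimal sketch of what they might look like, inferred from the call sites above; the signature and behavior are assumptions, not the package's actual implementation:

class ParseException(Exception):
    # Assumed definition: raised when an expected element is missing from the page
    pass

def findValue(tag, element, label_name=None, label_type=None):
    # Assumed helper: find the first <element> under 'tag' whose attribute
    # 'label_type' (e.g. 'class' or 'id') equals 'label_name', and return
    # its text, or None if no such element exists.
    if label_name is None:
        result = tag.find(element)
    else:
        result = tag.find(element, {label_type: label_name})
    return result.text if result is not None else None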
Example 2
    def __init__(self, ref_tags, ref_id):

        """

        Parameters:
        -----------
        ref_tags: bs4.element.Tag
            Html tags as soup of the reference. Information provided is that
            needed in order to form a citation for the given reference.
        ref_id: int
            The id of the reference as ordered in the citing entry. A value
            of 1 indicates that this object is the first reference in the bibliography.


        """
        super().__init__()
        self.ref_tags = ref_tags

        # Reference Bibliography Section:
        # --------------------------------
        self.ref_id = ref_id + 1 # Input is 0 indexed

        self.title = findValue(ref_tags, 'span', 'articleTitle', 'class')
        authorlist = ref_tags.find_all('span', {'class' : 'author'})
        self.authors = [x.text for x in authorlist]

        self.publication = findValue(ref_tags, 'span', 'journalTitle', 'class')
        self.volume = findValue(ref_tags, 'span', 'vol', 'class')
        self.date = findValue(ref_tags, 'span', 'pubYear', 'class')

        firstp = findValue(ref_tags, 'span', 'pageFirst', 'class')
        lastp = findValue(ref_tags, 'span', 'pageLast', 'class')
        if (firstp is not None) and (lastp is not None):
            self.pages = firstp + '-' + lastp
        else:
            self.pages = None

        # Reference Meta Section:
        # ------------------------------

        self.crossref = None
        self.pubmed = None
        self.pubmed_central = None
        self.doi = None


        # External links (e.g. PubMed, CrossRef) are kept in a span tag
        links = ref_tags.find('span', {'class' : 'Occurrences'})

        # Only proceed if either internal or external references were found
        if links is not None:
            links = links.find_all('span', {'class' : 'Occurrence'})

            # Check against all possible link options and save links.
            #
            # NOTE: links are returned URL-encoded (using urllib.quote()),
            # but DOI and PubMed IDs are not encoded. This means that when
            # extracting the DOI from one of the returned URLs, it must
            # first be unquoted.
            #
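            # For instance (hypothetical URL; a minimal sketch of the
            # unquoting step using the standard library):
            #   from urllib.parse import unquote
            #   url = 'https://doi.org/10.1007%2Fs00000-000-0000-0'
            #   doi = unquote(url[url.find('10.'):])  # -> '10.1007/s00000-000-0000-0'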
            for link in links:
                href = link.find('a', href=True)['href']

                if 'OccurrenceDOI' in link['class']:
                    self.crossref = href
                elif 'OccurrencePID' in link['class']:
                    self.pubmed = href
                elif 'OccurrencePMCID' in link['class']:
                    self.pubmed_central = href
Example 3
    def __init__(self, ref_tags, ref_id):

        """

        Parameters:
        -----------
        ref_tags: bs4.element.Tag
            Html tags as soup of the reference. Information provided is that
            needed in order to form a citation for the given reference.
        ref_id: int
            The id of the reference as ordered in the citing entry. A value
            of 1 indicates that this object is the first reference in the bibliography.


        """

        # Reference Bibliography Section:
        #--------------------------------
        self.ref_id = ref_id + 1 # Input is 0 indexed
        self.title = findValue(ref_tags, 'span', 'articleTitle', 'class')
        authorlist = ref_tags.find_all('span', {'class' : 'author'})
        self.authors = [x.text for x in authorlist]

        # Note: on Wiley, each reference author is given a separate <span> tag
        # with the class 'author', so individual authors can be extracted.

        self.publication = findValue(ref_tags, 'span', 'journalTitle', 'class')
        self.volume = findValue(ref_tags, 'span', 'vol', 'class')
        self.date = findValue(ref_tags, 'span', 'pubYear', 'class')

        firstp = findValue(ref_tags, 'span', 'pageFirst', 'class')
        lastp = findValue(ref_tags, 'span', 'pageLast', 'class')
        if (firstp is not None) and (lastp is not None):
            self.pages = firstp + '-' + lastp
        else:
            self.pages = None


        # Reference Meta Section:
        #------------------------------

        self.crossref = None
        self.pubmed = None
        self.pubmed_id = None
        self.doi = None
        self.citetimes = None
        self.cas = None
        self.abstract = None
        self.pdf_link = None
        self.ref_references = None

        # External links (e.g. PubMed, CrossRef, CAS) are kept in a ul tag.
        # Internal links (e.g. direct to abstract, references, etc.) are in a div.
        # Need to check for both.
        links = ref_tags.find('ul', {'class' : 'externalReferences'})
        if links is None:
            links = ref_tags.find('div', {'class' : 'internalReferences'})

        # Only proceed if either internal or external references were found
        if links is not None:
            links = links.find_all('li')

            # Check against all possible link options and save links.
            # href links are appended onto base URL ('http://onlinelibrary.wiley.com')
            #
            for link in links:
                label = link.text.lower()
                href = link.find('a', href=True)['href']
                href = urllib_quote(href)

                if 'crossref' in label:
                    # Grab everything starting with '10.' in the link
                    doi_index = href.find('10.')
                    if doi_index != -1:
                        self.doi = urllib_unquote(href[doi_index:])
                    else:
                        self.doi = None
                    # CrossRef link is in the form of _WY_URL/resolve/reference/XREF?id=10.#######
                    self.crossref = _WY_URL + urllib_unquote(href)
                elif 'pubmed' in label:
                    # The PubMed ID is the value of the trailing 'id=' parameter
                    unquoted = urllib_unquote(href)
                    self.pubmed_id = unquoted[unquoted.rfind('id=') + 3:]
                    self.pubmed = _WY_URL + unquoted
                elif 'web ' in label:
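                    # e.g. (hypothetical label) 'web of science times cited: 15' -> '15'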
                    self.citetimes = re.search('[^: ]+$',label).group(0)
                elif label in ('cas', 'cas,'):
                    self.cas = _WY_URL + urllib_unquote(href)
                elif 'abstract' in label:
                    self.abstract = _WY_URL + urllib_unquote(href)
                elif 'pdf' in label:
                    self.pdf_link = _WY_URL + urllib_unquote(href)
                elif 'references' in label:
                    self.ref_references = _WY_URL + urllib_unquote(href)
Example 4
    def __init__(self, soup, verbose=False):

        # Get entry content information
        mainContent = soup.find('div', {'id' : 'mainContent'})
        if mainContent is None:
            raise ParseException('Unable to find main content of page')

        # Check for 'Page Not Found'
        error404 = mainContent.find('div', {'id' : 'error'})
        if error404 is not None:
            raise ParseException('Article was not found.')

        # Metadata:
        #---------------
        self.title = findValue(mainContent, 'span', 'mainTitle', 'class')
        if self.title is not None:
            self.title = self.title.title()

        self.publication = findValue(mainContent, 'h2', 'productTitle', 'id')

        # For old journal issues, two dates are given: original publication date and
        # online publication date. This returns the original journal pub date.
        self.date = findValue(mainContent, 'span', 'issueDate', 'id')
        self.year = self.date[-4:]

        vol = findValue(mainContent, 'span', 'volumeNumber', 'id')
        vol = vol.lower().replace('volume ', '')
        issue = findValue(mainContent, 'span', 'issueNumber', 'id')
        issue = issue.lower().replace('issue ', '')
        self.volume = vol
        self.issue = issue

        self.pages = findValue(mainContent, 'span', 'issuePages', 'id')
        self.pages = self.pages[self.pages.find(':') + 1:].strip()  # strip the leading 'Pages:' label


        # Keywords and Abstract:
        #----------
        productContent = soup.find('div', {'id' : 'productContent'})
        keybox = productContent.find('div', {'class' : 'keywordLists'})
        if keybox is None:
            self.keywords = None
        else:
            wordlist = keybox.find_all('li')
            self.keywords = [w.text for w in wordlist]

        abstract_section = productContent.find('div', {'id' : 'abstract'})
        if abstract_section is not None:
            self.abstract = abstract_section.text
        else:
            self.abstract = None


        # DOI Retrieval:
        #---------------
        # This might be more reliable than assuming we have the DOI in the title
        self.doi = findValue(mainContent, 'p', 'doi', 'id')
        self.doi = self.doi[5:] # to get rid of 'DOI: ' at the beginning


        # Authors:
        #---------
        # Find list items within the ordered list with id 'authors'
        authorList = mainContent.find('ol', {'id':'authors'}).find_all('li')
        self.authors = [WileyAuthor(x) for x in authorList]

        # Find all list items with the 'affiliation' class
        # The content is kept in a <p> tag within each list item
        aff_tags = mainContent.find_all('li', {'class' : 'affiliation'})
        self.affiliations = [a.find('p').text for a in aff_tags]

        # Clean up strings - Not sure if necessary
        for a in range(len(self.affiliations)):
            self.affiliations[a] = self.affiliations[a].replace(', and ', '')
            self.affiliations[a] = self.affiliations[a].replace('            ', '')

        corr = mainContent.find('p', {'id' : 'correspondence'})
        if corr is not None:
            email = findValue(corr, 'a', 'Link to email address', 'title')
        else:
            email = ''

        # Assign affiliations to authors
        for author in self.authors:
            author.populate_affiliations(self.affiliations)
            if author.contact == 1:
                author.email = email
Example 5
    def __init__(self, ref_tags, ref_id):

        """

        Parameters:
        -----------
        ref_tags: bs4.element.Tag
            Html tags as soup of the reference. Information provided is that
            needed in order to form a citation for the given reference.
        ref_id: int
            The id of the reference as ordered in the citing entry. A value
            of 1 indicates that this object is the first reference in the bibliography.


        """
        super().__init__()

        # Reference Bibliography Section:
        #--------------------------------
        self.ref_id = ref_id + 1 # Input is 0 indexed
        self.title = findValue(ref_tags, 'span', 'title', 'class')
        authorlist = ref_tags.find_all('span', {'class' : 'author'})
        self.authors = [x.text for x in authorlist]

        # Note: each reference author is given a separate <span> tag with the
        # class 'author', so individual authors can be extracted.

        self.publication = findValue(ref_tags, 'span', 'source-title', 'class')
        self.volume = findValue(ref_tags, 'span', 'volume', 'class')
        self.date = findValue(ref_tags, 'span', 'year', 'class')

        firstp = findValue(ref_tags, 'span', 'start-page', 'class')
        lastp = findValue(ref_tags, 'span', 'end-page', 'class')
        if (firstp is not None) and (lastp is not None):
            self.pages = firstp + '-' + lastp
        else:
            self.pages = None


        # Reference Meta Section:
        #------------------------------

        self.crossref = None
        self.pubmed = None
        self.doi = None
        self.cas = None
        self.isi = None
        self.ads = None


        # All links are kept in a ul tag with the class 'cleared'
        links = ref_tags.find('ul', {'class' : 'cleared'})

        # Only proceed if links are found
        if links is not None:
            links = links.find_all('li')

            # Check against all possible link options and save links.
            #
            for link in links:
                label = link.text.lower()
                href = link.find('a', href=True)['href']

                if 'article' in label:
                    # Grab everything starting with '10.' in the link
                    doi_index = href.find('10.')
                    self.doi = href[doi_index:] if doi_index != -1 else None
                    # Called the 'Article' link, but the url is of the form
                    # http://dx.doi.org/10.###### and redirects to the article page
                    self.crossref = href
                elif 'pubmed' in label:
                    self.pubmed = href
                elif 'cas' in label:
                    self.cas = href
                elif 'isi' in label:
                    self.isi = href
                elif 'ads' in label:
                    self.ads = href
Example 6
    def __init__(self, soup, verbose=False):
        super().__init__()

        # Get entry content information
        content = soup.find('div', {'id' : 'content'})
        mainContent = content.find('header')
        if mainContent is None:
            raise ParseException('Unable to find main content of page')


        # Metadata:
        # ---------
        self.title = findValue(mainContent, 'h1', 'article-heading', 'class')

        # Isolate the entry citation line
        citation = mainContent.find('dl', {'class' : 'citation'})

        self.publication = findValue(citation, 'dd', 'journal-title', 'class')

        # For old journal issues, two dates are given: original publication date and
        # online publication date. This returns the original journal pub date.
        self.date = citation.find('time').text[1:-1] # Get rid of parentheses surrounding it
        self.year = self.date[-4:]

        # Get rid of commas and whitespace from volume
        vol = findValue(citation, 'dd', 'volume', 'class').replace(',', '')
        self.volume = vol.replace('\n', '')

        self.pages = findValue(citation, 'dd', 'page', 'class')

        # Nature pages for some reason don't list keywords...
        self.keywords = None

        # DOI Retrieval:
        # --------------
        # This might be more reliable than assuming we have the DOI in the title
        self.doi = findValue(citation, 'dd', 'doi', 'class')
        self.doi = self.doi[4:] # to get rid of 'DOI:' at the beginning

        # Abstract:
        # ---------
        self.abstract = ''
        abstract_section = soup.find('div', {'id' : 'abstract'})
        if abstract_section is not None:
            abstract_content = abstract_section.find('p')
            if abstract_content is not None:
                self.abstract = self.abstract + abstract_content.text

        # Authors:
        # --------
        # Find list items within the unordered list with class 'authors'
        authorList = mainContent.find('ul', {'class':'authors'}).find_all('li')
        self.authors = [NatureAuthor(x) for x in authorList]

        # Get list of affiliations from bottom of page
        author_info = soup.find('div', {'id' : 'author-information'})
        aff_section = author_info.find('ol', {'class' : 'affiliations'})
        aff_tags = aff_section.find_all('li', recursive=False)
        self.affiliations = [a.find('h3').text for a in aff_tags]

        corr = author_info.find('a', {'class' : 'contact'})
        if corr is not None:
            corr_name = corr.text
            email = _NT_URL + corr['href']
        else:
            corr_name = None
            email = ''


        # Assign affiliations to authors
        for author in self.authors:
            author.populate_affiliations(self.affiliations)
            if author.name == corr_name:
                author.email = email
Example 7
    def __init__(self, ref_tags, ref_id):

        """

        Parameters:
        -----------
        ref_tags: bs4.element.Tag
            Html tags as soup of the reference. Information provided is that
            needed in order to form a citation for the given reference.
        ref_id: int
            The id of the reference as ordered in the citing entry. A value
            of 1 indicates that this object is the first reference in the bibliography.


        """
        super().__init__()
        self.ref_tags = ref_tags

        # Reference Bibliography Section:
        #--------------------------------
        self.ref_id = ref_id + 1 # Input is 0 indexed

        self.volume = None
        self.pages = None

        all_text = ref_tags.find_all(text=True)
        self.citation = all_text[1]

        # 'all_text' is a list of the text segments within each citation.
        # If it is a short list, it means that the citation is likely a book,
        # and doesn't include page numbers, PMID, DOI, etc.
        if len(all_text) > 5:
            metadata = all_text[3]
            metadata = metadata[2:]  # Get rid of leading '; '
            divider = metadata.find(':')  # This divides volume number from page range
            self.volume = metadata[0:divider]
            self.pages = metadata[divider+1:metadata.find(';')]
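            # e.g. (hypothetical segment) all_text[3] == '; 23:145-160;' would
            # yield volume '23' and pages '145-160'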

        self.date = findValue(ref_tags, 'span')


        # Reference Link Section:
        #------------------------------

        self.crossref = None
        self.pubmed = None
        self.pubmed_id = None
        self.doi = None
        self.web_of_science = None

        # External links (i.e. PubMed, CrossRef) are kept in <a> tags,
        # while the IDs are conveniently kept in <pub-id> tags
        links = ref_tags.find_all('a', href=True)
        ids = ref_tags.find_all('pub-id')

        for link in ids:
            id_type = link['pub-id-type']
            if id_type == 'pmid':
                self.pubmed_id = link.text
            elif id_type == 'doi':
                self.doi = link.text

        if links:  # find_all returns a (possibly empty) list, never None
            for link in links:
                href = link['href'][1:]  # Get rid of leading '/'
                text = link.text.lower()

                if 'crossref' in text:
                    self.crossref = _TF_URL + href
                elif 'pubmed' in text:
                    self.pubmed = _TF_URL + href
                elif 'science' in text:
                    self.web_of_science = _TF_URL + href
Example 8
    def __init__(self, soup, verbose=False):
        super().__init__()

        # Get entry content information
        mainContent = soup.find('div', {'id': 'journal_content'})
        if mainContent is None:
            mainContent = soup.find('div', {'id': 'pb-page-content'})
        if mainContent is None:
            raise ParseException('Unable to find main content of page')

        # Metadata:
        # ---------
        titlebox = mainContent.find('div', {'class': 'description'})
        if titlebox is not None:
            self.title = titlebox.find('h1').text.title()
        else:
            self.title = None

        # This box contains the publication name as well as Volume and Issue
        pubbox = mainContent.find('div', {'class': 'borderedmodule'})
        pubbox = pubbox.find('td')
        self.publication = findValue(pubbox, 'h2')
        if self.publication is not None:
            self.publication = self.publication.strip()

        # Parsing out the integer values of the volume and issue
        vol_issue = pubbox.find('h3')
        if vol_issue is None:
            raise ParseException('Unable to find volume and issue data')
        else:
            vol_issue = vol_issue.text
            issue_index = vol_issue.find('Issue')

            # If an issue number is listed, extract it
            if issue_index != -1:
                vol_text = vol_issue[0:issue_index]
                all_issue_text = vol_issue[issue_index:]
                issue_text = all_issue_text[0:all_issue_text.find(',')]
                issue_num_text = [x for x in issue_text if x.isdigit()]
                self.issue = ''.join(issue_num_text)
            else:
                vol_text = vol_issue
                self.issue = None

            vol_num_text = [x for x in vol_text if x.isdigit()]
            self.volume = ''.join(vol_num_text)
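            # e.g. (hypothetical text) 'Volume 45, Issue 3' would yield
            # volume '45' and issue '3'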



        # Two dates are given: original publication date and
        # online publication date. This returns the original journal pub date.
        datebox = mainContent.find('div', {'class' : 'articleDates'})
        if datebox is None:
            raise ParseException('Unable to find publishing dates')
        alldates = datebox.find_all('li')
        full_date_text = alldates[-1].text
        date_index = full_date_text.find('Published online: ')
        if date_index > -1:
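            # 18 == len('Published online: '); keep everything after the label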
            date = full_date_text[(date_index + 18):]
        else:
            date = ''

        self.date = date
        self.year = self.date[-4:]

        # Keywords
        # TaylorFrancis keeps keywords below the abstract, separate from header info
        abstract_section = mainContent.find('div', {'class' : 'abstract'})
        if abstract_section is None:
            raise ParseException('Unable to find abstract section')
        keybox = abstract_section.find('ul', {'class' : 'keywords'})
        if keybox is None:
            raise ParseException('Unable to find keywords')
        wordlist = keybox.find_all('li')
        self.keywords = [w.text.split(',')[0] for w in wordlist]  # drop the trailing comma, if any


        metabox = mainContent.find('div', {'class' : 'doiMeta'})

        self.pages = findValue(mainContent, 'div', label_type='class', label_name='pageRange')


        # DOI Retrieval:
        # --------------
        # This might be more reliable than assuming we have the DOI in the title
        self.doi = findValue(metabox, 'dd')
        # Slice from '10.' onward to drop any leading label text
        doi_startindex = self.doi.find('10.')
        if doi_startindex != -1:
            self.doi = self.doi[doi_startindex:]


        # Authors:
        # --------
        # Authors are kept in span tags with the class 'hlFld-ContribAuthor'
        # inside the 'doiMeta' box found above.
        authorList = metabox.find_all('span', {'class' : 'hlFld-ContribAuthor'})
        self.authors = [TaylorFrancisAuthor(x) for x in authorList]

        # Find the list of affiliations from the tabbed module at the bottom of the page
        tabModule = mainContent.find('div', {'id' : 'tabModule'})
        aff_list = tabModule.find('ul', {'class' : 'affiliations'})
        affs = aff_list.find_all('li')
        affiliations = []
        for aff in affs:
            affiliations.append(aff.text[1:])  # Get rid of the leading superscript letter

        # Assign affiliations to authors
        for author in self.authors:
            author.populate_affiliations(affiliations)
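For context, a minimal usage sketch. All names here are assumptions: the snippets above only show __init__ methods, so the owning class name (TaylorFrancisEntry), the findValue helper, and ParseException are presumed to be defined elsewhere in the package, and the URL is a placeholder:

import requests
from bs4 import BeautifulSoup

# Hypothetical driver: fetch an article page and hand its soup to one of the
# entry parsers above.
url = 'https://www.tandfonline.com/doi/full/10.1080/XXXXXXX'  # placeholder
html = requests.get(url).text
soup = BeautifulSoup(html, 'html.parser')
entry = TaylorFrancisEntry(soup)  # assumed class name for Example 8's __init__
print(entry.title, entry.doi, entry.volume)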