def parseBouquets(self, xmlnode):
    #print "parsing Bouquets", xmlnode
    bouquets = []
    for bouquet in xmlnode.getElementsByTagName('e2bouquet'):
        bref = urllib_unquote(bouquet.getElementsByTagName('e2bouquetreference')[0].childNodes[0].data)
        bname = urllib_unquote(bouquet.getElementsByTagName('e2bouquetname')[0].childNodes[0].data)
        #print "Bouquet",bref,bname
        bouquets.append({'bname': bname, 'bref': bref, 'services': self.parseServices(bouquet)})
    return bouquets

def parseServices(self, xmlnode):
    #print "parsing Services", xmlnode
    services = []
    for service in xmlnode.getElementsByTagName('e2servicelist')[0].getElementsByTagName('e2service'):
        sref = urllib_unquote(service.getElementsByTagName('e2servicereference')[0].childNodes[0].data)
        sname = urllib_unquote(service.getElementsByTagName('e2servicename')[0].childNodes[0].data)
        sname = sname.replace(self.undefinded_tag, "<n/a>").replace(self.undefinded_and, "&")
        #print sref,sname
        services.append({'sref': sref, 'sname': sname})
    return services

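
# --- Hypothetical usage sketch (not from the original source) ----------------
# The XML shape below is inferred purely from the tag names the two parsers
# look up; a real Enigma2 web-interface response carries more fields, the
# wrapper tag name and the reference strings here are made-up placeholders,
# and urllib_unquote is assumed to behave like urllib.parse.unquote.
from xml.dom.minidom import parseString

_sample_doc = parseString(
    "<e2servicelistrecursive>"  # wrapper tag name is an assumption
    "<e2bouquet>"
    "<e2bouquetreference>1:7:1:0:0:0:0:0:0:0:</e2bouquetreference>"
    "<e2bouquetname>Favourites%20(TV)</e2bouquetname>"
    "<e2servicelist>"
    "<e2service>"
    "<e2servicereference>1:0:1:445D:453:1:C00000:0:0:0:</e2servicereference>"
    "<e2servicename>Example%20Channel</e2servicename>"
    "</e2service>"
    "</e2servicelist>"
    "</e2bouquet>"
    "</e2servicelistrecursive>"
)
# self.parseBouquets(_sample_doc) would then return roughly:
# [{'bname': 'Favourites (TV)', 'bref': '1:7:1:0:0:0:0:0:0:0:',
#   'services': [{'sref': '1:0:1:445D:453:1:C00000:0:0:0:',
#                 'sname': 'Example Channel'}]}]
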
def _unquote_path(path):
    # MK1996 says, 'If a %xx encoded octet is encountered it is unencoded
    # prior to comparison, unless it is the "/" character, which has
    # special meaning in a path.'
    # Temporarily swap any encoded "/" for a newline so it survives unquoting,
    # then restore it afterwards.
    path = re.sub("%2[fF]", "\n", path)
    path = urllib_unquote(path)
    return path.replace("\n", "%2F")

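
# Quick illustration (not from the original source), assuming urllib_unquote
# behaves like urllib.parse.unquote: ordinary escapes are decoded, while an
# encoded slash survives (normalised to uppercase %2F).
assert _unquote_path("docs%2Fguide/intro%20page") == "docs%2Fguide/intro page"
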
def _update_counts(s, eids, resolve_url):
    """
    Helper for get_references()

    Parameters
    ----------
    s : Session
        The Requests Session
    eids : list
        List of eids but with a particular format e.g. ... TODO
    resolve_url : string
        This is a hardcoded value, eventually we'll pull this from the class

    Returns
    -------
    """
    payload = {'_updateCitedBy': ''.join(eids)}
    # r = s.get(resolve_url, params=payload)
    r = _selenium_connect(resolve_url)
    # TODO: Check for 200
    data = urllib_unquote(r)

    # myXabsCounts['citedBy_26']='Citing Articles (41)';
    cited_by_results = re.findall(r"myXabsCounts\['citedBy_(\d+)'\]='[^\(]+\((\d+)", data)
    # TODO: parse response
    # ????? Why is the order scrambled - this seems to be on their end ...????
    '''
    NOTE: This is now Citing Articles, references to Scopus have been dropped
    myXabsCounts['citedBy_16']='Cited By in Scopus (128)';
    myXabsCounts['citedBy_15']='Cited By in Scopus (25)';
    myXabsCounts['citedBy_1']='Cited By in Scopus (2)';
    myXabsCounts['citedBy_3']='Cited By in Scopus (29)';
    '''
    # TODO: go through refs and apply new values ...
    return cited_by_results

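
# Worked example (not from the original source) of what the regex above pulls
# out of the resolver response, using the sample line quoted in the comments:
import re

_sample = "myXabsCounts['citedBy_26']='Citing Articles (41)';"
_counts = re.findall(r"myXabsCounts\['citedBy_(\d+)'\]='[^\(]+\((\d+)", _sample)
# _counts -> [('26', '41')]  i.e. (1-based reference index, citing-article count)
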
def get_references(input, verbose=False):
    """
    This function gets references for a ScienceDirect URL that is of the form:

        http://www.sciencedirect.com/science/article/pii/################

    e.g. http://www.sciencedirect.com/science/article/pii/0006899387903726

    Implementation Notes:
    ---------------------
    From what I can tell this information is not exposed via the Elsevier API.

    In order to minimize complexity, the mobile site is requested via a cookie.

    Code Layout and Algorithm Notes:
    --------------------------------

    """
    # TODO: Make this a class reference parser

    # *** These tags are mobile-site specific

    # When we don't have proper access rights, this is present in the html
    GUEST_TAG_TUPLE = ("li", {"id": "menuGuest"})

    # Entries are "li" tags with classes of the form:
    #   article-reference-article
    #   article-reference-other-ref
    REFERENCE_TAG_TUPLE = ("li", {"class": re.compile('article-reference-*')})

    # This is the URL to the page that contains the document info, including
    # reference material
    BASE_URL = _SD_URL + '/science/article/pii/'

    # This URL was found first via Fiddler, then via closer inspection of the script
    # 'article_catalyst.js' under sciencedirect.com/mobile/js in the function
    # resolveReferences
    REF_RESOLVER_URL = _SD_URL + '/science/referenceResolution/ajaxRefResol'

    # Return the BeautifulSoup result, the requests session, and the requests response
    if _is_url(input):
        pii = _extract_pii(input)
    else:
        pii = input

    sess = requests.Session()

    if verbose:
        print('Requesting main page for pii: %s' % pii)
    resp = sess.get(BASE_URL + pii, cookies={'Site': 'Mobile'})

    # Step 2 - Get the reference tags
    soup = BeautifulSoup(resp.text)

    reference_section = soup.find("ol", {"class": "article-references"})

    if reference_section is None:
        # Then we might be a guest. In other words, we might not have sufficient
        # privileges to access the data we want. Generally this is protected via
        # IP mask. When I'm working from home I need to VPN into work so
        # that I can access the data :/
        print("reference_section is None")
        temp = soup.find(*GUEST_TAG_TUPLE)
        if temp is None:
            # We might have no references ... (Doubtful)
            raise ParseException("References were not found ..., code error likely")
        else:
            raise InsufficientCredentialsException(
                "Insufficient access rights to get references, requires certain IP addresses (e.g. university based IP)")

    ref_tags = reference_section.find_all(*REFERENCE_TAG_TUPLE)

    n_refs = len(ref_tags)

    if n_refs == 0:
        return None

    # Step 3 - Resolve reference links
    # --------------------------------------------------------------------------
    # The returned html code contains javascript which returns more information
    # about each reference, such as:
    #
    #   - links to the full text
    #   - DOI

    # Step 3.1 - Make the request for the information
    # --------------------------------------------------------------------------
    # We need the eid of the current entry, it is of the form:
    #
    #   SDM.pm.eid = "1-s2.0-0006899387903726"
    #
    # * I think this entry gets deleted after the requests so it may not be
    #   visible if looking for it in Chrome.
    match = re.search(r'SDM\.pm\.eid\s*=\s*"([^"]+)"', resp.text)
    #eid = match.group(1)

    # This list comes from the resolveReferences function in article_catalyst.js
    payload = {
        '_pii': pii,
        '_refCnt': n_refs,
        '_docType': 'article',  # yikes, this might change ...
        '_refRangeStart': '1',
        '_refRangeCount': str(n_refs)}  # This is normally in sets of 20's ...

    # I'm not sure if it is important to limit this. The browser then
    # makes a request from 1 count 20, 21 count 20, 41 count 20 etc.
    # It always goes by 20 even if there aren't 20 left.

    if verbose:
        print('Requesting reference links')
    r2 = sess.get(REF_RESOLVER_URL, params=payload)

    # Step 3.2 - Parse the returned information into single entries
    # --------------------------------------------------------------------------
    # This could probably be optimized in terms of execution time. We basically
    # get back a single script tag. Inside is some sort of hash map for links
    # for each reference.
    #
    # The script tag is of the form:
    #   myMap['bibsbref11']['refHtml']= "<some html stuffs>";
    #   myMap['bibsbref11']['absUrl']= "http://www.sciencedirect.com/science/absref/sd/0018506X7790068X";
    #   etc.
    #
    #   - Each entry is quite long.
    #   - Normally contains html
    #   - can be empty i.e. myMap['bibsbref11']['refHtml'] = "";
    #   - the refHtml is quite interesting
    #   - the absolute url is not always present (and currently not parsed)
    more_soup = BeautifulSoup(r2.text)
    script_tag = more_soup.find('script')

    # We unquote the script text as it is transmitted with characters escaped
    # and we want the parsed data to contain the non-escaped text
    #
    # We might eventually want to move this to being after the regular expression ...
    script_text = urllib_unquote(script_tag.text)

    ref_match_result = re.findall(r"myMap\['bibsbref(\d+)'\]\['refHtml'\]=\s?" + '"([^"]*)";', script_text)
    # Tokens:
    # 0 - the # from bibsbref#
    # 1 - the html content from the 'refHtml' entry
    #
    # NOTE: We don't really use the #, so we might remove the () around
    # \d+ which would shift the index from 1 to 0

    if verbose:
        print('Creating reference objects')
    if len(ref_match_result) > 0:
        zipped = zip(ref_tags, ref_match_result, range(n_refs))
        ref_objects = [ScienceDirectRef(ref_tag, ref_id, ref_link_info[1])
                       for ref_tag, ref_link_info, ref_id in zipped]
    else:
        zipped = zip(ref_tags, range(n_refs))
        ref_objects = [ScienceDirectRef(ref_tag, ref_id) for ref_tag, ref_id in zipped]

    # Step 4:
    # --------------------------------------------------------------------------
    # TODO: Improve documentation for this step
    if verbose:
        print('Retrieving Scopus Counts')

    ref_scopus_eids = []  # The Scopus IDs of the references to resolve,
    # but with a particular formatting ...
    ref_count = 0  # Number of references we haven't resolved

    ref_count_list = []
    # NOTE: Browser requests these in the reverse order ...
    for ref_id, ref in enumerate(ref_objects):
        if ref._data_sceid is not None:
            ref_scopus_eids.append(ref._data_sceid + ',' + str(ref_id + 1) + '~')
            ref_count += 1

            # If we've got enough, then update the counts
            # The 20 may be arbitrary but it was what was used in original JS
            if ref_count > 20:
                ref_count_list += _update_counts(sess, ref_scopus_eids, REF_RESOLVER_URL)
                ref_count = 0
                ref_scopus_eids = []

    # Get any remaining reference counts
    if ref_count != 0:
        ref_count_list += _update_counts(sess, ref_scopus_eids, REF_RESOLVER_URL)

    # Take the raw data and set the citation count for each object
    for ref_tuple in ref_count_list:
        ref_id = int(ref_tuple[0]) - 1
        ref_count = int(ref_tuple[1])
        ref_objects[ref_id].scopus_cite_count = ref_count

    # All done!
    # ---------
    return ref_objects

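
# Hypothetical usage (not from the original source), using the example URL from
# the docstring above; requires network access and sufficient access rights
# (e.g. a university IP), otherwise InsufficientCredentialsException is raised:
if __name__ == '__main__':
    refs = get_references(
        'http://www.sciencedirect.com/science/article/pii/0006899387903726',
        verbose=True)
    for ref in refs or []:
        print(ref.ref_id, ref.title, ref.scopus_cite_count)
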
def __init__(self, ref_tags, ref_id, ref_link_info=None):
    """
    Parameters:
    -----------
    ref_tags: bs4.element.Tag
        Html tags as soup of the reference. Information provided is that
        needed in order to form a citation for the given reference.
    ref_link_info: str
        Html, not yet souped. Contains extra information such as links to
        a pdf (if known) and other goodies
    ref_id: int
        The id of the reference as ordered in the citing entry. A value of 1
        indicates that this object is the first reference in the bibliography.
    """
    super().__init__()

    # Reference Bibliography Section:
    # -------------------------------
    # Example str: <span class="r_volume">Volume 47</span>
    self.ref_id = ref_id + 1  # Input is 0 based
    self.title = findValue(ref_tags, 'li', 'reference-title', 'class')

    all_authors = ref_tags.find_all('span', {'class': 'reference-author'})
    self.authors = [x.text for x in all_authors]
    #self.authors = findValue(ref_tags, 'li', 'reference-author', 'class')
    # NOTE: We can also get individual authors if we would like.
    #
    # Search would be on:
    #   <span class="reference-author">
    # instead of on the list.

    # Unfortunately r_publication is found both for the title and for
    # the publication. Some custom code is needed to first go into a r_series
    # span and then to the publication
    self.publication = None
    r_source_tag = ref_tags.find('span', {'class': 'r_series'})
    if r_source_tag is not None:
        pub_tag = r_source_tag.find('span', {'class': 'r_publication'})
        if pub_tag is not None:
            self.publication = pub_tag.text.replace('\\xa0', ' ')

    temp_volume = findValue(ref_tags, 'span', 'r_volume', 'class')
    if temp_volume is None:
        self.volume = None
    else:
        self.volume = temp_volume.replace('Volume ', '')

    self.issue = findValue(ref_tags, 'span', 'r_issue', 'class')
    self.series = findValue(ref_tags, 'span', 'r_series', 'class')
    self.date = findValue(ref_tags, 'span', 'r_pubdate', 'class')

    temp_pages = findValue(ref_tags, 'span', 'r_pages', 'class')
    if temp_pages is None:
        self.pages = None
    else:
        # TODO: is the unicode working properly ??? 576–577 and ideally 576-577
        self.pages = temp_pages.replace('pp. ', '')

    # Reference Meta Section:
    # -----------------------
    self.scopus_link = None
    self.doi = None
    self._data_sceid = None
    self.pii = None
    self.pdf_link = None
    self.scopus_cite_count = None
    self.aps_full_text = None

    if ref_link_info is not None:
        link_soup = BeautifulSoup(ref_link_info)

        # Each section is contained in a div tag with the class boxLink, although
        # some classes have more text in the class attribute, thus the *)
        #box_links = link_soup.find_all('div', {'class': re.compile('boxLink*')})
        box_links = link_soup.find_all('div', {'class': 'boxLink'})

        # This code is a bit hard to read but each 'if statement' shows what
        # is needed in order to resolve the item.
        for box_link in box_links:
            div_class_values = box_link.attrs['class']
            link_tag = box_link.find('a')
            if 'SC_record' in div_class_values:
                # "View Record in Scopus"
                # They changed to returning a full link
                # I should really use a library to resolve based on both
                # although the input should be the current page, not the base
                # self.scopus_link = _SD_URL + link_tag.attrs['href']
                self.scopus_link = link_tag.attrs['href']
            elif 'class' in link_tag.attrs and 'S_C_pdfLink' in link_tag.attrs['class']:
                # Link to PDF
                self.pdf_link = _SD_URL + link_tag.attrs['href']
            elif 'class' in link_tag.attrs and 'cLink' in link_tag.attrs['class']:
                # Article Link
                temp = link_tag.attrs['href']
                match = re.search('/pii/(.*)', temp)
                self.pii = match.group(1)
                self.doi = self.doi_from_crossref(self.pii)
            elif 'CrossRef' in box_link.text:
                # CrossRef link provides DOI as href
                # In old code it was a query parameter but this
                # has now moved to a "data-url" attribute
                temp = link_tag.attrs['href']
                # http://dx.doi.org/10.1037%2Fh0075243
                match = re.search(r'dx\.doi\.org/(.*)', temp)
                # Unquote removes %xx escape characters
                self.doi = urllib_unquote(match.group(1))
            elif "Purchase" in box_link.text:
                # New link added to Purchase pdf. It was throwing errors
                pass
            elif 'aps full text' in box_link.text.lower():
                self.aps_full_text = link_tag.attrs['href']
            else:
                span_tag = link_tag.find('span')
                if 'citedBy_' in span_tag.attrs['class']:
                    # Cited By Scopus Count
                    #
                    # NOTE: Apparently the citedByScopus doesn't get added
                    # until later so we need to look for the scan tag. Let's
                    # do this only if all else fails.
                    self._data_sceid = span_tag.attrs['data-sceid']
                else:
                    raise Exception('Failed to match link')

    # Finally, update if it is not an article
    tag_class = ref_tags.get('class')[0]
    if tag_class == 'article-reference-other-ref':
        publication = ref_tags.find('em')
        if publication is not None:
            self.publication = publication.text
        self.title = ref_tags.text

def __init__(self, ref_tags, ref_id):
    """
    Parameters:
    -----------
    ref_tags: bs4.element.Tag
        Html tags as soup of the reference. Information provided is that
        needed in order to form a citation for the given reference.
    ref_id: int
        The id of the reference as ordered in the citing entry. A value of 1
        indicates that this object is the first reference in the bibliography.
    """
    # Reference Bibliography Section:
    # -------------------------------
    self.ref_id = ref_id + 1  # Input is 0 indexed
    self.title = findValue(ref_tags, 'span', 'articleTitle', 'class')
    authorlist = ref_tags.find_all('span', 'author', 'class')
    self.authors = [x.text for x in authorlist]

    # Note: we can also get individual authors if we would like.
    #
    # On Wiley, each reference author is given a separate <span> tag with the
    # class 'author', so individual authors can be extracted
    #
    self.publication = findValue(ref_tags, 'span', 'journalTitle', 'class')
    self.volume = findValue(ref_tags, 'span', 'vol', 'class')
    self.date = findValue(ref_tags, 'span', 'pubYear', 'class')

    firstp = findValue(ref_tags, 'span', 'pageFirst', 'class')
    lastp = findValue(ref_tags, 'span', 'pageLast', 'class')
    if (firstp is not None) and (lastp is not None):
        self.pages = firstp + '-' + lastp
    else:
        self.pages = None

    # Reference Meta Section:
    # -----------------------
    self.crossref = None
    self.pubmed = None
    self.pubmed_id = None
    self.doi = None
    self.citetimes = None
    self.cas = None
    self.abstract = None
    self.pdf_link = None
    self.ref_references = None

    # External links (i.e. PubMed, CrossRef, CAS) are kept in a ul tag
    # Internal links (i.e. direct to abstract, references, etc.) are in a div
    # Need to check for both
    links = ref_tags.find('ul', 'externalReferences', 'class')
    if links is None:
        links = ref_tags.find('div', 'internalReferences', 'class')

    # Only proceed if either internal or external references were found
    if links is not None:
        links = links.find_all('li')

        # Check against all possible link options and save links.
        # href links are appended onto base URL ('http://onlinelibrary.wiley.com')
        #
        for link in links:
            label = link.text.lower()
            href = link.find('a', href=True)['href']
            href = urllib_quote(href)

            if 'crossref' in label:
                # Grab everything starting with '10.' in the link
                doi_start = href.find('10.')
                if doi_start == -1:
                    self.doi = None
                else:
                    self.doi = urllib_unquote(href[doi_start:])
                # CrossRef link is in the form of _WY_URL/resolve/reference/XREF?id=10.#######
                self.crossref = _WY_URL + urllib_unquote(href)
            elif 'pubmed' in label:
                self.pubmed_id = re.search('[^id=]+$', href).group(0)[1:]  # the [1:] is to get rid of leading '='
                self.pubmed_id = urllib_unquote(self.pubmed_id)
                self.pubmed = _WY_URL + urllib_unquote(href)
            elif 'web ' in label:
                self.citetimes = re.search('[^: ]+$', label).group(0)
            elif label in ('cas', 'cas,'):
                self.cas = _WY_URL + urllib_unquote(href)
            elif 'abstract' in label:
                self.abstract = _WY_URL + urllib_unquote(href)
            elif 'pdf' in label:
                self.pdf_link = _WY_URL + urllib_unquote(href)
            elif 'references' in label:
                self.ref_references = _WY_URL + urllib_unquote(href)

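
# Minimal sketch (not from the original source) of the CrossRef branch above,
# with a hypothetical href; real hrefs come from onlinelibrary.wiley.com and
# may differ in detail. Assumes urllib_unquote behaves like urllib.parse.unquote.
_href = '/resolve/reference/XREF?id=10.1037%2Fh0075243'
_doi = urllib_unquote(_href[_href.find('10.'):])
# _doi -> '10.1037/h0075243'
# _WY_URL + urllib_unquote(_href) would then give the full CrossRef resolver link.
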
def unquote(uri):
    """Specialized unquote that uses UTF-8 for parsing."""
    uri = uri.encode("ascii")
    unquoted = urllib_unquote(uri)
    return unquoted.decode("utf-8")

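
# Example (not from the original source); assumes a Python 2 style
# urllib_unquote that accepts byte strings, which is what the ascii
# encode / utf-8 decode round-trip above implies:
#
#     unquote("Caf%C3%A9")  ->  u"Café"
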
def __init__(self, ref_tags, ref_id, ref_link_info=None):
    """
    Parameters:
    -----------
    ref_tags: bs4.element.Tag
        Html tags as soup of the reference. Information provided is that
        needed in order to form a citation for the given reference.
    ref_link_info: str
        Html, not yet souped. Contains extra information such as links to
        a pdf (if known) and other goodies
    ref_id: int
        The id of the reference as ordered in the citing entry. A value of 1
        indicates that this object is the first reference in the bibliography.
    """
    super().__init__()

    # Reference Bibliography Section:
    # -------------------------------
    ref = ref_tags.find('ul', {'class': 'reference'})
    if ref is None:
        import pdb
        pdb.set_trace()

    # Example str: <span class="r_volume">Volume 47</span>
    self.ref_id = ref_id + 1  # Input is 0 based
    self.title = findValue(ref, 'li', 'title', 'class')

    all_authors = ref_tags.find('li', {'class': 'author'})
    author_text = all_authors.text
    author_list = author_text.split(', ')
    self.authors = []
    for x in author_list:
        x = x.strip()
        self.authors.append(x)

    # Unfortunately r_publication is found both for the title and for
    # the publication. Some custom code is needed to first go into a r_series
    # span and then to the publication
    self.publication = ''
    self.volume = ''
    self.date = ''
    self.pages = ''

    source = ref.find('li', {'class': 'source'})
    source = source.text
    source_parts = source.split(', ')

    # Attempt to parse publication into sections
    found = 0
    for elt in source_parts:
        # This first part is looking for the date, which is written (xxxx).
        # It is also saved as volume because they are within the same element
        # and sometimes the volume is also in parentheses.
        if '(' in elt:
            found = 1
            self.volume = elt
            self.date = elt

        # Until the date/volume part is found, save everything before as publication.
        # This can either be simple like 'Nature' or more complex, with a description
        # and location, in which case, it would be multiple elements in source_parts.
        if found == 0:
            self.publication = self.publication + elt

        # Find and save the pages portion, usually notated either with 'p. ' or 'pp. '
        if 'p. ' in elt:
            self.pages = elt
            self.pages = self.pages.replace('p. ', '')
            self.pages = self.pages.replace('p', '')

    # This checks if the parsing didn't work. All of the information
    # should still be retained even if not parsed correctly.
    if self.date == '' and self.pages == '':
        self.publication = source

    # Reference Meta Section:
    # -----------------------
    self.scopus_link = None
    self.doi = None
    self._data_sceid = None
    self.pii = None
    self.pdf_link = None
    self.scopus_cite_count = None
    self.aps_full_text = None

    if ref_link_info is None:
        link_soup = ref.find('li', {'class': 'external'})
    else:
        link_soup = BeautifulSoup(ref_link_info)

    if link_soup is not None:
        # Each section is contained in a div tag with the class boxLink, although
        # some classes have more text in the class attribute, thus the *)
        #box_links = link_soup.find_all('div', {'class': re.compile('boxLink*')})
        box_links = link_soup.find_all('div', {'class': 'boxLink'})

        # This code is a bit hard to read but each 'if statement' shows what
        # is needed in order to resolve the item.
        for box_link in box_links:
            div_class_values = box_link.attrs['class']
            link_tag = box_link.find('a')
            if 'SC_record' in div_class_values:
                # "View Record in Scopus"
                # They changed to returning a full link
                # I should really use a library to resolve based on both
                # although the input should be the current page, not the base
                # self.scopus_link = _SD_URL + link_tag.attrs['href']
                self.scopus_link = link_tag.attrs['href']
            elif 'class' in link_tag.attrs and 'S_C_pdfLink' in link_tag.attrs['class']:
                # Link to PDF
                self.pdf_link = _SD_URL + link_tag.attrs['href']
            elif 'class' in link_tag.attrs and 'cLink' in link_tag.attrs['class']:
                # Article Link
                temp = link_tag.attrs['href']
                match = re.search('/pii/(.*)', temp)
                self.pii = match.group(1)
                self.doi = self.doi_from_crossref(self.pii)
            elif 'CrossRef' in box_link.text:
                # CrossRef link provides DOI as href
                # In old code it was a query parameter but this
                # has now moved to a "data-url" attribute
                temp = link_tag.attrs['href']
                # http://dx.doi.org/10.1037%2Fh0075243
                match = re.search(r'dx\.doi\.org/(.*)', temp)
                # Unquote removes %xx escape characters
                self.doi = urllib_unquote(match.group(1))
            elif "Purchase" in box_link.text:
                # New link added to Purchase pdf. It was throwing errors
                pass
            elif 'aps full text' in box_link.text.lower():
                self.aps_full_text = link_tag.attrs['href']
            else:
                span_tag = link_tag.find('span')
                if 'citedBy_' in span_tag.attrs['class']:
                    # Cited By Scopus Count
                    #
                    # NOTE: Apparently the citedByScopus doesn't get added
                    # until later so we need to look for the scan tag. Let's
                    # do this only if all else fails.
                    self._data_sceid = span_tag.attrs['data-sceid']
                else:
                    raise Exception('Failed to match link')

    # Finally, update if it is not an article
    ref_tag_ul = ref_tags.find('ul')
    tag_class = ref_tag_ul.get('class')
    if tag_class is not None:
        tag_class = tag_class[0]
        if tag_class == 'article-reference-other-ref':
            publication = ref_tags.find('em')
            if publication is not None:
                self.publication = publication.text
            self.title = ref_tags.text
