def parseBouquets(self, xmlnode):
		"""Collect every <e2bouquet> element below xmlnode.

		Returns a list with one dict per bouquet holding:
		  'bname'    - URL-decoded text of the first <e2bouquetname> element
		  'bref'     - URL-decoded text of the first <e2bouquetreference> element
		  'services' - whatever self.parseServices() returns for that bouquet

		An IndexError is raised when one of the two elements is missing or
		has no child node.
		"""
		bouquets = []
		for node in xmlnode.getElementsByTagName('e2bouquet'):
			ref_element = node.getElementsByTagName('e2bouquetreference')[0]
			reference = urllib_unquote(ref_element.childNodes[0].data)
			name_element = node.getElementsByTagName('e2bouquetname')[0]
			name = urllib_unquote(name_element.childNodes[0].data)
			entry = {'bname': name, 'bref': reference}
			entry['services'] = self.parseServices(node)
			bouquets.append(entry)
		return bouquets
 def parseBouquets(self, xmlnode):
     """Collect every <e2bouquet> element below xmlnode.

     Returns a list with one dict per bouquet holding:
       "bname"    - URL-decoded text of the first <e2bouquetname> element
       "bref"     - URL-decoded text of the first <e2bouquetreference> element
       "services" - whatever self.parseServices() returns for that bouquet

     An IndexError is raised when one of the two elements is missing or
     has no child node.
     """
     bouquets = []
     for node in xmlnode.getElementsByTagName("e2bouquet"):
         ref_element = node.getElementsByTagName("e2bouquetreference")[0]
         reference = urllib_unquote(ref_element.childNodes[0].data)
         name_element = node.getElementsByTagName("e2bouquetname")[0]
         name = urllib_unquote(name_element.childNodes[0].data)
         entry = {"bname": name, "bref": reference}
         entry["services"] = self.parseServices(node)
         bouquets.append(entry)
     return bouquets
# Example #3
	def parseBouquets(self, xmlnode):
		"""Collect every <e2bouquet> element below xmlnode.

		Returns a list with one dict per bouquet holding:
		  'bname'    - URL-decoded text of the first <e2bouquetname> element
		  'bref'     - URL-decoded text of the first <e2bouquetreference> element
		  'services' - whatever self.parseServices() returns for that bouquet

		An IndexError is raised when one of the two elements is missing or
		has no child node.
		"""
		bouquets = []
		for node in xmlnode.getElementsByTagName('e2bouquet'):
			ref_element = node.getElementsByTagName('e2bouquetreference')[0]
			reference = urllib_unquote(ref_element.childNodes[0].data)
			name_element = node.getElementsByTagName('e2bouquetname')[0]
			name = urllib_unquote(name_element.childNodes[0].data)
			entry = {'bname': name, 'bref': reference}
			entry['services'] = self.parseServices(node)
			bouquets.append(entry)
		return bouquets
	def parseServices(self, xmlnode):
		"""Collect the <e2service> entries of the first <e2servicelist> below xmlnode.

		Returns a list with one dict per service holding:
		  'sref'  - URL-decoded text of the first <e2servicereference> element
		  'sname' - URL-decoded text of the first <e2servicename> element, with
		            self.undefinded_tag replaced by "<n/a>" and
		            self.undefinded_and replaced by "&"

		An IndexError is raised when there is no <e2servicelist>, or when a
		service lacks one of the two elements or the element has no child node.
		"""
		container = xmlnode.getElementsByTagName('e2servicelist')[0]
		services = []
		for node in container.getElementsByTagName('e2service'):
			ref_element = node.getElementsByTagName('e2servicereference')[0]
			reference = urllib_unquote(ref_element.childNodes[0].data)
			name_element = node.getElementsByTagName('e2servicename')[0]
			name = urllib_unquote(name_element.childNodes[0].data)
			# Placeholders are substituted only after URL-decoding.
			name = name.replace(self.undefinded_tag, "<n/a>")
			name = name.replace(self.undefinded_and, "&")
			services.append({'sref': reference, 'sname': name})
		return services
# Example #5
	def parseServices(self, xmlnode):
		"""Collect the <e2service> entries of the first <e2servicelist> below xmlnode.

		Returns a list with one dict per service holding:
		  'sref'  - URL-decoded text of the first <e2servicereference> element
		  'sname' - URL-decoded text of the first <e2servicename> element, with
		            self.undefinded_tag replaced by "<n/a>" and
		            self.undefinded_and replaced by "&"

		An IndexError is raised when there is no <e2servicelist>, or when a
		service lacks one of the two elements or the element has no child node.
		"""
		container = xmlnode.getElementsByTagName('e2servicelist')[0]
		services = []
		for node in container.getElementsByTagName('e2service'):
			ref_element = node.getElementsByTagName('e2servicereference')[0]
			reference = urllib_unquote(ref_element.childNodes[0].data)
			name_element = node.getElementsByTagName('e2servicename')[0]
			name = urllib_unquote(name_element.childNodes[0].data)
			# Placeholders are substituted only after URL-decoding.
			name = name.replace(self.undefinded_tag, "<n/a>")
			name = name.replace(self.undefinded_and, "&")
			services.append({'sref': reference, 'sname': name})
		return services
def _unquote_path(path):
    # MK1996 says, 'If a %xx encoded octet is encountered it is unencoded 
    # prior to comparison, unless it is the "/" character, which has 
    # special meaning in a path.'
    path = re.sub("%2[fF]", "\n", path)
    path = urllib_unquote(path)
    return path.replace("\n", "%2F")
def _unquote_path(path):
    # MK1996 says, 'If a %xx encoded octet is encountered it is unencoded
    # prior to comparison, unless it is the "/" character, which has
    # special meaning in a path.'
    path = re.sub("%2[fF]", "\n", path)
    path = urllib_unquote(path)
    return path.replace("\n", "%2F")
def _update_counts(eids, resolve_url, s=None):
    """
    Helper for get_references()

    Parameters
    ----------
    eids : list
        List of eids but with a particular format
        e.g. ... TODO
    resolve_url : string
        This is a hardcoded value, eventually we'll pull this from the class
    s : Session
        The Requests Session. Currently unused because the request through
        the session is disabled in favour of `_selenium_connect`.

    Returns
    -------
    list of tuple
        One ``(reference number, count)`` pair of strings for every
        ``myXabsCounts['citedBy_<n>']='<label> (<count>`` entry found in the
        URL-decoded response.
    """
    # Only consumed by the disabled session request below; kept so that the
    # request can be re-enabled without rebuilding the parameters.
    payload = {'_updateCitedBy': ''.join(eids)}
    # r = s.get(resolve_url, params=payload)
    r = _selenium_connect(resolve_url)

    # TODO: Check for 200
    data = urllib_unquote(r)

    # myXabsCounts['citedBy_26']='Citing Articles (41)';
    cited_by_results = re.findall(r"myXabsCounts\['citedBy_(\d+)'\]='[^\(]+\((\d+)", data)

    # TODO: parse response
    # ????? Why is the order scrambled - this seems to be on their end ...????
    #
    # NOTE: This is now Citing Articles, references to Scopus have been dropped
    # myXabsCounts['citedBy_16']='Cited By in Scopus (128)';
    # myXabsCounts['citedBy_15']='Cited By in Scopus (25)';
    # myXabsCounts['citedBy_1']='Cited By in Scopus (2)';
    # myXabsCounts['citedBy_3']='Cited By in Scopus (29)';
    # TODO: go through refs and apply new values ...

    return cited_by_results
# Example #9
def get_references(input, verbose=False):
    """
    Get the references listed on a ScienceDirect article page.

    Parameters
    ----------
    input : str
        Either a URL (anything `_is_url` accepts; the PII is then taken from
        it with `_extract_pii`) or the PII itself.
    verbose : bool
        When True, progress messages are printed.

    Returns
    -------
    list of ScienceDirectRef or None
        One object per reference entry, or None when the reference section
        holds no reference entries.

    Raises
    ------
    ParseException
        The reference section is missing and the guest marker is absent.
    InsufficientCredentialsException
        The reference section is missing and the guest marker is present.

    Implementation Notes:
    From what I can tell this information is not exposed via the Elsevier API.
    In order to minimize complexity, the mobile site is requested via a cookie.
    """

    # TODO: Make this a class reference parser

    # *** These tags are mobile-site specific

    # When we don't have proper access rights, this is present in the html
    GUEST_TAG_TUPLE = ("li", {"id": "menuGuest"})

    # Entries are "li" tags with classes of the form:
    #   article-reference-article
    #   article-reference-other-ref
    REFERENCE_TAG_TUPLE = ("li", {"class": re.compile('article-reference-*')})

    # This is the URL to the page that contains the document info, including
    # reference material
    BASE_URL = _SD_URL + '/science/article/pii/'

    # This URL was found first via Fiddler, then via closer inspection of the script
    # 'article_catalyst.js' under in the function
    # resolveReferences
    REF_RESOLVER_URL = _SD_URL + '/science/referenceResolution/ajaxRefResol'

    # Step 1 - Request the main page
    # --------------------------------------------------------------------------
    if _is_url(input):
        pii = _extract_pii(input)
    else:
        pii = input

    sess = requests.Session()

    if verbose:
        print('Requesting main page for pii: %s' % pii)
    resp = sess.get(BASE_URL + pii, cookies={'Site': 'Mobile'})

    # Step 2 - Get the reference tags
    # --------------------------------------------------------------------------
    soup = BeautifulSoup(resp.text)

    reference_section = soup.find("ol", {"class": "article-references"})

    if reference_section is None:
        # Then we might be a guest. In other words, we might not have sufficient
        # privileges to access the data we want. Generally this is protected via
        # IP mask. When I'm working from home I need to VPN into work so
        # that I can access the data :/
        print("reference_section is None")
        temp = soup.find(*GUEST_TAG_TUPLE)
        if temp is None:
            # We might have no references ... (Doubtful)
            raise ParseException("References were not found ..., code error likely")
        else:
            raise InsufficientCredentialsException(
                "Insufficient access rights to get referencs, requires certain IP addresses (e.g. university based IP)")

    ref_tags = reference_section.find_all(*REFERENCE_TAG_TUPLE)

    n_refs = len(ref_tags)

    if n_refs == 0:
        return None

    # Step 3 - Resolve reference links
    # --------------------------------------------------------------------------
    # The returned html code contains javascript which returns more information
    # about each reference, such as:
    #   - links to the full text
    #   - DOI

    # Step 3.1 - Make the request for the information
    # --------------------------------------------------------------------------
    # We need the eid of the current entry, it is of the form:
    # = "1-s2.0-0006899387903726"
    #   * I think this entry gets deleted after the requests so it may not be
    #   visible  if looking for it in Chrome.
    # NOTE(review): the result is not used below (the eid extraction is
    # disabled); the call is restored as it appears to have been written.
    match = re.search(r'SDM\.pm\.eid\s*=\s*"([^"]+)"', resp.text)
    # eid = match.group(1)

    # This list comes from the resolveReferences function in article_catalyst.js
    payload = {
        '_pii': pii,
        '_refCnt': n_refs,
        '_docType': 'article',  # yikes, this might change ...
        '_refRangeStart': '1',
        '_refRangeCount': str(n_refs)}  # This is normally in sets of 20's ...
    # I'm not sure if it is important to limit this. The browser then
    # makes a request from 1 count 20, 21 count 20, 41 count 20 etc,
    # It always goes by 20 even if there aren't 20 left

    if verbose:
        print('Requesting reference links')
    r2 = sess.get(REF_RESOLVER_URL, params=payload)

    # Step 3.2 - Parse the returned information into single entries
    # --------------------------------------------------------------------------
    # This could probably be optimized in terms of execution time. We basically
    # get back a single script tag. Inside is some sort of hash map for links
    # for each reference.
    # The script tag is of the form:
    #   myMap['bibsbref11']['refHtml']= "<some html stuffs>";
    #   myMap['bibsbref11']['absUrl']= "";
    #   etc.
    #   - Each entry is quite long.
    #   - Normally contains html
    #   - can be empty i.e. myMap['bibsbref11']['refHtml'] = "";
    #   - the refHtml is quite interesting
    #   - the absolute url is not always present (and currently not parsed)
    more_soup = BeautifulSoup(r2.text)
    script_tag = more_soup.find('script')

    # We unquote the script text as it is transmitted with characters escaped
    # and we want the parsed data to contain the non-escaped text
    # We might eventually want to move this to being after the regular expression ...
    script_text = urllib_unquote(script_tag.text)

    ref_match_result = re.findall(r"myMap\['bibsbref(\d+)'\]\['refHtml'\]=\s?" + '"([^"]*)";', script_text)
    # Tokens:
    # 0 - the # from bibsbref#
    # 1 - the html content from the 'refHtml' entry
    # NOTE: We don't really use the #, so we might remove the () around
    # \d+ which would shift the index from 1 to 0
    if verbose:
        print('Creating reference objects')

    if len(ref_match_result) > 0:
        # NOTE(review): arguments are passed as (ref_tags, ref_id,
        # ref_link_info), the order of the __init__ signatures visible in
        # this file; the link html used to be passed in the ref_id slot.
        zipped = zip(ref_tags, ref_match_result, range(n_refs))
        ref_objects = [ScienceDirectRef(ref_tag, ref_id, ref_link_info[1]) for
                       ref_tag, ref_link_info, ref_id in zipped]
    else:
        zipped = zip(ref_tags, range(n_refs))
        ref_objects = [ScienceDirectRef(ref_tag, ref_id) for
                       ref_tag, ref_id in zipped]

    # Step 4:
    # --------------------------------------------------------------------------
    # TODO: Improve documentation for this step

    if verbose:
        print('Retrieving Scopus Counts')

    ref_scopus_eids = []  # The Scopus IDs of the references to resolve
    # but with a particular formatting ...
    ref_count = 0  # Number of references we haven't resolved

    ref_count_list = []
    # NOTE: Browser requests these in the reverse order ...
    for ref_id, ref in enumerate(ref_objects):

        if ref._data_sceid is not None:
            ref_scopus_eids.append(ref._data_sceid + ',' + str(ref_id + 1) + '~')
            ref_count += 1

            # If we've got enough, then update the counts
            # The 20 may be arbitrary but it was what was used in original JS
            if ref_count > 20:
                # Argument order follows _update_counts(eids, resolve_url, s)
                ref_count_list += _update_counts(ref_scopus_eids, REF_RESOLVER_URL, sess)
                ref_count = 0
                ref_scopus_eids = []

    # Get any remaining reference counts
    if ref_count != 0:
        ref_count_list += _update_counts(ref_scopus_eids, REF_RESOLVER_URL, sess)

    # Take the raw data and set the citation count for each object
    for ref_tuple in ref_count_list:
        ref_id = int(ref_tuple[0]) - 1
        ref_count = int(ref_tuple[1])
        ref_objects[ref_id].scopus_cite_count = ref_count

    # All done!
    # ---------
    return ref_objects
# Example #10
    def __init__(self, ref_tags, ref_id, ref_link_info=None):
        """
        Build a reference object from the html of one bibliography entry.

        Parameters
        ----------
        ref_tags: bs4.element.Tag
            Html tags as soup of the reference. Information provided is that
            needed in order to form a citation for the given reference.
        ref_id: int
            Zero-based position of the reference in the citing entry. It is
            stored as ``self.ref_id = ref_id + 1``, so a stored value of 1
            indicates the first reference in the bibliography.
        ref_link_info: str
            Html, not yet souped. Contains extra information such as links to
            a pdf (if known) and other goodies

        Raises
        ------
        Exception
            A link box matched none of the known link kinds.
        """

        # Reference Bibliography Section:
        # -------------------------------

        # Example str: <span class="r_volume">Volume 47</span>
        self.ref_id = ref_id + 1  # Input is 0 based
        self.title = findValue(ref_tags, 'li', 'reference-title', 'class')
        all_authors = ref_tags.find_all('span', {'class' : 'reference-author'})
        self.authors = [x.text for x in all_authors]
        #self.authors = findValue(ref_tags, 'li', 'reference-author', 'class')
        # NOTE: We can also get individual authors if we would like.
        #   Search would be on:
        #       <span class="reference-author">
        #   instead of on the list.

        # Unfortunately r_publication is found both for the title and for
        # the publication. Some custom code is needed to first go into a r_series
        # span and then to the publication
        self.publication = None
        r_source_tag = ref_tags.find('span', {'class': 'r_series'})

        if r_source_tag is not None:
            pub_tag = r_source_tag.find('span', {'class': 'r_publication'})
            if pub_tag is not None:
                # NOTE(review): this replaces the literal text "\xa0" (a
                # backslash followed by "xa0"), not a non-breaking space -
                # confirm which one is intended.
                self.publication = pub_tag.text.replace('\\xa0', ' ')

        temp_volume = findValue(ref_tags, 'span', 'r_volume', 'class')
        if temp_volume is None:
            self.volume = None
        else:
            self.volume = temp_volume.replace('Volume ', '')

        self.issue = findValue(ref_tags, 'span', 'r_issue', 'class')
        self.series = findValue(ref_tags, 'span', 'r_series', 'class')
        # NOTE(review): the attribute name on this line was lost; "date" is
        # a reconstruction - confirm against the users of this class.
        self.date = findValue(ref_tags, 'span', 'r_pubdate', 'class')

        temp_pages = findValue(ref_tags, 'span', 'r_pages', 'class')
        if temp_pages is None:
            self.pages = None
        else:
            # TODO: is the unicode working properly ??? 576–577 and ideally 576-577
            self.pages = temp_pages.replace('pp. ', '')

        # Reference Meta Section:
        # -----------------------
        self.scopus_link = None
        self.doi = None
        self._data_sceid = None
        self.pii = None
        self.pdf_link = None
        self.scopus_cite_count = None
        self.aps_full_text = None

        if ref_link_info is not None:
            link_soup = BeautifulSoup(ref_link_info)

            # Each section is contained a div tag with the class boxLink, although
            # some classes have more text in the class attribute, thus the *)
            #box_links = link_soup.find_all('div', {'class': re.compile('boxLink*')})
            box_links = link_soup.find_all('div', {'class' : 'boxLink'})

            # This code is a bit hard to read but each 'if statement' shows what
            # is needed in order to resolve the item.
            for box_link in box_links:
                div_class_values = box_link.attrs['class']
                link_tag = box_link.find('a')
                if 'SC_record' in div_class_values:
                    # "View Record in Scopus"
                    # They changed to returning a full link
                    # I should really use a library to resolve based on both
                    # although the input should be the current page, not the base
                    # self.scopus_link = _SD_URL + link_tag.attrs['href']
                    self.scopus_link = link_tag.attrs['href']
                elif 'class' in link_tag.attrs and 'S_C_pdfLink' in link_tag.attrs['class']:
                    # Link to PDF
                    self.pdf_link = _SD_URL + link_tag.attrs['href']
                elif 'class' in link_tag.attrs and 'cLink' in link_tag.attrs['class']:
                    # Article Link
                    temp = link_tag.attrs['href']
                    match = re.search('/pii/(.*)', temp)
                    self.pii = match.group(1)
                    self.doi = self.doi_from_crossref(self.pii)
                elif 'CrossRef' in box_link.text:
                    # CrossRef link provides DOI as href
                    # In old code it was a query parameter but this
                    # has now moved to a "data-url" attribute
                    temp = link_tag.attrs['href']
                    match = re.search(r'dx\.doi\.org/(.*)', temp)
                    # Unquote removes %xx escape characters
                    self.doi = urllib_unquote(match.group(1))
                elif "Purchase" in box_link.text:
                    # New link added to Purchase pdf. It was throwing errors
                    pass
                elif 'aps full text' in box_link.text.lower():
                    self.aps_full_text = link_tag.attrs['href']
                else:
                    span_tag = link_tag.find('span')
                    if 'citedBy_' in span_tag.attrs['class']:
                        # Cited By Scopus Count
                        # NOTE: Apparently the citedByScopus doesn't get added
                        # until later so we need to look for the scan tag. Let's
                        # do this only if all else fails.
                        self._data_sceid = span_tag.attrs['data-sceid']
                    else:
                        raise Exception('Failed to match link')

        # Finally, update if it is not an article
        tag_class = ref_tags.get('class')[0]
        if tag_class == 'article-reference-other-ref':
            publication = ref_tags.find('em')
            if publication is not None:
                self.publication = publication.text
            self.title = ref_tags.text
# Example #11
    def __init__(self, ref_tags, ref_id):
        """
        Build a reference object from the html of one bibliography entry.

        Parameters
        ----------
        ref_tags: bs4.element.Tag
            Html tags as soup of the reference. Information provided is that
            needed in order to form a citation for the given reference.
        ref_id: int
            Zero-based position of the reference in the citing entry. It is
            stored as ``self.ref_id = ref_id + 1``, so a stored value of 1
            indicates the first reference in the bibliography.
        """

        # Reference Bibliography Section:
        self.ref_id = ref_id + 1 # Input is 0 indexed
        self.title = findValue(ref_tags, 'span', 'articleTitle', 'class')
        authorlist = ref_tags.find_all('span', 'author', 'class')
        self.authors = [x.text for x in authorlist]

        # Note: we can also get individual authors if we would like.
        # On Wiley, each reference author is given a separate <span> tag with the class 'author'
        # so individual authors can be extracted

        self.publication = findValue(ref_tags, 'span', 'journalTitle', 'class')
        self.volume = findValue(ref_tags, 'span', 'vol', 'class')
        # NOTE(review): the attribute name on this line was lost; "date" is
        # a reconstruction - confirm against the users of this class.
        self.date = findValue(ref_tags, 'span', 'pubYear', 'class')

        firstp = findValue(ref_tags, 'span', 'pageFirst', 'class')
        lastp = findValue(ref_tags, 'span', 'pageLast', 'class')
        if (firstp is not None) and (lastp is not None):
            self.pages = firstp + '-' + lastp
        else:
            self.pages = None

        # Reference Meta Section:

        self.crossref = None
        self.pubmed = None
        self.pubmed_id = None
        self.doi = None
        self.citetimes = None
        self.cas = None
        self.abstract = None
        self.pdf_link = None
        self.ref_references = None

        # External links (i.e. PubMed, CrossRef, CAS) are kept in a ul tag
        # Internal links (i.e. direct to abstract, references, etc.) are in a div
        # Need to check for both
        links = ref_tags.find('ul', 'externalReferences', 'class')
        if links is None:
            links = ref_tags.find('div', 'internalReferences', 'class')

        # Only proceed if either internal or external references were found
        if links is not None:
            links = links.find_all('li')

            # Check against all possible link options and save links.
            # href links are appended onto base URL ('')
            for link in links:
                label = link.text.lower()
                href = link.find('a', href=True)['href']
                href = urllib_quote(href)

                if 'crossref' in label:
                    # Grab everything starting with '10.' in link. The
                    # position has to be tested before slicing: the old
                    # check compared the sliced string with -1, which could
                    # never match, so a link without '10.' produced its last
                    # character as the DOI.
                    doi_start = href.find('10.')
                    if doi_start == -1:
                        self.doi = None
                    else:
                        self.doi = urllib_unquote(href[doi_start:])
                    # CrossRef link is in the form of _WY_URL/resolve/reference/XREF?id=10.#######
                    self.crossref = _WY_URL + urllib_unquote(href)
                elif 'pubmed' in label:
                    self.pubmed_id = re.search('[^id=]+$',href).group(0)[1:] # the [1:] is to get rid of leading '='
                    self.pubmed_id = urllib_unquote(self.pubmed_id)
                    self.pubmed = _WY_URL + urllib_unquote(href)
                elif 'web ' in label:
                    self.citetimes = re.search('[^: ]+$',label).group(0)
                elif label in ('cas', 'cas,'):
                    self.cas = _WY_URL + urllib_unquote(href)
                elif 'abstract' in label:
                    self.abstract = _WY_URL + urllib_unquote(href)
                elif 'pdf' in label:
                    self.pdf_link = _WY_URL + urllib_unquote(href)
                elif 'references' in label:
                    self.ref_references = _WY_URL + urllib_unquote(href)
# Example #12
 def unquote(uri):
     """Percent-decode an ASCII-only *uri* and read the result as UTF-8.

     The text is first encoded to ASCII bytes (non-ASCII input raises
     UnicodeEncodeError), then passed through ``urllib_unquote``, whose
     result must be a byte string because it is decoded as UTF-8 here.
     """
     return urllib_unquote(uri.encode("ascii")).decode("utf-8")
    def __init__(self, ref_tags, ref_id, ref_link_info=None):
        """
        Build a reference object from the html of one bibliography entry.

        Parameters
        ----------
        ref_tags: bs4.element.Tag
            Html tags as soup of the reference. Information provided is that
            needed in order to form a citation for the given reference.
        ref_id: int
            Zero-based position of the reference in the citing entry. It is
            stored as ``self.ref_id = ref_id + 1``, so a stored value of 1
            indicates the first reference in the bibliography.
        ref_link_info: str
            Html, not yet souped. Contains extra information such as links to
            a pdf (if known) and other goodies. When None, the
            ``<li class="external">`` element of the reference is used.

        Raises
        ------
        Exception
            A link box matched none of the known link kinds.
        """

        # Reference Bibliography Section:
        # -------------------------------

        # NOTE(review): a leftover debugger import used to sit here for the
        # case that no <ul class="reference"> is found; that case is still
        # not handled and fails further down when `ref` is used.
        ref = ref_tags.find('ul', {'class': 'reference'})

        # Example str: <span class="r_volume">Volume 47</span>
        self.ref_id = ref_id + 1  # Input is 0 based
        self.title = findValue(ref, 'li', 'title', 'class')

        all_authors = ref_tags.find('li', {'class' : 'author'})
        author_text = all_authors.text
        author_list = author_text.split(', ')
        # Keep the stripped names; the loop that was here stripped each name
        # and then discarded it, leaving the list empty.
        self.authors = [x.strip() for x in author_list]

        # Unfortunately r_publication is found both for the title and for
        # the publication. Some custom code is needed to first go into a r_series
        # span and then to the publication
        self.publication = ''
        self.volume = ''
        # NOTE(review): the attribute name was lost wherever it occurred in
        # this method; "date" is a reconstruction - confirm against the
        # users of this class.
        self.date = ''
        self.pages = ''

        source = ref.find('li', {'class': 'source'})
        source = source.text
        source_parts = source.split(', ')

        # Attempt to parse publication into sections
        found = 0
        for elt in source_parts:
            # This first part is looking for the date, which is written (xxxx).
            # It is also saved as volume because they are within the same element
            # and sometimes the volume is also in parentheses.
            if '(' in elt:
                found = 1
                self.volume = elt
                self.date = elt

            # Until the date/volume part is found, save everything before as publication.
            # This can either be simple like 'Nature' or more complex, with a description
            # and location, in which case, it would be multiple elements in source_parts.
            if found == 0:
                self.publication = self.publication + elt

            # Find and save the pages portion, usually notated either with 'p. ' or 'pp. '
            if 'p. ' in elt:
                self.pages = elt
                self.pages = self.pages.replace('p. ', '')
                self.pages = self.pages.replace('p', '')

        # This checks if the parsing didn't work. All of the information
        # should still be retained even if not parsed correctly.
        if self.date == '' and self.pages == '':
            self.publication = source

        # Reference Meta Section:
        # -----------------------
        self.scopus_link = None
        self.doi = None
        self._data_sceid = None
        self.pii = None
        self.pdf_link = None
        self.scopus_cite_count = None
        self.aps_full_text = None

        if ref_link_info is None:
            link_soup = ref.find('li', {'class': 'external'})
        else:
            link_soup = BeautifulSoup(ref_link_info)

        if link_soup is not None:
            # Each section is contained a div tag with the class boxLink, although
            # some classes have more text in the class attribute, thus the *)
            #box_links = link_soup.find_all('div', {'class': re.compile('boxLink*')})
            box_links = link_soup.find_all('div', {'class' : 'boxLink'})

            # This code is a bit hard to read but each 'if statement' shows what
            # is needed in order to resolve the item.
            for box_link in box_links:
                div_class_values = box_link.attrs['class']
                link_tag = box_link.find('a')
                if 'SC_record' in div_class_values:
                    # "View Record in Scopus"
                    # They changed to returning a full link
                    # I should really use a library to resolve based on both
                    # although the input should be the current page, not the base
                    # self.scopus_link = _SD_URL + link_tag.attrs['href']
                    self.scopus_link = link_tag.attrs['href']
                elif 'class' in link_tag.attrs and 'S_C_pdfLink' in link_tag.attrs['class']:
                    # Link to PDF
                    self.pdf_link = _SD_URL + link_tag.attrs['href']
                elif 'class' in link_tag.attrs and 'cLink' in link_tag.attrs['class']:
                    # Article Link
                    temp = link_tag.attrs['href']
                    match = re.search('/pii/(.*)', temp)
                    self.pii = match.group(1)
                    self.doi = self.doi_from_crossref(self.pii)
                elif 'CrossRef' in box_link.text:
                    # CrossRef link provides DOI as href
                    # In old code it was a query parameter but this
                    # has now moved to a "data-url" attribute
                    temp = link_tag.attrs['href']
                    match = re.search(r'dx\.doi\.org/(.*)', temp)
                    # Unquote removes %xx escape characters
                    self.doi = urllib_unquote(match.group(1))
                elif "Purchase" in box_link.text:
                    # New link added to Purchase pdf. It was throwing errors
                    pass
                elif 'aps full text' in box_link.text.lower():
                    self.aps_full_text = link_tag.attrs['href']
                else:
                    span_tag = link_tag.find('span')
                    if 'citedBy_' in span_tag.attrs['class']:
                        # Cited By Scopus Count
                        # NOTE: Apparently the citedByScopus doesn't get added
                        # until later so we need to look for the scan tag. Let's
                        # do this only if all else fails.
                        self._data_sceid = span_tag.attrs['data-sceid']
                    else:
                        raise Exception('Failed to match link')

        # Finally, update if it is not an article
        ref_tag_ul = ref_tags.find('ul')
        tag_class = ref_tag_ul.get('class')
        if tag_class is not None:
            tag_class = tag_class[0]
        if tag_class == 'article-reference-other-ref':
            publication = ref_tags.find('em')
            if publication is not None:
                self.publication = publication.text
            self.title = ref_tags.text
# Example #14
 def unquote(uri):
     """Percent-decode an ASCII-only *uri* and read the result as UTF-8.

     The text is first encoded to ASCII bytes (non-ASCII input raises
     UnicodeEncodeError), then passed through ``urllib_unquote``, whose
     result must be a byte string because it is decoded as UTF-8 here.
     """
     ascii_bytes = uri.encode('ascii')
     return urllib_unquote(ascii_bytes).decode('utf-8')