Example #1
    # relies on module-level imports elsewhere in the source file:
    # re, warnings, bs4.BeautifulSoup, urllib.parse.unquote
    def _un_htmlify(self, text):
        def _handle_bad_html(s):
            # blank out stray angle brackets left by malformed markup
            pttn = re.compile('<|>')
            return pttn.sub(' ', s)

        try:
            with warnings.catch_warnings():
                # escalate bs4's UserWarnings (e.g. "input looks like a
                # URL, not markup") to errors so that suspect input is
                # caught below; the original "ignore" filter made the
                # except clause unreachable
                warnings.simplefilter('error', UserWarning)
                soup = BeautifulSoup(text.strip(), 'html.parser')
        except UserWarning:
            return ''

        # get all of the text and any a/@href values
        texts = [
            _handle_bad_html(t.strip('"'))
            for t in soup.find_all(string=True)  # `string=` supersedes `text=`
        ]
        if self.include_html_hrefs:
            texts += [
                unquote(a['href']) for a in soup.find_all('a')
                if 'href' in a.attrs
            ]

        return ' '.join(texts)

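A minimal runnable sketch of what the method does. Everything below is a hypothetical stand-in, not part of the original source: the real class defines include_html_hrefs and its other helpers elsewhere, so only the pieces _un_htmlify touches are reproduced here.

# Hypothetical demo class -- a stand-in for the class that owns _un_htmlify.
import re
import warnings
from urllib.parse import unquote

from bs4 import BeautifulSoup


class _UnHtmlifyDemo:
    include_html_hrefs = True

    def _un_htmlify(self, text):
        # same logic as the method above, trimmed to the happy path
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            soup = BeautifulSoup(text.strip(), 'html.parser')
        texts = [re.sub('<|>', ' ', t.strip('"'))
                 for t in soup.find_all(string=True)]
        if self.include_html_hrefs:
            texts += [unquote(a['href'])
                      for a in soup.find_all('a') if 'href' in a.attrs]
        return ' '.join(texts)


demo = _UnHtmlifyDemo()
print(demo._un_htmlify(
    '<p>See the <a href="http%3A%2F%2Fexample.com%2Fdata">dataset</a></p>'))
# -> 'See the  dataset http://example.com/data'
#    (text nodes are joined as-is, so internal whitespace is not normalized)
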
    def _extract_url(self, text):
        # first, make sure this is actually a URL (or URN) at all
        text = self._verify_url(text)
        if not text:
            return '', '', []
        url = self._tidy_text(unquote(text))
        base_url, values = break_url(url)
        # without a base_url there is nothing worth extracting
        values = (values.split(' ') + [base_url]) if base_url else []

        # hack: a bare dx.doi.org URL carries no identifier, so drop it
        if url == 'http://dx.doi.org':
            return '', '', []

        if 'dx.doi.org' in base_url:
            t = 'doi'
        elif 'hdl.handle.net' in base_url:
            t = 'hdl'
        else:
            t = 'url'

        # return the original extracted url, its tag, and the non-empty
        # tidied values (base_url included) for further extraction;
        # materialize the filter so every return path yields a list
        return url, t, list(filter(None, [self._tidy_text(v) for v in values]))
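
break_url is an external helper that is not shown in this example. Assuming it returns a (base_url, values) pair of host plus path/query text, a hypothetical urlparse-based stand-in is enough to show how the doi/hdl/url tagging above plays out:

# Hypothetical break_url stand-in -- the real helper lives elsewhere.
from urllib.parse import urlparse


def break_url(url):
    # split a URL into its host and a space-joined path/query string,
    # matching the (base_url, values) shape unpacked above
    parsed = urlparse(url)
    return parsed.netloc, ' '.join(p for p in (parsed.path, parsed.query) if p)


for url in ('http://dx.doi.org/10.1000/182',
            'http://hdl.handle.net/2027/spo.3336451.0004.203',
            'http://example.com/data?format=json'):
    base_url, values = break_url(url)
    if 'dx.doi.org' in base_url:
        tag = 'doi'
    elif 'hdl.handle.net' in base_url:
        tag = 'hdl'
    else:
        tag = 'url'
    print(tag, base_url, values.split(' '))
# doi dx.doi.org ['/10.1000/182']
# hdl hdl.handle.net ['/2027/spo.3336451.0004.203']
# url example.com ['/data', 'format=json']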