def _un_htmlify(self, text):
    def _handle_bad_html(s):
        pttn = re.compile('<|>')
        return pttn.sub(' ', s)

    try:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            soup = BeautifulSoup(text.strip())
    except UserWarning:
        return ''

    # get all of the text and any a/@href values
    texts = [
        _handle_bad_html(t.strip('"'))
        for t in soup.find_all(text=True)
    ]
    if self.include_html_hrefs:
        texts += [
            unquote(a['href'])
            for a in soup.find_all('a')
            if 'href' in a.attrs
        ]

    try:
        text = ' '.join(texts)
    except:
        raise

    return text
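# A minimal, standalone sketch of the text / a-@href harvesting that
# _un_htmlify performs above. The sample HTML and the explicit
# 'html.parser' choice are illustrative only (the method lets
# BeautifulSoup pick a parser itself).
from bs4 import BeautifulSoup
from urllib.parse import unquote

_sample = '<p>See <a href="http://example.org/a%20b">link</a></p>'
_soup = BeautifulSoup(_sample, 'html.parser')
_texts = [t.strip() for t in _soup.find_all(text=True) if t.strip()]
_hrefs = [unquote(a['href']) for a in _soup.find_all('a') if 'href' in a.attrs]
# _texts == ['See', 'link'], _hrefs == ['http://example.org/a b']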
def _extract_url(self, text):
    # but really first, is it a urn?
    text = self._verify_url(text)
    if not text:
        return '', '', []

    url = self._tidy_text(unquote(text))
    base_url, values = break_url(url)
    values = values.split(' ') + [base_url] if base_url else []

    # we're just running with a hack
    if url == 'http://dx.doi.org':
        return '', '', []

    if 'dx.doi.org' in base_url:
        t = 'doi'
    elif 'hdl.handle.net' in base_url:
        t = 'hdl'
    else:
        t = 'url'

    # return the original extracted url, tag, and the values plus
    # the base_url for more extracting
    return url, t, filter(None, [self._tidy_text(v) for v in values])
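# A minimal, standalone sketch (not part of the class above) of the
# doi/hdl/url tagging rule that _extract_url applies to base_url; the
# function name here is hypothetical, and break_url / the rest of the
# extraction pipeline are assumed rather than reproduced.
def _classify_base_url(base_url):
    # dx.doi.org hosts DOIs and hdl.handle.net hosts Handle identifiers;
    # everything else is tagged as a plain URL.
    if 'dx.doi.org' in base_url:
        return 'doi'
    if 'hdl.handle.net' in base_url:
        return 'hdl'
    return 'url'

# e.g. _classify_base_url('http://dx.doi.org/10.1000/182') -> 'doi'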