Ejemplo n.º 1
0
 def _to_unicode(self, text):
     """Convert ``text`` to unicode and substitute BibTeX sequences.

     The input is first passed through
     ``unicode_helpers.to_unicode_or_bust``. When the result contains a
     ``{``, every backslash is removed and each key of
     ``self.bibtex_to_unicode`` that occurs in the text is replaced by
     its mapped value.

     :param text: raw text to convert.
     :returns: the converted text.
     """
     text = unicode_helpers.to_unicode_or_bust(text)
     if "{" in text:
         # Backslashes are stripped before the table lookup, so the table
         # keys are presumably stored without them -- confirm against the
         # definition of bibtex_to_unicode.
         text = text.replace("\\", "")
         # items() exists on both Python 2 and 3; iteritems() is Python 2
         # only and raises AttributeError on Python 3.
         for bibtex_seq, replacement in self.bibtex_to_unicode.items():
             text = text.replace(bibtex_seq, replacement)
     return text
Ejemplo n.º 2
0
 def _to_unicode(self, text):
     """Return ``text`` as unicode with BibTeX sequences substituted.

     ``unicode_helpers.to_unicode_or_bust`` performs the conversion.
     Only when the converted text contains a ``{`` are backslashes
     removed and the ``self.bibtex_to_unicode`` replacements applied.

     :param text: raw text to convert.
     :returns: the converted text.
     """
     text = unicode_helpers.to_unicode_or_bust(text)
     if "{" not in text:
         return text

     # Remove backslashes first; the lookup below then runs against the
     # backslash-free text.
     text = text.replace("\\", "")
     # Use items() rather than the Python-2-only iteritems() so the method
     # also runs on Python 3.
     for source, target in self.bibtex_to_unicode.items():
         text = text.replace(source, target)
     return text
Ejemplo n.º 3
0
    def _extract_biblio(self, page, id=None):
        """Extract the title and first ``<h1>`` text from an HTML page.

        The page is parsed with ``lxml.html``. If parsing fails, a warning
        is logged and the title is recovered with a regular expression
        instead.

        :param page: HTML content of the page; a falsy value yields ``{}``.
        :param id: identifier used only in the warning log message.
        :returns: dict that may contain the keys ``"title"`` and ``"h1"``,
            each mapped to non-empty, whitespace-stripped text.
        """
        biblio_dict = {}

        if not page:
            return biblio_dict

        unicode_page = unicode_helpers.to_unicode_or_bust(page)
        try:
            parsed_html = lxml.html.document_fromstring(unicode_page)

            for tag in ("title", "h1"):
                try:
                    text = parsed_html.find(".//" + tag).text
                except AttributeError:
                    # find() returned None: the page has no such element.
                    continue
                # Skip empty and whitespace-only text so that no empty
                # string is ever stored.
                stripped = text.strip() if text else ""
                if stripped:
                    biblio_dict[tag] = stripped

        # throws ParserError when document is empty
        except (ValueError, lxml.etree.ParserError):
            logger.warning(
                u"%20s couldn't parse %s so giving up on webpage biblio" %
                (self.provider_name, id))
            match = re.search("<title>(.+?)</title>", unicode_page)
            if match:
                # str.replace returns a new string; the result has to be
                # kept for the line breaks to actually be removed.
                title = match.group(1).replace("\n", "").replace("\r", "")
                title = title.strip()
                if title:
                    biblio_dict["title"] = title
        return biblio_dict
Ejemplo n.º 4
0
    def _extract_biblio(self, page, id=None):
        """Collect the page title and first ``<h1>`` text from HTML.

        Parsing is done with ``lxml.html``. When the parser rejects the
        document, a warning is logged and a regular expression is used to
        pull the title out of the raw text.

        :param page: HTML content of the page; a falsy value yields ``{}``.
        :param id: identifier used only in the warning log message.
        :returns: dict that may contain the keys ``"title"`` and ``"h1"``,
            each mapped to non-empty, whitespace-stripped text.
        """
        biblio_dict = {}

        if not page:
            return biblio_dict

        unicode_page = unicode_helpers.to_unicode_or_bust(page)
        try:
            parsed_html = lxml.html.document_fromstring(unicode_page)

            try:
                response = parsed_html.find(".//title").text
                if response and response.strip():
                    biblio_dict["title"] = response.strip()
            except AttributeError:
                # find() returned None: there is no <title> element.
                pass

            try:
                response = parsed_html.find(".//h1").text
                if response and response.strip():
                    biblio_dict["h1"] = response.strip()
            except AttributeError:
                # find() returned None: there is no <h1> element.
                pass

        # throws ParserError when document is empty
        except (ValueError, lxml.etree.ParserError):
            logger.warning(u"%20s couldn't parse %s so giving up on webpage biblio"
                            % (self.provider_name, id))
            found = re.search("<title>(.+?)</title>", unicode_page)
            if found is not None:
                # Strings are immutable: keep the value returned by
                # replace(), otherwise the line breaks stay in the title.
                response = found.group(1)
                response = response.replace("\n", "")
                response = response.replace("\r", "")
                response = response.strip()
                # Same non-empty rule as the parsed-HTML path above.
                if response:
                    biblio_dict["title"] = response
        return biblio_dict
Ejemplo n.º 5
0
 def _to_unicode(self, text):
     """Return ``text`` converted by ``unicode_helpers.to_unicode_or_bust``.

     :param text: value to convert.
     :returns: whatever ``to_unicode_or_bust`` returns for ``text``.
     """
     return unicode_helpers.to_unicode_or_bust(text)
Ejemplo n.º 6
0
 def _to_unicode(self, text):
     """Delegate unicode conversion to ``unicode_helpers``.

     :param text: value to convert.
     :returns: the result of ``unicode_helpers.to_unicode_or_bust(text)``.
     """
     converted = unicode_helpers.to_unicode_or_bust(text)
     return converted