Ejemplo n.º 1
0
 def extract_data(self, etree, url):
     """
     Extract all possible data about the publication from the web page.

     Resets the per-page state kept on the instance (url, domain, _storage,
     _publ, pagetext) and then runs the extraction passes in a fixed order:
     <meta> tags, text visibility / headers, unbound entities, BibTeX.

     @param etree - parsed DOM tree of the web page (has to be instance of
                    lxml.etree._ElementTree)
     @param url - url of the web page
     @return RRSPublication object containing extracted data
     """
     assert isinstance(url, basestring)
     assert isinstance(etree, _ElementTree)
     # Disabled alternative cleaning step, kept for reference:
     #c = Cleaner(scripts=True, javascript=True, comments=True, style=False,
     #            meta=False, page_structure=False, processing_instructions=True,
     #            embedded=True, frames=False, forms=True, annoying_tags=False,
     #            add_nofollow=False, remove_unknown_tags=False)
     #etree = c.clean_html(etree)
     self.url = url
     # First host label of the URL, without the scheme and a leading "www.".
     # The pattern is anchored and also consumes the dot after "www":
     # without that, "http://www.example.com" is reduced to ".example.com"
     # and the first label comes out empty, while https URLs would keep
     # their scheme in the result.
     host = re.sub(r"(?i)^https?://(www\.)?", "", self.url)
     self.domain = host.split(".")[0]
     self._storage = {}
     self._publ = RRSPublication()
     cleaned_etree = SimpleHTMLCleaner.clean_html(etree)
     page = HTMLDocument(cleaned_etree, url)
     # the page text is read before parse_document() is called
     self.pagetext = page.get_etree().getroot().text_content()
     # parse CSS and metadata on the page
     page.parse_document()
     # get data from <meta> tags and convert to RRS format
     self._parse_meta(page)
     # get data on the basis of the text visibility and recognized headers
     self._parse_visibility(page)
     # and now guess :)
     self._find_unbound_entities(page)
     # and parse BibTeX
     self._parse_bibtex(page)
     return self._publ
 def extract_data(self, etree, url):
     """
     Extract all possible data about the publication from the web page.

     Resets the per-page state kept on the instance (url, domain, _storage,
     _publ, pagetext) and then runs the extraction passes in a fixed order:
     <meta> tags, text visibility / headers, unbound entities, BibTeX.

     @param etree - parsed DOM tree of the web page (has to be instance of
                    lxml.etree._ElementTree)
     @param url - url of the web page
     @return RRSPublication object containing extracted data
     """
     assert isinstance(url, basestring)
     assert isinstance(etree, _ElementTree)
     # Disabled alternative cleaning step, kept for reference:
     #c = Cleaner(scripts=True, javascript=True, comments=True, style=False,
     #            meta=False, page_structure=False, processing_instructions=True,
     #            embedded=True, frames=False, forms=True, annoying_tags=False,
     #            add_nofollow=False, remove_unknown_tags=False)
     #etree = c.clean_html(etree)
     self.url = url
     # First host label of the URL, without the scheme and a leading "www.".
     # The pattern is anchored and also consumes the dot after "www":
     # without that, "http://www.example.com" is reduced to ".example.com"
     # and the first label comes out empty, while https URLs would keep
     # their scheme in the result.
     host = re.sub(r"(?i)^https?://(www\.)?", "", self.url)
     self.domain = host.split(".")[0]
     self._storage = {}
     self._publ = RRSPublication()
     cleaned_etree = SimpleHTMLCleaner.clean_html(etree)
     page = HTMLDocument(cleaned_etree, url)
     # the page text is read before parse_document() is called
     self.pagetext = page.get_etree().getroot().text_content()
     # parse CSS and metadata on the page
     page.parse_document()
     # get data from <meta> tags and convert to RRS format
     self._parse_meta(page)
     # get data on the basis of the text visibility and recognized headers
     self._parse_visibility(page)
     # and now guess :)
     self._find_unbound_entities(page)
     # and parse BibTeX
     self._parse_bibtex(page)
     return self._publ
 def __init__(self, elemtree, url):
     """
     Initialize the HTMLDocument base with the given tree and url and start
     with an empty list of regions.
     @param elemtree - element tree handed on to HTMLDocument.__init__
     @param url - url handed on to HTMLDocument.__init__
     """
     # base-class state first, then this class's own attribute
     HTMLDocument.__init__(self, elemtree, url)
     self.regions = list()
 def __init__(self, elemtree, url):
     """
     Initialize the HTMLDocument base with the given tree and url and start
     with an empty list of regions.
     @param elemtree - element tree handed on to HTMLDocument.__init__
     @param url - url handed on to HTMLDocument.__init__
     """
     # base-class state first, then this class's own attribute
     HTMLDocument.__init__(self, elemtree, url)
     self.regions = list()