def extract_data(self, etree, url):
    """
    Extract all possible data about the publication from the web page.

    Resets the per-page state (self.url, self.domain, self._storage,
    self._publ, self.pagetext) and then runs the extraction passes in
    order: <meta> tags, text visibility / headers, unbound entities, BibTeX.

    @param etree - parsed DOM tree of the web page (has to be instance of
                   lxml.etree._ElementTree)
    @param url - url of the web page (has to be a string)
    @return RRSPublication object containing extracted data
    @raise AssertionError - if one of the arguments has an unexpected type
    """
    assert isinstance(url, basestring)
    assert isinstance(etree, _ElementTree)

    self.url = url
    # First label of the host name, without the scheme or a leading "www.".
    # The pattern is anchored, accepts https as well as http, and removes
    # "www" together with its dot, so "http://www.example.com" yields
    # "example" instead of an empty string.
    self.domain = re.sub(r"^https?://(www\.)?", "", self.url).split(".")[0]
    self._storage = {}
    self._publ = RRSPublication()

    cleaned_etree = SimpleHTMLCleaner.clean_html(etree)
    page = HTMLDocument(cleaned_etree, url)
    # the plain text is taken before parse_document() is run on the page
    self.pagetext = page.get_etree().getroot().text_content()

    # parse CSS and metadata on the page
    page.parse_document()
    # get data from <meta> tags and convert to RRS format
    self._parse_meta(page)
    # get data on the basis of the text visibility and recognized headers
    self._parse_visibility(page)
    # and now guess :)
    self._find_unbound_entities(page)
    # and parse BibTeX
    self._parse_bibtex(page)
    return self._publ
def extract_data(self, etree, url):
    """
    Extract all possible data about the publication from the web page.

    Every call starts from fresh state: self.url, self.domain,
    self._storage, self._publ and self.pagetext are overwritten before the
    extraction passes (<meta> tags, text visibility / headers, unbound
    entities, BibTeX) are run in that order.

    @param etree - parsed DOM tree of the web page (has to be instance of
                   lxml.etree._ElementTree)
    @param url - url of the web page (has to be a string)
    @return RRSPublication object containing extracted data
    @raise AssertionError - if one of the arguments has an unexpected type
    """
    assert isinstance(url, basestring)
    assert isinstance(etree, _ElementTree)

    self.url = url
    # Strip the scheme (http or https) and an optional "www." prefix, then
    # keep the first dot-separated label. The dot after "www" has to be
    # consumed too, otherwise "http://www.example.com" is reduced to
    # ".example.com" and the first label is the empty string.
    host_and_path = re.sub(r"^https?://(www\.)?", "", self.url)
    self.domain = host_and_path.split(".")[0]
    self._storage = {}
    self._publ = RRSPublication()

    cleaned_etree = SimpleHTMLCleaner.clean_html(etree)
    page = HTMLDocument(cleaned_etree, url)
    # the plain text is taken before parse_document() is run on the page
    self.pagetext = page.get_etree().getroot().text_content()

    # parse CSS and metadata on the page
    page.parse_document()
    # get data from <meta> tags and convert to RRS format
    self._parse_meta(page)
    # get data on the basis of the text visibility and recognized headers
    self._parse_visibility(page)
    # and now guess :)
    self._find_unbound_entities(page)
    # and parse BibTeX
    self._parse_bibtex(page)
    return self._publ
def __init__(self, elemtree, url):
    """
    Initialise the HTMLDocument base part and start with no regions.

    @param elemtree - element tree handed over unchanged to
                      HTMLDocument.__init__
    @param url - url handed over unchanged to HTMLDocument.__init__
    """
    # the base class is set up first, exactly with the arguments received
    HTMLDocument.__init__(self, elemtree, url)
    # regions is created empty here; nothing is added to it in this method
    self.regions = list()