Code Example #1
class HTMLSequenceWrapperRecord(object):
    def __init__(self, element, url, mintextlen=10):
        self.cleaner = SimpleHTMLCleaner()
        self.mintextlen = mintextlen
        self.elem = element
        self.url = url

        # the whole text
        self.text = self.elem.text_content()
        self.text = self.cleaner.clean(self.text)

        self.chunks = []
        self.__extract_chunks(self.elem)

    def has_value(self):
        if self.cleaner.contains_text(self.text) == False:
            return False
        return len(self.text) > self.mintextlen

    def get_chunks(self):
        return self.chunks

    def get_text(self):
        return self.text

    def _handle_elem(self, elem):
        if elem.text == None: return None
        if not self.cleaner.contains_text(elem): return None
        # new chunk
        chunk = TextChunk()

        ## extracting links
        if elem.get('href') != None:
            chunk.set_link(elem.get('href'))
        # extracting 'title' attribute from the anchor
        if elem.tag == 'a' and elem.get('title') != None:
            chunk.set_comment(elem.get('title'))

        # extracting text
        txt = elem.text_content()
        chunk.set_text(self.cleaner.clean(txt))

        # setting style
        fs = elem.style
        chunk.set_style(fs)
        chunk.set_tag(elem.tag)
        return chunk

    def __extract_chunks(self, elem):
        thischunk = self._handle_elem(elem)
        if thischunk != None:
            self.chunks.append(thischunk)
        for child in elem.iterchildren():
            self.__extract_chunks(child)

    def __str__(self):
        return "<" + __modulename__ + ".HTMLSequenceWrapperRecord instance " + self.text + " >"
Code Example #2
    def __init__(self, element, url, mintextlen=10):
        self.cleaner = SimpleHTMLCleaner()
        self.mintextlen = mintextlen
        self.elem = element
        self.url = url

        # the whole text
        self.text = self.elem.text_content()
        self.text = self.cleaner.clean(self.text)

        self.chunks = []
        self.__extract_chunks(self.elem)
Code Example #3
    def __init__(self,
                 childcoef=7.0,
                 headercoef=4.0,
                 mintextlen=10,
                 omitted_tags=('option', 'br', 'select', 'form')):
        self.sequences = {}
        self.childcoef = childcoef
        self.headercoef = headercoef
        self.mintextlen = mintextlen
        self.omitted_tags = omitted_tags

        self.records = []
        self.cleaner = SimpleHTMLCleaner()
Code Example #4
 def extract_data(self, etree, url):
     """
     Extract all possible data about the publication from the web page.
     @param etree - parsed DOM tree of the web page (has to be instance of
                    lxml.etree._ElementTree)
     @param url - url of the web page
     @return RRSPublication object containing extracted data
     """
     assert isinstance(url, basestring)
     assert isinstance(etree, _ElementTree)
     #c = Cleaner(scripts=True, javascript=True, comments=True, style=False,
     #            meta=False, page_structure=False, processing_instructions=True,
     #            embedded=True, frames=False, forms=True, annoying_tags=False,
     #            add_nofollow=False, remove_unknown_tags=False)
     #etree = c.clean_html(etree)
     self.url = url
     self.domain = re.sub("http://(www\.)?", "", self.url).split(".")[0]
     self._storage= {}
     self._publ = RRSPublication()
     cleaned_etree = SimpleHTMLCleaner.clean_html(etree)
     page = HTMLDocument(cleaned_etree, url)
     self.pagetext = page.get_etree().getroot().text_content()
     # parse CSS and metadata on the page
     page.parse_document()
     # get data from <meta> tags and convert to RRS format
     self._parse_meta(page)
     # get data on the basis of the text visibility and recognized headers
     self._parse_visibility(page)
     # and now guess :)
     self._find_unbound_entities(page)
     # and parse BibTeX
     self._parse_bibtex(page)
     return self._publ
Code Example #5
 def _find_abstract(self, etree):
     c = Cleaner(scripts=True, javascript=True, comments=True, style=True,
                 meta=True, page_structure=False, processing_instructions=True,
                 embedded=True, frames=False, forms=True, annoying_tags=True,
                 add_nofollow=False, remove_unknown_tags=False)
     etree_copy = deepcopy(etree)
     etree_copy = c.clean_html(etree_copy)
     html = tostring(etree_copy.getroot())
     # XXX it may be useful to delete all <p> tags...
     html = re.sub("</?p[^>]*>", " ", html)
     possible = []
     txts = re.findall("(?<=\>)[^>]+(?=\<)", html, re.U)
     for txt in txts:
         txt = SimpleHTMLCleaner.clean(txt)
         if len(txt) > 200:
             do_not_append = False
             for bl in self._abstract_blacklist:
                 if txt.startswith(bl):
                     do_not_append = True
                     break
             if not do_not_append:
                 possible.append(txt)
                 continue
         for st in self._abstract_startswith:
             if txt.startswith(st):
                 possible.append(txt)
                 break
     return self._get_longest_string(possible)
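
The core of the scan above is the look-around regex that pulls raw text nodes out of the serialized markup before they are filtered by length and prefix. On a toy snippet it behaves like this (illustrative only):

import re

# The text-between-tags extraction step from _find_abstract, run on a toy snippet.
snippet = "<div><h2>Abstract</h2><p>This paper presents a wrapper for ...</p></div>"
print(re.findall(r"(?<=\>)[^>]+(?=\<)", snippet, re.U))
# ['Abstract', 'This paper presents a wrapper for ...']
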
Code Example #6
 def extract_data(self, etree, url):
     """
     Extract all possible data about the publication from the web page.
     @param etree - parsed DOM tree of the web page (has to be instance of
                    lxml.etree._ElementTree)
     @param url - url of the web page
     @return RRSPublication object containing extracted data
     """
     assert isinstance(url, basestring)
     assert isinstance(etree, _ElementTree)
     #c = Cleaner(scripts=True, javascript=True, comments=True, style=False,
     #            meta=False, page_structure=False, processing_instructions=True,
     #            embedded=True, frames=False, forms=True, annoying_tags=False,
     #            add_nofollow=False, remove_unknown_tags=False)
     #etree = c.clean_html(etree)
     self.url = url
     self.domain = re.sub("http://(www\.)?", "", self.url).split(".")[0]
     self._storage = {}
     self._publ = RRSPublication()
     cleaned_etree = SimpleHTMLCleaner.clean_html(etree)
     page = HTMLDocument(cleaned_etree, url)
     self.pagetext = page.get_etree().getroot().text_content()
     # parse CSS and metadata on the page
     page.parse_document()
     # get data from <meta> tags and convert to RRS format
     self._parse_meta(page)
     # get data on the basis of the text visibility and recognized headers
     self._parse_visibility(page)
     # and now guess :)
     self._find_unbound_entities(page)
     # and parse BibTeX
     self._parse_bibtex(page)
     return self._publ
Code Example #7
    def __init__(self, childcoef=7.0, headercoef=4.0, mintextlen=10, omitted_tags=('option', 'br', 'select', 'form')):
        self.sequences = {}
        self.childcoef = childcoef
        self.headercoef = headercoef
        self.mintextlen = mintextlen
        self.omitted_tags = omitted_tags

        self.records = []
        self.cleaner = SimpleHTMLCleaner()
Code Example #8
    def __init__(self, element, url, mintextlen=10):
        self.cleaner = SimpleHTMLCleaner()
        self.mintextlen = mintextlen
        self.elem = element
        self.url = url

        # the whole text
        self.text = self.elem.text_content()
        self.text = self.cleaner.clean(self.text)

        self.chunks = []
        self.__extract_chunks(self.elem)
Code Example #9
    def generalize(self, term):
        # preprocessing
        term = term.lower()
        term = re.sub("[\"\'0-9]+", "", term)
        term = re.sub("[_:\-\.\,]+", " ", term)
        term = SimpleHTMLCleaner.clean(term)

        # if it is a lemma, it's OK
        if self.is_lemma(term):
            return term
        # if it isn't a lemma, do a lookup
        try:
            index = self.term2lemma[term]
            return self.lemmas[index]
        except KeyError:
            return None
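
generalize() relies on two lookup structures that are built elsewhere in the module and are not shown in these examples; a hypothetical illustration of their shape (the concrete terms below are invented for the example) is:

# Hypothetical shape of the lookup tables used by generalize():
# `lemmas` holds the canonical header terms, `term2lemma` maps a normalized
# surface form to an index into `lemmas`.
lemmas = ["abstract", "keywords", "references"]
term2lemma = {"summary": 0, "key words": 1, "bibliography": 2}

# With these tables, generalize("Summary:") would normalize the input to
# "summary" and return "abstract"; an unknown term would return None.
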
Code Example #10
    def generalize(self, term):
        # preprocessing
        term = term.lower()
        term = re.sub("[\"\'0-9]+", "", term)
        term = re.sub("[_:\-\.\,]+", " ", term)
        term = SimpleHTMLCleaner.clean(term)

        # if it is a lemma, it's OK
        if self.is_lemma(term):
            return term
        # if it isn't a lemma, do a lookup
        try:
            index = self.term2lemma[term]
            return self.lemmas[index]
        except KeyError:
            return None
Code Example #11
 def _find_abstract(self, etree):
     c = Cleaner(scripts=True,
                 javascript=True,
                 comments=True,
                 style=True,
                 meta=True,
                 page_structure=False,
                 processing_instructions=True,
                 embedded=True,
                 frames=False,
                 forms=True,
                 annoying_tags=True,
                 add_nofollow=False,
                 remove_unknown_tags=False)
     etree_copy = deepcopy(etree)
     etree_copy = c.clean_html(etree_copy)
     html = tostring(etree_copy.getroot())
     # XXX it may be useful to delete all <p> tags...
     html = re.sub("</?p[^>]*>", " ", html)
     possible = []
     txts = re.findall("(?<=\>)[^>]+(?=\<)", html, re.U)
     for txt in txts:
         txt = SimpleHTMLCleaner.clean(txt)
         if len(txt) > 200:
             do_not_append = False
             for bl in self._abstract_blacklist:
                 if txt.startswith(bl):
                     do_not_append = True
                     break
             if not do_not_append:
                 possible.append(txt)
                 continue
         for st in self._abstract_startswith:
             if txt.startswith(st):
                 possible.append(txt)
                 break
     return self._get_longest_string(possible)
Code Example #12
class HTMLSequenceWrapper(object):
    """
    HTMLSequenceWrapper is an intelligent system for recognizing patterns and
    repeating sequences on web pages. The input of this algorithm is an element tree
    object (lxml.etree._ElementTree) and the output is an instance of ParsedHTMLDocument.

    The sequence wrapper parses the element tree to find the most valuable repeating
    sequence, which is assumed to be a data record. It also detects regions.
    """

    # list of important terms to get menu.
    _menu = ('[CK]onta[ck]t', 'Publi[ck]', 'Blog', 'Links', 'About', 'Home', 'News?', \
             'Event', 'Research', 'Index', 'FAQ', 'People', 'Overview', 'Profile', \
             'Community', 'Download')

    # this list probably shouldn't be here, but in some higher class that uses
    # HTMLSequenceWrapper to get page structure and semantics
    _semantic_tags = {
        'dfn': 'Definition Term',  # <dfn>
        'address': 'Address',  # <address>
        'em': 'Emphasis',  # <em>
        'strong': 'Strong Text',  # <strong>
        'ins': 'Inserted',  # <ins>
        'del': 'Delete',  # <del>
        'cite': 'Citation',  # <cite>
        'code': 'Computer code text',  # <code>
        'samp': 'Sample computer code text',  # <samp>
        'kbd': 'Keyboard text',  # <kbd>
        'var': 'Variable'
    }  # <var>

    def __init__(self,
                 childcoef=7.0,
                 headercoef=4.0,
                 mintextlen=10,
                 omitted_tags=('option', 'br', 'select', 'form')):
        self.sequences = {}
        self.childcoef = childcoef
        self.headercoef = headercoef
        self.mintextlen = mintextlen
        self.omitted_tags = omitted_tags

        self.records = []
        self.cleaner = SimpleHTMLCleaner()

    def _append(self, elem, depth):
        if str(elem.tag) == '<built-in function Comment>': return
        key = elem.tag + "_" + str(depth)
        if not key in self.sequences:
            self.sequences[key] = [elem]
        else:
            self.sequences[key].append(elem)

    def _recurse(self, elem, depth):
        self._append(elem, depth)
        for child in elem.iterchildren():
            self._recurse(child, depth + 1)

    def _get_most_freq(self, seqdict, position=1):
        reversed_entries = {}
        for k in seqdict:
            reversed_entries[len(seqdict[k])] = seqdict[k]
        ordered = sorted(reversed_entries.keys(), reverse=True)
        # FILTERING TAGS
        # filter non-usable tags like <option>, <br> or <form>
        for i in range(len(ordered)):
            mf = reversed_entries[ordered[(position - 1) + i]]
            if mf[0].tag not in self.omitted_tags:
                break
        return mf

    def _find_nearest_parent(self, elems):
        parents = {}
        for elem in elems:
            parent = elem.getparent()
            if parent == None: continue
            if parent.tag not in parents:
                parents[parent.tag] = [parent]
            else:
                if not parent in parents[parent.tag]:
                    parents[parent.tag].append(parent)
        mf = self._get_most_freq(parents)
        #del parents
        return mf

    def _isbodyelem(self, elem):
        return elem.tag != None and elem.tag == 'body'

    def _sift(self, elems):
        sift = True
        while sift:
            parents = self._find_nearest_parent(elems)
            if self._isbodyelem(parents[0]): break
            sift = len(elems) < self.childcoef * len(parents)
            if sift: elems = parents
        self.sifted_first = elems[0]
        # improve speed by converting list to set
        try:
            return set(elems)
        except MemoryError:
            return elems

    def _find_regions(self):
        # delete previously found data
        self.regions = []
        area = HTMLSequenceWrapperRegion()
        for elem in self.elemtree.getroot().iterdescendants():
            _style = elem.style
            if _style is None:
                _style = CSSStyle()
            # we consider it to be a header if visibility >= self.headercoef
            if _style.get_visibility() >= self.headercoef:
                if not area.is_empty():
                    self.regions.append(area)
                    area = HTMLSequenceWrapperRegion()
                area.set_name(self.cleaner.clean(elem.text))
                area.set_header_style(_style)
            if elem in self.found_entries:
                rec = HTMLSequenceWrapperRecord(elem, self.url,
                                                self.mintextlen)
                if not rec.has_value(): continue
                area.add_record(rec)
        if not area.is_empty():
            self.regions.append(area)

    def _find_menu(self, elemtree):
        _anchors = self.elemtree.findall('.//a[@href]')
        menuanchors = []
        for a in _anchors:
            if a.text != None:
                for menuitem in HTMLSequenceWrapper._menu:
                    if re.search(menuitem, a.text, re.I):
                        menuanchors.append(a)
                        break
        if not menuanchors: return
        # sift the menu with a different child coefficient
        coef = self.childcoef
        self.childcoef = 3.0
        _menuitems = self._sift(menuanchors)
        self.childcoef = coef
        # get closest parent for all navigation items
        menu_reg = self._find_nearest_parent(_menuitems)
        _links = menu_reg[0].findall('.//a[@href]')
        for tag in _links:
            if tag == None: continue
            text = self.cleaner.clean(tag.text)
            if text == None: continue
            # bad heuristics, isn't it?
            if len(text) > 50:
                self.menu = {}
                return
            self.doc.add_menu_item(text, tag.get('href'))

    #---------------------------------------------------------------------------
    ## checking unbalanced - wrap_h() methods
    #---------------------------------------------------------------------------
    def _unbalanced_chunk_to_record_ratio(self):
        chunks, records = 0, 0
        for reg in self.regions:
            for rec in reg._manual_process_page():
                records += 1
                chunks += len(rec.get_chunks())
        try:
            return float(chunks) / float(records) < self.unbalanced_chunk_ratio
        except ZeroDivisionError:
            return True

    def _unbalanced_record_to_region_ratio(self):
        try:
            return (float(sum([len(reg._manual_process_page()) for reg in self.regions])) / \
                   float(len(self.regions))) < self.unbalanced_record_ratio
        except ZeroDivisionError:
            return True

    def _high_variablilty_of_chunk_count(self):
        chunks = []
        for reg in self.regions:
            for rec in reg._manual_process_page():
                if sum(chunks) == 0 or len(chunks) == 0 or \
                   len(rec.get_chunks()) > 3*(float(sum(chunks))/float(len(chunks))):
                    chunks.append(len(rec.get_chunks()))
        aver = float(sum(chunks)) / float(len(chunks))
        base = 0
        for x in chunks:
            base += (x - aver)**2
        base /= len(chunks)
        return base > aver

    #---------------------------------------------------------------------------
    # Public methods
    #---------------------------------------------------------------------------

    def wrap_h(self, elemtree, url):
        """
        Heuristic version of the wrap() method. Warning: this method does not produce
        a 100% correct result! It also runs longer than wrap() because it repeats
        the parsing sequences.

        TODO: consider clustering methods.
        """
        if not isinstance(elemtree, etree._ElementTree):
            raise TypeError(
                "ElementTree has to be type lxml.etree._ElementTree")
        self.url = url
        self.doc = ParsedHTMLDocument(elemtree, url)
        # parse html document, css on page and in extern *.css files
        # this also makes all links absolute
        self.doc.parse_document()
        # store element tree
        self.elemtree = self.doc.get_etree()

        # recurse over tree
        self._recurse(self.elemtree.getroot(), 1)
        # get most frequented tag
        mf = self._get_most_freq(self.sequences)

        # learn
        satisfying_result_found = False
        # setting up average values of coefficients
        self.childcoef = 7.0
        self.headercoef = 4.0
        self.mintextlen = 40
        self.unbalanced_chunk_ratio = 2.0
        self.unbalanced_record_ratio = 3.0
        iterations = 0
        while not satisfying_result_found:
            iterations += 1
            if iterations > 100: break
            # push it up to get parent tags, they could be record-keepers
            self.found_entries = self._sift(mf)

            # find data regions
            self._find_regions()
            # if we found only one region with one record, it's probably a mistake
            # so we have to decrease childcoef
            if len(self.regions) == 1 and len(self.regions[0].records) == 1:
                self.childcoef -= 1.5
                self.headercoef -= 1.0
            elif self._unbalanced_chunk_to_record_ratio():
                self.childcoef += 2.0
                self.headercoef += 0.5
                self.unbalanced_chunk_ratio -= 0.2
            elif self._unbalanced_record_to_region_ratio():
                self.headercoef += 1.0
                self.childcoef -= 0.5
                self.unbalanced_record_ratio -= 0.4
            elif self._high_variablilty_of_chunk_count():
                self.childcoef += 1.0
                self.mintextlen += 10
            else:
                satisfying_result_found = True
        # find navigation on page
        self._find_menu(self.elemtree)

        for reg in self.regions:
            self.doc.add_region(reg)
        #remember last url
        self.last_url = url
        # return parsed document
        return self.doc

    def wrap(self, elemtree, url):
        """
        Main method. Parses html page and searches for repeated sequences in
        element tree. Returns instance of HTMLDocument.
        """
        if not isinstance(elemtree, etree._ElementTree):
            raise TypeError(
                "ElementTree has to be type lxml.etree._ElementTree")
        self.url = url
        self.doc = ParsedHTMLDocument(elemtree, url)
        # parse html document, css on page and in extern *.css files
        # this also makes all links absolute
        self.doc.parse_document()
        # store element tree
        self.elemtree = self.doc.get_etree()
        # recurse over tree
        self._recurse(self.elemtree.getroot(), 1)
        # get most frequented tag
        mf = self._get_most_freq(self.sequences)
        # push it up to get parent tags, they could be record-keepers
        self.found_entries = self._sift(mf)
        # find data regions
        self._find_regions()
        # find navigation on page
        self._find_menu(self.elemtree)

        for reg in self.regions:
            self.doc.add_region(reg)

        try:
            self.doc.set_name(self.elemtree.find('.//title').text)
        except AttributeError:
            pass

        #remember last url
        self.last_url = url

        # return parsed document
        return self.doc

    def _make_xml(self):
        """
        Constructs xml tree containing result of wrapping.
        """
        self.xmldocument = etree.Element("document")
        self.xmldocument.set("base", str(self.doc.get_url()))
        self.xmldocument.set("title", unicode(self.doc.get_name()))

        # add menu if available
        self.xmlmenu = etree.SubElement(self.xmldocument, "menu")
        navigation = self.doc.get_menu()
        for menuitem in navigation:
            menuitemxml = etree.SubElement(self.xmlmenu, "menuitem")
            menuitemxml.text = unicode(menuitem)
            menuitemxml.set("link", unicode(str(navigation[menuitem])))

        # add data regions
        for reg in self.regions:
            self.xmlsequence = etree.SubElement(self.xmldocument,
                                                "sequence-area")
            header = etree.SubElement(self.xmlsequence, "header")
            if reg.get_header_style() != None:
                header.set(
                    "visibility",
                    unicode(str(reg.get_header_style().get_visibility())))
            header.text = unicode(reg.get_name())

            # add records of the region
            for r in reg._manual_process_page():
                item = etree.SubElement(self.xmlsequence, "entry")
                textxml = etree.SubElement(item, "text")
                textxml.text = unicode(r.get_text())
                chunksxml = etree.SubElement(item, "chunks")

                # add chunks
                for chunk in r.get_chunks():
                    chxml = etree.SubElement(chunksxml, "chunk")
                    chxml.text = unicode(chunk.get_text())
                    # show visibility
                    if chunk.get_style() != None:
                        chxml.set(
                            "visibility",
                            unicode(str(chunk.get_style().get_visibility())))
                    if chunk.get_link() != None:
                        chxml.set("link", unicode(chunk.get_link()))
                    # handle tag
                    if chunk.get_tag() != None:
                        tg = chunk.get_tag()
                        if tg in HTMLSequenceWrapper._semantic_tags:
                            tg = HTMLSequenceWrapper._semantic_tags[tg]
                            chxml.set("logical", unicode(str(tg)))
                    # handle comments
                    if chunk.get_comment() != None:
                        try:
                            chxml.set(
                                "comment",
                                unicode(str(chunk.get_comment()),
                                        encoding='utf-8'))
                        except UnicodeEncodeError:
                            try:
                                chxml.set(
                                    "comment",
                                    unicode(chunk.get_comment(),
                                            encoding='utf-8'))
                            except TypeError:
                                chxml.set("comment", chunk.get_comment())

    def get_xml(self):
        """
        Returns xml of result in string format.
        """
        self._make_xml()
        # return the whole xml tree in string format
        return etree.tostring(self.xmldocument,
                              xml_declaration=True,
                              pretty_print=True,
                              encoding='utf-8')

    def get_etree(self):
        """
        Returns xml in lxml.etree.ElementTree object.
        """
        self._make_xml()
        return self.xmldocument
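
Taken together, a typical use of the wrapper (assuming HTMLSequenceWrapper and its ParsedHTMLDocument dependencies are importable from this module) would run along these lines; only the wrap() and get_xml() signatures shown above are relied on:

from lxml import html

# Hedged usage sketch; the input file name and URL are placeholders.
tree = html.parse("page.html")           # lxml.etree._ElementTree, as wrap() asserts
wrapper = HTMLSequenceWrapper(childcoef=7.0, headercoef=4.0, mintextlen=10)
doc = wrapper.wrap(tree, "http://example.com/page.html")   # ParsedHTMLDocument
print(wrapper.get_xml())                 # serialized <document> tree with regions and entries
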
Code Example #13
    def _parse_visibility(self, document):
        vis_map = self._get_visibility2elem_map(document.get_etree())
        if len(vis_map) < 2: return
        sorted_vis = sorted(vis_map.keys(), reverse=True)
        if len(sorted_vis) < 2: return
        to_be_processed = None
        while 42: #:)
            to_be_processed = []
            for i in xrange(0, len(sorted_vis)):
                if sorted_vis[i] < self.headercoef: continue
                to_be_processed.extend(vis_map[sorted_vis[i]])
            if len(to_be_processed) < 2:
                self.headercoef -= 0.5
            else: break
        # storage for possible titles
        possible_titles = ListedDict()
        # loop over all headers (elements containing very visible texts)
        for elem in to_be_processed:
            # get cleaned text content of the tag
            txt = SimpleHTMLCleaner.clean( elem.text_content() )
            # generalize: maybe it is something useful
            hdrtext = self.generalizer.generalize(txt)
            # generalization found the header being TITLE -> data are below the header
            if hdrtext is not None:
                # found some useful header, try to get data below
                # what is below? probably sibling tags and their descendants
                self._get_data_below_header(elem, hdrtext, to_be_processed)
            # generalization wasn't successful -> maybe the header contains data
            else:
                # date?
                d = self.ee.find_published_date(txt)
                if d[0]:
                    rrsdate = d[0][0]
                    for attr in ('year', 'month'):
                        if rrsdate.get(attr) is not None:
                            self._publ[attr] = rrsdate.get(attr)
                    txt = d[1]
                # maybe title
                if len(txt.split(" ")) > 3: # probably more than three words
                    # is there a domain name in the title? So it is probably
                    # general name of the website
                    if len(self.domain) > 6 and re.search(re.escape(self.domain), txt, re.I):
                        continue

                    # preprocessing - remove standalone brackets
                    txt = re.sub("[\(\[][^\)\]]*[\)\]]+", "", txt).strip()
                    if document.name is not None and re.search(re.escape(txt), document.name, re.I):
                        possible_titles[int(self._classify_publ_title(txt, init=100))] = txt
                    elif len(txt.split(" ")) > 5:
                        possible_titles[int(self._classify_publ_title(txt, init=60))] = txt
        if possible_titles:
            titles = possible_titles[max(possible_titles)]
            if len(titles) > 1:
                title = self._get_longest_string(titles)
            else:
                title = titles[0]
            self._publ['title'] = title
            self._publ['credibility'] = max(possible_titles)
        else:
            self._publ['credibility'] = 0
        # store all new properties and their values
        for property in self._storage:
            self._add_property(property, self._storage[property])
Code Example #14
class HTMLSequenceWrapper(object):
    """
    HTMLSequenceWrapper is an intelligent system for recognizing patterns and
    repeating sequences on web pages. The input of this algorithm is an element tree
    object (lxml.etree._ElementTree) and the output is an instance of ParsedHTMLDocument.

    The sequence wrapper parses the element tree to find the most valuable repeating
    sequence, which is assumed to be a data record. It also detects regions.
    """

    # list of important terms to get menu.
    _menu = ('[CK]onta[ck]t', 'Publi[ck]', 'Blog', 'Links', 'About', 'Home', 'News?', \
             'Event', 'Research', 'Index', 'FAQ', 'People', 'Overview', 'Profile', \
             'Community', 'Download')

    # this list probably shouldn't be here, but in some higher class that uses
    # HTMLSequenceWrapper to get page structure and semantics
    _semantic_tags = {'dfn': 'Definition Term',            # <dfn>
                      'address': 'Address',                # <address>
                      'em': 'Emphasis',                    # <em>
                      'strong': 'Strong Text',             # <strong>
                      'ins': 'Inserted',                   # <ins>
                      'del': 'Delete',                     # <del>
                      'cite': 'Citation',                  # <cite>
                      'code': 'Computer code text',        # <code>
                      'samp': 'Sample computer code text', # <samp>
                      'kbd': 'Keyboard text',              # <kbd>
                      'var' : 'Variable'}                  # <var>

    def __init__(self, childcoef=7.0, headercoef=4.0, mintextlen=10, omitted_tags=('option', 'br', 'select', 'form')):
        self.sequences = {}
        self.childcoef = childcoef
        self.headercoef = headercoef
        self.mintextlen = mintextlen
        self.omitted_tags = omitted_tags

        self.records = []
        self.cleaner = SimpleHTMLCleaner()


    def _append(self, elem, depth):
        if str(elem.tag) == '<built-in function Comment>': return
        key = elem.tag + "_" + str(depth)
        if not key in self.sequences:
            self.sequences[key] = [elem]
        else:
            self.sequences[key].append(elem)


    def _recurse(self, elem, depth):
        self._append(elem, depth)
        for child in elem.iterchildren():
            self._recurse(child, depth+1)


    def _get_most_freq(self, seqdict, position=1):
        reversed_entries = {}
        for k in seqdict:
            reversed_entries[len(seqdict[k])] = seqdict[k]
        ordered = sorted(reversed_entries.keys(), reverse=True)
        # FILTERING TAGS
        # filter non-usable tags like <option>, <br> or <form>
        for i in range(len(ordered)):
            mf = reversed_entries[ordered[(position-1)+i]]
            if mf[0].tag not in self.omitted_tags:
                break
        return mf


    def _find_nearest_parent(self, elems):
        parents = {}
        for elem in elems:
            parent = elem.getparent()
            if parent == None: continue
            if parent.tag not in parents:
                parents[parent.tag] = [parent]
            else:
                if not parent in parents[parent.tag]:
                    parents[parent.tag].append(parent)
        mf = self._get_most_freq(parents)
        #del parents
        return mf


    def _isbodyelem(self, elem):
        return elem.tag != None and elem.tag == 'body'


    def _sift(self, elems):
        sift = True
        while sift:
            parents = self._find_nearest_parent(elems)
            if self._isbodyelem(parents[0]): break
            sift = len(elems) < self.childcoef * len(parents)
            if sift: elems = parents
        self.sifted_first = elems[0]
        # improve speed by converting list to set
        try:
            return set(elems)
        except MemoryError:
            return elems


    def _find_regions(self):
        # delete previously found data
        self.regions = []
        area = HTMLSequenceWrapperRegion()
        for elem in self.elemtree.getroot().iterdescendants():
            _style = elem.style
            if _style is None:
                _style = CSSStyle()
            # we consider it to be a header if visibility >= self.headercoef
            if _style.get_visibility() >= self.headercoef:
                if not area.is_empty():
                    self.regions.append(area)
                    area = HTMLSequenceWrapperRegion()
                area.set_name(self.cleaner.clean(elem.text))
                area.set_header_style(_style)
            if elem in self.found_entries:
                rec = HTMLSequenceWrapperRecord(elem, self.url, self.mintextlen)
                if not rec.has_value(): continue
                area.add_record(rec)
        if not area.is_empty():
            self.regions.append(area)


    def _find_menu(self, elemtree):
        _anchors = self.elemtree.findall('.//a[@href]')
        menuanchors = []
        for a in _anchors:
            if a.text != None:
                for menuitem in HTMLSequenceWrapper._menu:
                    if re.search(menuitem, a.text, re.I):
                        menuanchors.append(a)
                        break
        if not menuanchors: return
        # sift the menu with a different child coefficient
        coef = self.childcoef
        self.childcoef = 3.0
        _menuitems = self._sift(menuanchors)
        self.childcoef = coef
        # get closest parent for all navigation items
        menu_reg = self._find_nearest_parent(_menuitems)
        _links = menu_reg[0].findall('.//a[@href]')
        for tag in _links:
            if tag == None: continue
            text = self.cleaner.clean(tag.text)
            if text == None: continue
            # bad heuristics, isn't it?
            if len(text) > 50:
                self.menu = {}
                return
            self.doc.add_menu_item(text, tag.get('href'))


    #---------------------------------------------------------------------------
    ## checking unbalanced - wrap_h() methods
    #---------------------------------------------------------------------------
    def _unbalanced_chunk_to_record_ratio(self):
        chunks, records = 0, 0
        for reg in self.regions:
            for rec in reg._manual_process_page():
                records += 1
                chunks += len(rec.get_chunks())
        try:
            return float(chunks)/float(records) < self.unbalanced_chunk_ratio
        except ZeroDivisionError:
            return True


    def _unbalanced_record_to_region_ratio(self):
        try:
            return (float(sum([len(reg._manual_process_page()) for reg in self.regions])) / \
                   float(len(self.regions))) < self.unbalanced_record_ratio
        except ZeroDivisionError:
            return True


    def _high_variablilty_of_chunk_count(self):
        chunks = []
        for reg in self.regions:
            for rec in reg._manual_process_page():
                if sum(chunks) == 0 or len(chunks) == 0 or \
                   len(rec.get_chunks()) > 3*(float(sum(chunks))/float(len(chunks))):
                    chunks.append( len(rec.get_chunks()) )
        aver = float(sum(chunks))/float(len(chunks))
        base = 0
        for x in chunks:
            base += (x - aver)**2
        base /= len(chunks)
        return base > aver


    #---------------------------------------------------------------------------
    # Public methods
    #---------------------------------------------------------------------------

    def wrap_h(self, elemtree, url):
        """
        Heuristic version of the wrap() method. Warning: this method does not produce
        a 100% correct result! It also runs longer than wrap() because it repeats
        the parsing sequences.

        TODO: consider clustering methods.
        """
        if not isinstance(elemtree, etree._ElementTree):
            raise TypeError("ElementTree has to be type lxml.etree._ElementTree")
        self.url = url
        self.doc = ParsedHTMLDocument(elemtree, url)
        # parse html document, css on page and in extern *.css files
        # this also makes all links absolute
        self.doc.parse_document()
        # store element tree
        self.elemtree = self.doc.get_etree()

        # recurse over tree
        self._recurse(self.elemtree.getroot(), 1)
        # get most frequented tag
        mf = self._get_most_freq(self.sequences)

        # learn
        satisfying_result_found = False
        # setting up average values of coefficients
        self.childcoef = 7.0
        self.headercoef = 4.0
        self.mintextlen = 40
        self.unbalanced_chunk_ratio = 2.0
        self.unbalanced_record_ratio = 3.0
        iterations = 0
        while not satisfying_result_found:
            iterations += 1
            if iterations > 100: break
            # push it up to get parent tags, they could be record-keepers
            self.found_entries = self._sift(mf)

            # find data regions
            self._find_regions()
            # if we found only one region with one record, it's probably a mistake
            # so we have to decrease childcoef
            if len(self.regions) == 1 and len(self.regions[0].records) == 1:
                self.childcoef -= 1.5
                self.headercoef -= 1.0
            elif self._unbalanced_chunk_to_record_ratio():
                self.childcoef += 2.0
                self.headercoef += 0.5
                self.unbalanced_chunk_ratio -= 0.2
            elif self._unbalanced_record_to_region_ratio():
                self.headercoef += 1.0
                self.childcoef -= 0.5
                self.unbalanced_record_ratio -= 0.4
            elif self._high_variablilty_of_chunk_count():
                self.childcoef += 1.0
                self.mintextlen += 10
            else:
                satisfying_result_found = True
        # find navigation on page
        self._find_menu(self.elemtree)

        for reg in self.regions:
            self.doc.add_region(reg)
        #remember last url
        self.last_url = url
        # return parsed document
        return self.doc


    def wrap(self, elemtree, url):
        """
        Main method. Parses html page and searches for repeated sequences in
        element tree. Returns instance of HTMLDocument.
        """
        if not isinstance(elemtree, etree._ElementTree):
            raise TypeError("ElementTree has to be type lxml.etree._ElementTree")
        self.url = url
        self.doc = ParsedHTMLDocument(elemtree, url)
        # parse html document, css on page and in extern *.css files
        # this also makes all links absolute
        self.doc.parse_document()
        # store element tree
        self.elemtree = self.doc.get_etree()
        # recurse over tree
        self._recurse(self.elemtree.getroot(), 1)
        # get most frequented tag
        mf = self._get_most_freq(self.sequences)
        # push it up to get parent tags, they could be record-keepers
        self.found_entries = self._sift(mf)
        # find data regions
        self._find_regions()
        # find navigation on page
        self._find_menu(self.elemtree)

        for reg in self.regions:
            self.doc.add_region(reg)

        try:
          self.doc.set_name(self.elemtree.find('.//title').text)
        except AttributeError:
          pass

        #remember last url
        self.last_url = url

        # return parsed document
        return self.doc


    def _make_xml(self):
        """
        Constructs xml tree containing result of wrapping.
        """
        self.xmldocument = etree.Element("document")
        self.xmldocument.set("base", str(self.doc.get_url()))
        self.xmldocument.set("title", unicode(self.doc.get_name()))

        # add menu if available
        self.xmlmenu = etree.SubElement(self.xmldocument, "menu")
        navigation = self.doc.get_menu()
        for menuitem in navigation:
            menuitemxml = etree.SubElement(self.xmlmenu, "menuitem")
            menuitemxml.text = unicode(menuitem)
            menuitemxml.set("link", unicode(str(navigation[menuitem])))

        # add data regions
        for reg in self.regions:
            self.xmlsequence = etree.SubElement(self.xmldocument, "sequence-area")
            header = etree.SubElement(self.xmlsequence, "header")
            if reg.get_header_style() != None:
                header.set("visibility", unicode(str(reg.get_header_style().get_visibility())))
            header.text = unicode(reg.get_name())

            # add records of the region
            for r in reg._manual_process_page():
                item = etree.SubElement(self.xmlsequence, "entry")
                textxml = etree.SubElement(item, "text")
                textxml.text = unicode(r.get_text())
                chunksxml = etree.SubElement(item, "chunks")

                # add chunks
                for chunk in r.get_chunks():
                    chxml = etree.SubElement(chunksxml, "chunk")
                    chxml.text = unicode(chunk.get_text())
                    # show visibility
                    if chunk.get_style() != None:
                        chxml.set("visibility", unicode(str(chunk.get_style().get_visibility())))
                    if chunk.get_link() != None:
                        chxml.set("link", unicode(chunk.get_link()))
                    # handle tag
                    if chunk.get_tag() != None:
                        tg = chunk.get_tag()
                        if tg in HTMLSequenceWrapper._semantic_tags:
                            tg = HTMLSequenceWrapper._semantic_tags[tg]
                            chxml.set("logical", unicode(str(tg)))
                    # handle comments
                    if chunk.get_comment() != None:
                        try:
                            chxml.set("comment", unicode(str(chunk.get_comment()), encoding='utf-8'))
                        except UnicodeEncodeError:
                            try:
                                chxml.set("comment", unicode(chunk.get_comment(), encoding='utf-8'))
                            except TypeError:
                                chxml.set("comment", chunk.get_comment())



    def get_xml(self):
        """
        Returns xml of result in string format.
        """
        self._make_xml()
        # return the whole xml tree in string format
        return etree.tostring(self.xmldocument,
                              xml_declaration=True,
                              pretty_print=True,
                              encoding='utf-8')

    def get_etree(self):
        """
        Returns xml in lxml.etree.ElementTree object.
        """
        self._make_xml()
        return self.xmldocument
Code Example #15
class HTMLSequenceWrapperRecord(object):
    def __init__(self, element, url, mintextlen=10):
        self.cleaner = SimpleHTMLCleaner()
        self.mintextlen = mintextlen
        self.elem = element
        self.url = url

        # the whole text
        self.text = self.elem.text_content()
        self.text = self.cleaner.clean(self.text)

        self.chunks = []
        self.__extract_chunks(self.elem)


    def has_value(self):
        if self.cleaner.contains_text(self.text) == False:
            return False
        return len(self.text) > self.mintextlen


    def get_chunks(self):
        return self.chunks


    def get_text(self):
        return self.text


    def _handle_elem(self, elem):
        if elem.text == None: return None
        if not self.cleaner.contains_text(elem): return None
        # new chunk
        chunk = TextChunk()

        ## extracting links
        if elem.get('href') != None:
            chunk.set_link(elem.get('href'))
        # extracting 'title' attribute from the anchor
        if elem.tag == 'a' and elem.get('title') != None:
            chunk.set_comment(elem.get('title'))

        # extracting text
        txt = elem.text_content()
        chunk.set_text(self.cleaner.clean(txt))

        # setting style
        fs = elem.style
        chunk.set_style(fs)
        chunk.set_tag(elem.tag)
        return chunk


    def __extract_chunks(self, elem):
        thischunk = self._handle_elem(elem)
        if thischunk != None:
            self.chunks.append(thischunk)
        for child in elem.iterchildren():
            self.__extract_chunks(child)


    def __str__(self):
        return "<"+__modulename__+".HTMLSequenceWrapperRecord instance " + self.text + " >"
Code Example #16
    def _get_data_below_header(self, elem, hdrtext, to_be_processed):
        # Try to iterate over siblings of the header element and get text
        siblings = [sib.tag for sib in elem.itersiblings()]
        # the header is abstract
        if hdrtext == 'abstract':
            txts = {}
            paragraphs = []
            par_stop = False
            for sib in elem.itersiblings():
                content = sib.text_content()
                if sib in to_be_processed:
                    par_stop = True
                if sib.tag == 'p' and len(content) > 50 and not par_stop:
                    paragraphs.append(content)
                chunk = content[0:20].lower()
                score = 1.0
                for st in self._abstract_startswith:
                    if chunk.startswith(st): score*=5.0
                score *= len(content)
                txts[score] = SimpleHTMLCleaner.clean(content)
            if paragraphs:
                self._storage[hdrtext] = [SimpleHTMLCleaner.clean(" ".join(paragraphs))]
            else:
                self._storage[hdrtext] = [ txts[max(txts.keys())] ]

        # related publications
        elif hdrtext == 'related':
            list_tags = ('ul', 'ol', 'dl')
            return # TODO
            for ltag in list_tags:
                if ltag in siblings:
                    for sib in elem.itersiblings(): pass

        # keywords
        elif hdrtext == 'keywords':
            # create function returning elements containing possible keywords
            is_keyword = lambda kw: re.search("^(([a-z]{3,}( |,)){1,3} ?)+([a-z]{3,} ?){1,3}$", \
                                    kw.text_content(), re.I) \
                                    and not re.search("[@#\$%\^&\*\(\)]", kw.text_content())
            # iterate over siblings of the header and try to get keywords from its children
            likelihood_to_keyword_tags = ListedDict()
            for s in elem.itersiblings():
                (kw_elems, likelihood) = self._find_local_sequence(s, is_keyword)
                if kw_elems is None: continue
                likelihood_to_keyword_tags[likelihood] = kw_elems
            if not likelihood_to_keyword_tags: return
            # if found some keywords, store them
            self._storage[hdrtext] = [kw.text_content() for kw in likelihood_to_keyword_tags[max(likelihood_to_keyword_tags.keys())][0]]

        # references
        elif hdrtext == 'references':
            pass # TODO

        # chapters ??
        elif hdrtext == 'chapters':
            pass # TODO

        # reviews?
        elif hdrtext == 'reviews':
            if hdrtext in self._storage: return
            # create function returning elements containing possible reviews
            is_review = lambda r: (len(r.text_content()) > 100) or r.tag == 'blockquote'
            probability = ListedDict()
            # iterate over siblings of the header and try to get reviews from its children
            for s in elem.itersiblings():
                (elems, prob) = self._find_local_sequence(s, is_review)
                if elems is None: continue
                probability[prob] = elems
            review_texts = []
            if not probability: return
            for e in probability[max(probability.keys())][0]:
                review_texts.append(SimpleHTMLCleaner.clean(e.text_content()))
                # set all the elements as "processed" to avoid further processing
                for d in e.iter():
                    d.processed = True
            self._storage[hdrtext] = review_texts
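
The is_keyword filter above is just a pair of regular expressions. Run directly on strings instead of lxml elements, it behaves like this (a small illustrative check, not part of the original module):

import re

# Standalone version of the is_keyword test from _get_data_below_header,
# operating on plain strings instead of element.text_content().
is_keyword = lambda text: re.search(r"^(([a-z]{3,}( |,)){1,3} ?)+([a-z]{3,} ?){1,3}$", text, re.I) \
                          and not re.search(r"[@#\$%\^&\*\(\)]", text)

print(bool(is_keyword("information extraction, web mining")))  # True: short comma-separated phrases
print(bool(is_keyword("contact: john@example.com")))           # False: punctuation rules it out
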
Code Example #17
    def _parse_visibility(self, document):
        vis_map = self._get_visibility2elem_map(document.get_etree())
        if len(vis_map) < 2: return
        sorted_vis = sorted(vis_map.keys(), reverse=True)
        if len(sorted_vis) < 2: return
        to_be_processed = None
        while 42:  #:)
            to_be_processed = []
            for i in xrange(0, len(sorted_vis)):
                if sorted_vis[i] < self.headercoef: continue
                to_be_processed.extend(vis_map[sorted_vis[i]])
            if len(to_be_processed) < 2:
                self.headercoef -= 0.5
            else:
                break
        # storage for possible titles
        possible_titles = ListedDict()
        # loop over all headers (elements containing very visible texts)
        for elem in to_be_processed:
            # get cleaned text content of the tag
            txt = SimpleHTMLCleaner.clean(elem.text_content())
            # generalize: maybe it is something useful
            hdrtext = self.generalizer.generalize(txt)
            # generalization found the header being TITLE -> data are below the header
            if hdrtext is not None:
                # found some useful header, try to get data below
                # what is below? probably sibling tags and their descendants
                self._get_data_below_header(elem, hdrtext, to_be_processed)
            # generalization wasn't successful -> maybe the header contains data
            else:
                # date?
                d = self.ee.find_published_date(txt)
                if d[0]:
                    rrsdate = d[0][0]
                    for attr in ('year', 'month'):
                        if rrsdate.get(attr) is not None:
                            self._publ[attr] = rrsdate.get(attr)
                    txt = d[1]
                # maybe title
                if len(txt.split(" ")) > 3:  # probably more than three words
                    # is there a domain name in the title? So it is probably
                    # general name of the website
                    if len(self.domain) > 6 and re.search(
                            re.escape(self.domain), txt, re.I):
                        continue

                    # preprocessing - remove standalone brackets
                    txt = re.sub("[\(\[][^\)\]]*[\)\]]+", "", txt).strip()
                    if document.name is not None and re.search(
                            re.escape(txt), document.name, re.I):
                        possible_titles[int(
                            self._classify_publ_title(txt, init=100))] = txt
                    elif len(txt.split(" ")) > 5:
                        possible_titles[int(
                            self._classify_publ_title(txt, init=60))] = txt
        if possible_titles:
            titles = possible_titles[max(possible_titles)]
            if len(titles) > 1:
                title = self._get_longest_string(titles)
            else:
                title = titles[0]
            self._publ['title'] = title
            self._publ['credibility'] = max(possible_titles)
        else:
            self._publ['credibility'] = 0
        # store all new properties and their values
        for property in self._storage:
            self._add_property(property, self._storage[property])
Code Example #18
    def _get_data_below_header(self, elem, hdrtext, to_be_processed):
        # Try to iterate over siblings of the header element and get text
        siblings = [sib.tag for sib in elem.itersiblings()]
        # the header is abstract
        if hdrtext == 'abstract':
            txts = {}
            paragraphs = []
            par_stop = False
            for sib in elem.itersiblings():
                content = sib.text_content()
                if sib in to_be_processed:
                    par_stop = True
                if sib.tag == 'p' and len(content) > 50 and not par_stop:
                    paragraphs.append(content)
                chunk = content[0:20].lower()
                score = 1.0
                for st in self._abstract_startswith:
                    if chunk.startswith(st): score *= 5.0
                score *= len(content)
                txts[score] = SimpleHTMLCleaner.clean(content)
            if paragraphs:
                self._storage[hdrtext] = [
                    SimpleHTMLCleaner.clean(" ".join(paragraphs))
                ]
            else:
                self._storage[hdrtext] = [txts[max(txts.keys())]]

        # related publications
        elif hdrtext == 'related':
            list_tags = ('ul', 'ol', 'dl')
            return  # TODO
            for ltag in list_tags:
                if ltag in siblings:
                    for sib in elem.itersiblings():
                        pass

        # keywords
        elif hdrtext == 'keywords':
            # create function returning elements containing possible keywords
            is_keyword = lambda kw: re.search("^(([a-z]{3,}( |,)){1,3} ?)+([a-z]{3,} ?){1,3}$", \
                                    kw.text_content(), re.I) \
                                    and not re.search("[@#\$%\^&\*\(\)]", kw.text_content())
            # iterate over siblings of the header and try to get keywords from its children
            likelihood_to_keyword_tags = ListedDict()
            for s in elem.itersiblings():
                (kw_elems,
                 likelihood) = self._find_local_sequence(s, is_keyword)
                if kw_elems is None: continue
                likelihood_to_keyword_tags[likelihood] = kw_elems
            if not likelihood_to_keyword_tags: return
            # if found some keywords, store them
            self._storage[hdrtext] = [
                kw.text_content() for kw in likelihood_to_keyword_tags[max(
                    likelihood_to_keyword_tags.keys())][0]
            ]

        # references
        elif hdrtext == 'references':
            pass  # TODO

        # chapters ??
        elif hdrtext == 'chapters':
            pass  # TODO

        # reviews?
        elif hdrtext == 'reviews':
            if hdrtext in self._storage: return
            # create function returning elements containing possible reviews
            is_review = lambda r: (len(r.text_content()) > 100
                                   ) or r.tag == 'blockquote'
            probability = ListedDict()
            # iterate over siblings of the header and try to get reviews from its children
            for s in elem.itersiblings():
                (elems, prob) = self._find_local_sequence(s, is_review)
                if elems is None: continue
                probability[prob] = elems
            review_texts = []
            if not probability: return
            for e in probability[max(probability.keys())][0]:
                review_texts.append(SimpleHTMLCleaner.clean(e.text_content()))
                # set all the elements as "processed" to avoid further processing
                for d in e.iter():
                    d.processed = True
            self._storage[hdrtext] = review_texts