Esempi in Python per GetHTMLAndParse.get_charset

Linguaggio di programmazione: Python

Spazio dei nomi/nome del pacchetto: gethtmlandparse

Classe/tipologia: GetHTMLAndParse

Metodo/funzione: get_charset

Esempi su hotexamples.com: 2

GetHTMLAndParse.get_charset in Python: 2 esempi trovati. Questi sono i migliori esempi reali in Python per gethtmlandparse.GetHTMLAndParse.get_charset, estratti da progetti open source. Puoi valutare gli esempi per aiutarci a migliorarne la qualità.

Metodi utilizzati di frequente

Mostra / Nascondi

GetHTMLAndParse(5)

find_anchor_elem(3)

get_etree(2)

ghap(2)

is_wanted_mime(2)

check_file(1)

compare_domains(1)

count_all_headers(1)

get_all_links(1)

get_anchor_from_link(1)

get_charset(1)

get_domain_name(1)

get_pager_links(1)

is_page(1)

look_for_frame(1)

Esempio n. 1

Mostra file

File: getdeliverablerecords.py — Progetto: KNOT-GIT/mDeliverables

class GetDeliverableRegion:
    """Find the part of an HTML page that holds the deliverable links.

    The page is fetched and parsed through ``GetHTMLAndParse``; texts are
    normalised through ``TextFormatUtils``.  Both helpers are defined
    outside this class.
    """

    def __init__(self):
        # agent used to download and parse the HTML page
        self.htmlHandler = GetHTMLAndParse()

        # helper used to format extracted texts
        self.formatter = TextFormatUtils()

    def get_region(self, url, base, tolerance):
        """Get the data region of the page at *url*.

        url: address handed to ``htmlHandler.ghap``.
        base: forwarded unchanged to ``htmlHandler.find_anchor_elem``.
        tolerance: forwarded to ``_get_common_parent``.

        Returns one of:
          * the result of ``derrno.__err__`` when the page cannot be
            loaded, no deliverable anchor is found or no common parent
            exists,
          * an ``lxml.etree.ElementTree`` around the only anchor found,
          * an ``lxml.etree.ElementTree`` around the common parent when the
            parent itself is the region,
          * otherwise whatever ``_get_deliverable_region`` produced.
        """
        (gresult, errmsg) = self.htmlHandler.ghap(url)
        if gresult == -1:
            return derrno.__err__(errmsg)

        # initialize charset to encode the page
        self.formatter.set_charset(self.htmlHandler.get_charset())
        # get anchors carrying link to deliverable <a href="./deliverable.pdf">
        deliv_elements = self.htmlHandler.find_anchor_elem(base=base)
        if len(deliv_elements) == 0:
            return derrno.__err__(derrno.ENODOC, url)
        if len(deliv_elements) == 1:
            return lxml.etree.ElementTree(deliv_elements[0])

        # get parent tag of all deliverable anchors
        parent_element = self._get_common_parent(deliv_elements, tolerance)
        if parent_element is None:
            return derrno.__err__(derrno.ENOREG, "Parent element not found.")

        # get the region out of the parent element
        region = self._get_deliverable_region(parent_element)
        if region == 0:
            # the parent tag itself is the region
            return lxml.etree.ElementTree(parent_element)
        return region

    def _get_common_parent(self, elem_list, tolerance):
        """Search the common parent of all elements in *elem_list*.

        For every element the chain of its ancestors is built; the chains
        are then compared level by level starting at the top-most ancestor.
        The deepest level that is still "common" wins and its most frequent
        element is returned.

        tolerance: number of extra distinct ancestors accepted on one
        level, which makes the region smaller if there are
        >>not deliverable<< pdfs in more regions on the page.

        Returns the parent element, or None when no common level exists
        (empty *elem_list*, an element without a parent, or elements whose
        top-most ancestors already differ).  The last common level is also
        kept in ``self.last_seq``.
        """

        def _iscommon(elem_seq, tol):
            """Tell whether one ancestor level counts as common."""
            distinct = []
            for elem in elem_seq:
                if elem not in distinct:
                    distinct.append(elem)
            if len(distinct) > tol + 1:
                return False
            # if only two anchors were found then we have only two tags
            # and it is pretty hard to use tolerance, so we omit it.
            if len(elem_seq) < 3 and len(distinct) > 1:
                return False
            return True

        def _most_frequent(seq):
            """Return the most frequent item of *seq* (first one on a tie),
            or None for an empty sequence."""
            if not seq:
                return None
            uniques = []
            counts = []
            for item in seq:
                if item in uniques:
                    counts[uniques.index(item)] += 1
                else:
                    uniques.append(item)
                    counts.append(1)
            return uniques[counts.index(max(counts))]

        self.last_seq = []

        # one ancestor chain per element, nearest parent first
        vectors = []
        for elem in elem_list:
            ancestors = []
            parent = elem.getparent()
            while parent is not None:
                ancestors.append(parent)
                parent = parent.getparent()
            vectors.append(ancestors)
        if not vectors:
            return None

        # Regroup the chains into levels, sorted from the highest to the
        # lowest ancestor; only as deep as the shortest chain allows.
        depth = min(len(chain) for chain in vectors)
        zipped = [[chain[-i] for chain in vectors]
                  for i in range(1, depth + 1)]

        # The last level before the first difference holds the main parent.
        for zipvect in zipped:
            if not _iscommon(zipvect, tolerance):
                break
            self.last_seq = zipvect
        return _most_frequent(self.last_seq)

    def _get_element_texts(self, elem, string=True):
        """Get texts from *elem* and its descendants.

        Only texts containing at least one letter or digit are kept; each
        one is passed through ``self.formatter.format``.

        elem: lxml element
        string: if True the texts are returned as one string joined by
        spaces, otherwise as a list.
        """
        texts = []
        for child in elem.iter():
            # NOTE(review): `basestring` exists only on Python 2; the check
            # presumably skips nodes whose tag is not a name - confirm.
            if child.text and isinstance(child.tag, basestring):
                if re.search("[a-z0-9]", child.text, re.I):
                    texts.append(self.formatter.format(child.text))
        if string:
            return " ".join(texts)
        return texts

    def _get_deliverable_region(self, parent_tag):
        """Get the deliverable region inside *parent_tag*.

        The region starts at a title containing "deliverables" and ends at
        the next title of the same kind.  Titles are h1-h6 elements when
        *parent_tag* contains any, otherwise elements sharing the class, id
        or style attribute of the first element whose text mentions
        "deliverables".  Every direct child of *parent_tag* touched while
        inside the region is serialized exactly once.

        Returns 0 when *parent_tag* itself has to be taken as the region
        (no title found, nothing collected, or the collected HTML cannot be
        parsed); otherwise the result of ``lxml.etree.fromstring`` on the
        collected HTML.  The collected HTML is also stored in
        ``self._result_html_region``.
        """
        def _convert_tag_to_html(tag):
            return lxml.etree.tostring(lxml.etree.ElementTree(tag))

        def _collect(tag):
            """Serialize the direct child of parent_tag that is *tag* or
            its parent, and drop it from `children` so that it cannot be
            emitted a second time."""
            if tag in children:
                owner = tag
            else:
                owner = tag.getparent()
                if owner not in children:
                    return
            parts.append(_convert_tag_to_html(owner))
            children.remove(owner)

        # list[0] = type, list[1] = attribute, list[2] = lxml tag element
        # in case of headers list[0] = element.tag, then [2] is element
        _reg_atr = ['', None, None]
        self._result_html_region = ''
        parts = []
        reg_flag = False  # flag indicating that we are looping over region
        # get headers first
        headers = []
        for i in range(1, 7):
            headers.extend(parent_tag.findall('.//h' + str(i)))
        children = parent_tag.getchildren()
        if len(headers) > 0:
            for head in headers:
                text = self._get_element_texts(head)
                if text and re.search("deliverables", text, re.I):
                    _reg_atr[0] = head.tag
                    _reg_atr[2] = head
                    break
            if _reg_atr[2] is None:
                return 0
            # visit all tags in parent_tag
            for tag in parent_tag.iterdescendants():
                if tag.tag == 'img':
                    continue
                text = self._get_element_texts(tag)
                if tag.tag == 'a' and not tag.text:
                    # anchor without own text: use the text following its
                    # image, or a blank so that the anchor is not skipped
                    img = tag.find('img')
                    if img is not None:
                        text = img.tail
                    else:
                        text = ' '
                if text and tag.tag == _reg_atr[0]:
                    if re.search("deliverables", text, re.I):
                        # "deliverable" title, BEGIN of region
                        reg_flag = True
                    elif reg_flag:
                        # next similar title, END of region
                        break
                # region content
                if reg_flag:
                    _collect(tag)
        # if we dont have headers, try to find other kind of header (title)
        # "Deliverables" and compare with other elements with the same
        # class, id or style.
        else:
            for tag in parent_tag.iter():
                if tag.text and re.search("deliverables", tag.text, re.I):
                    for attr_name in ("class", "id", "style"):
                        attr_value = tag.get(attr_name)
                        if attr_value:
                            _reg_atr = [attr_name, attr_value, tag]
                            break
                    if _reg_atr[2] is not None:
                        break

            # test _reg_atr. If there is no deliverable region, then all
            # documents make the region
            if _reg_atr[2] is None:
                return 0
            reg_flag = False
            # visit all tags in parent_tag
            for tag in parent_tag.iterdescendants():
                if tag.tag == 'a' and not tag.text:
                    # NOTE: this rewrites the anchor text inside the parsed
                    # document, so the serialized region contains it too.
                    img = tag.find('img')
                    if img is not None:
                        tag.text = img.tail
                    else:
                        tag.text = ' '
                if tag.text and tag.get(_reg_atr[0]) == _reg_atr[1]:
                    if re.search("deliverables", tag.text, re.I):
                        # "deliverable" title, BEGIN of region
                        reg_flag = True
                    elif reg_flag:
                        # next similar title, END of region
                        break
                # region content
                if reg_flag:
                    _collect(tag)

        self._result_html_region = ''.join(parts)
        if not self._result_html_region:
            return 0
        # create element from region; fall back to the lenient HTML parser
        # when the collected fragments are not one well-formed XML document
        try:
            return lxml.etree.fromstring(self._result_html_region)
        except Exception:
            try:
                parser = lxml.etree.HTMLParser()
                return lxml.etree.fromstring(self._result_html_region, parser)
            except lxml.etree.XMLSyntaxError:
                return 0

Esempio n. 2

Mostra file

File: getdeliverablerecords.py — Progetto: xkolac11/Deliverables

class GetDeliverableRegion:
    """Find the part of an HTML page that holds the deliverable links.

    The page is fetched and parsed through ``GetHTMLAndParse``; texts are
    normalised through ``TextFormatUtils``.  Both helpers are defined
    outside this class.
    """

    def __init__(self):
        # agent used to download and parse the HTML page
        self.htmlHandler = GetHTMLAndParse()

        # helper used to format extracted texts
        self.formatter = TextFormatUtils()

    def get_region(self, url, base, tolerance):
        """Get the data region of the page at *url*.

        url: address handed to ``htmlHandler.ghap``.
        base: forwarded unchanged to ``htmlHandler.find_anchor_elem``.
        tolerance: forwarded to ``_get_common_parent``.

        Returns one of:
          * the result of ``derrno.__err__`` when the page cannot be
            loaded, no deliverable anchor is found or no common parent
            exists,
          * an ``lxml.etree.ElementTree`` around the only anchor found,
          * an ``lxml.etree.ElementTree`` around the common parent when the
            parent itself is the region,
          * otherwise whatever ``_get_deliverable_region`` produced.
        """
        (gresult, errmsg) = self.htmlHandler.ghap(url)
        if gresult == -1:
            return derrno.__err__(errmsg)

        # initialize charset to encode the page
        self.formatter.set_charset(self.htmlHandler.get_charset())
        # get anchors carrying link to deliverable <a href="./deliverable.pdf">
        deliv_elements = self.htmlHandler.find_anchor_elem(base=base)
        if len(deliv_elements) == 0:
            return derrno.__err__(derrno.ENODOC, url)
        if len(deliv_elements) == 1:
            return lxml.etree.ElementTree(deliv_elements[0])

        # get parent tag of all deliverable anchors
        parent_element = self._get_common_parent(deliv_elements, tolerance)
        if parent_element is None:
            return derrno.__err__(derrno.ENOREG, "Parent element not found.")

        # get the region out of the parent element
        region = self._get_deliverable_region(parent_element)
        if region == 0:
            # the parent tag itself is the region
            return lxml.etree.ElementTree(parent_element)
        return region

    def _get_common_parent(self, elem_list, tolerance):
        """Search the common parent of all elements in *elem_list*.

        For every element the chain of its ancestors is built; the chains
        are then compared level by level starting at the top-most ancestor.
        The deepest level that is still "common" wins and its most frequent
        element is returned.

        tolerance: number of extra distinct ancestors accepted on one
        level, which makes the region smaller if there are
        >>not deliverable<< pdfs in more regions on the page.

        Returns the parent element, or None when no common level exists
        (empty *elem_list*, an element without a parent, or elements whose
        top-most ancestors already differ).  The last common level is also
        kept in ``self.last_seq``.
        """

        def _iscommon(elem_seq, tol):
            """Tell whether one ancestor level counts as common."""
            distinct = []
            for elem in elem_seq:
                if elem not in distinct:
                    distinct.append(elem)
            if len(distinct) > tol + 1:
                return False
            # if only two anchors were found then we have only two tags
            # and it is pretty hard to use tolerance, so we omit it.
            if len(elem_seq) < 3 and len(distinct) > 1:
                return False
            return True

        def _most_frequent(seq):
            """Return the most frequent item of *seq* (first one on a tie),
            or None for an empty sequence."""
            if not seq:
                return None
            uniques = []
            counts = []
            for item in seq:
                if item in uniques:
                    counts[uniques.index(item)] += 1
                else:
                    uniques.append(item)
                    counts.append(1)
            return uniques[counts.index(max(counts))]

        self.last_seq = []

        # one ancestor chain per element, nearest parent first
        vectors = []
        for elem in elem_list:
            ancestors = []
            parent = elem.getparent()
            while parent is not None:
                ancestors.append(parent)
                parent = parent.getparent()
            vectors.append(ancestors)
        if not vectors:
            return None

        # Regroup the chains into levels, sorted from the highest to the
        # lowest ancestor; only as deep as the shortest chain allows.
        depth = min(len(chain) for chain in vectors)
        zipped = [[chain[-i] for chain in vectors]
                  for i in range(1, depth + 1)]

        # The last level before the first difference holds the main parent.
        for zipvect in zipped:
            if not _iscommon(zipvect, tolerance):
                break
            self.last_seq = zipvect
        return _most_frequent(self.last_seq)

    def _get_element_texts(self, elem, string=True):
        """Get texts from *elem* and its descendants.

        Only texts containing at least one letter or digit are kept; each
        one is passed through ``self.formatter.format``.

        elem: lxml element
        string: if True the texts are returned as one string joined by
        spaces, otherwise as a list.
        """
        texts = []
        for child in elem.iter():
            # NOTE(review): `basestring` exists only on Python 2; the check
            # presumably skips nodes whose tag is not a name - confirm.
            if child.text and isinstance(child.tag, basestring):
                if re.search("[a-z0-9]", child.text, re.I):
                    texts.append(self.formatter.format(child.text))
        if string:
            return " ".join(texts)
        return texts

    def _get_deliverable_region(self, parent_tag):
        """Get the deliverable region inside *parent_tag*.

        The region starts at a title containing "deliverables" and ends at
        the next title of the same kind.  Titles are h1-h6 elements when
        *parent_tag* contains any, otherwise elements sharing the class, id
        or style attribute of the first element whose text mentions
        "deliverables".  Every direct child of *parent_tag* touched while
        inside the region is serialized exactly once.

        Returns 0 when *parent_tag* itself has to be taken as the region
        (no title found, nothing collected, or the collected HTML cannot be
        parsed); otherwise the result of ``lxml.etree.fromstring`` on the
        collected HTML.  The collected HTML is also stored in
        ``self._result_html_region``.
        """
        def _convert_tag_to_html(tag):
            return lxml.etree.tostring(lxml.etree.ElementTree(tag))

        def _collect(tag):
            """Serialize the direct child of parent_tag that is *tag* or
            its parent, and drop it from `children` so that it cannot be
            emitted a second time."""
            if tag in children:
                owner = tag
            else:
                owner = tag.getparent()
                if owner not in children:
                    return
            parts.append(_convert_tag_to_html(owner))
            children.remove(owner)

        # list[0] = type, list[1] = attribute, list[2] = lxml tag element
        # in case of headers list[0] = element.tag, then [2] is element
        _reg_atr = ['', None, None]
        self._result_html_region = ''
        parts = []
        reg_flag = False  # flag indicating that we are looping over region
        # get headers first
        headers = []
        for i in range(1, 7):
            headers.extend(parent_tag.findall('.//h' + str(i)))
        children = parent_tag.getchildren()
        if len(headers) > 0:
            for head in headers:
                text = self._get_element_texts(head)
                if text and re.search("deliverables", text, re.I):
                    _reg_atr[0] = head.tag
                    _reg_atr[2] = head
                    break
            if _reg_atr[2] is None:
                return 0
            # visit all tags in parent_tag
            for tag in parent_tag.iterdescendants():
                if tag.tag == 'img':
                    continue
                text = self._get_element_texts(tag)
                if tag.tag == 'a' and not tag.text:
                    # anchor without own text: use the text following its
                    # image, or a blank so that the anchor is not skipped
                    img = tag.find('img')
                    if img is not None:
                        text = img.tail
                    else:
                        text = ' '
                if text and tag.tag == _reg_atr[0]:
                    if re.search("deliverables", text, re.I):
                        # "deliverable" title, BEGIN of region
                        reg_flag = True
                    elif reg_flag:
                        # next similar title, END of region
                        break
                # region content
                if reg_flag:
                    _collect(tag)
        # if we dont have headers, try to find other kind of header (title)
        # "Deliverables" and compare with other elements with the same
        # class, id or style.
        else:
            for tag in parent_tag.iter():
                if tag.text and re.search("deliverables", tag.text, re.I):
                    for attr_name in ("class", "id", "style"):
                        attr_value = tag.get(attr_name)
                        if attr_value:
                            _reg_atr = [attr_name, attr_value, tag]
                            break
                    if _reg_atr[2] is not None:
                        break

            # test _reg_atr. If there is no deliverable region, then all
            # documents make the region
            if _reg_atr[2] is None:
                return 0
            reg_flag = False
            # visit all tags in parent_tag
            for tag in parent_tag.iterdescendants():
                if tag.tag == 'a' and not tag.text:
                    # NOTE: this rewrites the anchor text inside the parsed
                    # document, so the serialized region contains it too.
                    img = tag.find('img')
                    if img is not None:
                        tag.text = img.tail
                    else:
                        tag.text = ' '
                if tag.text and tag.get(_reg_atr[0]) == _reg_atr[1]:
                    if re.search("deliverables", tag.text, re.I):
                        # "deliverable" title, BEGIN of region
                        reg_flag = True
                    elif reg_flag:
                        # next similar title, END of region
                        break
                # region content
                if reg_flag:
                    _collect(tag)

        self._result_html_region = ''.join(parts)
        if not self._result_html_region:
            return 0
        # create element from region; fall back to the lenient HTML parser
        # when the collected fragments are not one well-formed XML document
        try:
            return lxml.etree.fromstring(self._result_html_region)
        except Exception:
            try:
                parser = lxml.etree.HTMLParser()
                return lxml.etree.fromstring(self._result_html_region, parser)
            except lxml.etree.XMLSyntaxError:
                return 0