def get_region(self, url, base, tolerance):
    """Locate the data region that holds the deliverable links of a page.

    The page is loaded through ``self.htmlHandler`` and the anchors
    selected by ``find_anchor_elem(base=base)`` are collected.

    Args:
        url: address of the page, handed to ``htmlHandler.ghap``.
        base: forwarded as the ``base`` keyword to ``find_anchor_elem``.
        tolerance: forwarded to ``self._get_common_parent``.

    Returns:
        The value of ``derrno.__err__(...)`` when the page cannot be
        loaded, no anchor is found or no common parent exists.
        An ``lxml.etree.ElementTree`` built from the only anchor, or from
        the common parent when ``_get_deliverable_region`` returns 0.
        Otherwise the value returned by ``_get_deliverable_region``.
    """
    (gresult, errmsg) = self.htmlHandler.ghap(url)
    if gresult == -1:
        return derrno.__err__(errmsg)

    # initialize charset to encode the page
    self.formatter.set_charset(self.htmlHandler.get_charset())
    # anchors carrying a link to a deliverable <a href="./deliverable.pdf">
    deliv_elements = self.htmlHandler.find_anchor_elem(base=base)
    if len(deliv_elements) == 0:
        return derrno.__err__(derrno.ENODOC, url)
    if len(deliv_elements) == 1:
        return lxml.etree.ElementTree(deliv_elements[0])

    # parent tag shared by all deliverable anchors; identity check so that
    # only a real None counts as "not found"
    parent_element = self._get_common_parent(deliv_elements, tolerance)
    if parent_element is None:
        return derrno.__err__(derrno.ENOREG, "Parent element not found.")

    # narrow the parent element down to the region
    region = self._get_deliverable_region(parent_element)
    if region == 0:
        # the parent tag itself is the region
        return lxml.etree.ElementTree(parent_element)

    # bare print kept from the debugging code: it writes an empty line
    # when run as a Python 2 print statement
    print
    return region
Esempio n. 2
0
    def get_region(self, url, base, tolerance):
        """Locate the data region that holds the deliverable links of a page.

        The page is loaded through ``self.agent``; on success its element
        tree is stored in ``self._page``.

        Args:
            url: address of the page, handed to ``agent.ghap``.
            base: forwarded as the ``base`` keyword to ``find_anchor_elem``.
            tolerance: forwarded to ``self._get_common_parent``.

        Returns:
            The value of ``derrno.__err__(...)`` when the page cannot be
            loaded, no anchor is found or no common parent exists.
            An ``lxml.etree.ElementTree`` built from the only anchor, or
            from the common parent when ``_get_deliverable_region``
            returns 0. Otherwise the value returned by
            ``_get_deliverable_region``.
        """
        _res = self.agent.ghap(url)
        if len(_res) == 0:
            # There is no error message available in this scope (the name
            # ``errmsg`` is not defined here), so the failed fetch is
            # reported against the requested url instead of raising
            # NameError.
            # NOTE(review): ENODOC is the closest error code visible in this
            # method - confirm it is the intended one for a failed fetch.
            return derrno.__err__(derrno.ENODOC, url)
        self._page = self.agent.get_etree()

        deliv_elements = self.agent.find_anchor_elem(base=base)
        if len(deliv_elements) == 0:
            return derrno.__err__(derrno.ENODOC, url)
        if len(deliv_elements) == 1:
            return lxml.etree.ElementTree(deliv_elements[0])

        # parent tag shared by all deliverable anchors
        parent_element = self._get_common_parent(deliv_elements, tolerance)
        if parent_element is None:
            return derrno.__err__(derrno.ENOREG, "Parent element not found.")

        # narrow the parent element down to the region
        region = self._get_deliverable_region(parent_element)
        if region == 0:
            # the parent tag itself is the region
            return lxml.etree.ElementTree(parent_element)

        # bare print kept from the debugging code: it writes an empty line
        # when run as a Python 2 print statement
        print
        return region
    def get_region(self, url, base, tolerance):
        """Find the region of a page in which the deliverable links live.

        ``self.agent`` fetches the page; when the fetch yields a non-empty
        result the page's element tree is kept in ``self._page``.

        Args:
            url: page address passed to ``agent.ghap``.
            base: passed on as ``base=`` to ``agent.find_anchor_elem``.
            tolerance: passed on to ``self._get_common_parent``.

        Returns:
            A ``derrno.__err__(...)`` value if the fetch result is empty,
            no anchors are found, or the anchors have no common parent.
            An ``lxml.etree.ElementTree`` around the single anchor, or
            around the common parent when ``_get_deliverable_region``
            yields 0. In every other case the result of
            ``_get_deliverable_region`` itself.
        """
        _res = self.agent.ghap(url)
        if len(_res) == 0:
            # ``errmsg`` does not exist in this method, so referencing it
            # here would raise NameError on every failed fetch; report the
            # failure for the requested url instead.
            # NOTE(review): ENODOC is reused because it is the nearest code
            # visible here - confirm a more specific one is not expected.
            return derrno.__err__(derrno.ENODOC, url)
        self._page = self.agent.get_etree()

        deliv_elements = self.agent.find_anchor_elem(base=base)
        if len(deliv_elements) == 0:
            return derrno.__err__(derrno.ENODOC, url)
        if len(deliv_elements) == 1:
            return lxml.etree.ElementTree(deliv_elements[0])

        # get parent tag of all deliverable anchors
        parent_element = self._get_common_parent(deliv_elements, tolerance)
        if parent_element is None:
            return derrno.__err__(derrno.ENOREG, "Parent element not found.")

        # get the region out of the parent element
        region = self._get_deliverable_region(parent_element)
        if region == 0:
            # parent tag is the region: wrap it in an element tree
            return lxml.etree.ElementTree(parent_element)

        # leftover debug statement, kept so output stays the same: as a
        # Python 2 print statement it emits an empty line
        print
        return region
Esempio n. 4
0
    def get_deliverable_page(self):
        """Return the link of the page that lists the deliverable documents.

        Unvisited links are processed with ``_handle_unvis_links`` until it
        returns a false value or the link stack holds more than 100 entries.

        Returns:
            A one-element list with the link chosen by
            ``_get_highest_ranks_link``. The value of
            ``derrno.__err__(derrno.ELNOTFOUND)`` is returned instead when
            the stack holds a single entry, no link is chosen, or the third
            field of the chosen link's stack entry is 0.
        """
        # the main searching loop
        # while we have some unvisited links, search
        while self._handle_unvis_links():
            # safety cap so the search cannot grow without bound
            if len(self._link_stack) > 100:
                break
            self.__debug("Stack content: " + str(self._link_stack))
        if len(self._link_stack) == 1:
            return derrno.__err__(derrno.ELNOTFOUND)

        final_link = self._get_highest_ranks_link()
        if not final_link or self._link_stack[final_link][2] == 0:
            return derrno.__err__(derrno.ELNOTFOUND)
        self.__debug("#" * 79)
        self.__debug("DELIVERABLE PAGE: " + final_link)
        return [final_link]
    def get_region(self, url, base, tolerance):
        """Find the region of a page in which the deliverable links live.

        ``self.htmlHandler`` fetches the page and supplies both its charset
        (pushed into ``self.formatter``) and the candidate anchors.

        Args:
            url: page address passed to ``htmlHandler.ghap``.
            base: passed on as ``base=`` to ``htmlHandler.find_anchor_elem``.
            tolerance: passed on to ``self._get_common_parent``.

        Returns:
            A ``derrno.__err__(...)`` value if ``ghap`` reports -1, no
            anchors are found, or the anchors have no common parent.
            An ``lxml.etree.ElementTree`` around the single anchor, or
            around the common parent when ``_get_deliverable_region``
            yields 0. In every other case the result of
            ``_get_deliverable_region`` itself.
        """
        (gresult, errmsg) = self.htmlHandler.ghap(url)
        if gresult == -1:
            return derrno.__err__(errmsg)

        # initialize charset to encode the page
        self.formatter.set_charset(self.htmlHandler.get_charset())
        # get anchors carrying link to deliverable <a href="./deliverable.pdf">
        deliv_elements = self.htmlHandler.find_anchor_elem(base=base)
        if len(deliv_elements) == 0:
            return derrno.__err__(derrno.ENODOC, url)
        if len(deliv_elements) == 1:
            return lxml.etree.ElementTree(deliv_elements[0])

        # get parent tag of all deliverable anchors; compare by identity so
        # only an actual None is treated as "no parent"
        parent_element = self._get_common_parent(deliv_elements, tolerance)
        if parent_element is None:
            return derrno.__err__(derrno.ENOREG, "Parent element not found.")

        # get the region out of the parent element
        region = self._get_deliverable_region(parent_element)
        if region == 0:
            # parent tag is the region: wrap it in an element tree
            return lxml.etree.ElementTree(parent_element)

        # leftover debug statement, kept so output stays the same: as a
        # Python 2 print statement it emits an empty line
        print
        return region
Esempio n. 6
0
 def get_deliverables_XML(self):
     """Return the stored deliverables serialized as XML.

     Returns:
         The text written by ``Model2XMLConverter`` for the records from
         ``self.get_deliverables()``. When there is nothing to convert, a
         derrno error value is returned instead.
     """
     # fetch once; the result is reused for both the check and the conversion
     records = self.get_deliverables()
     # get_deliverables() hands back a derrno error instead of an empty
     # container, so the length check alone would pass the error on to the
     # converter.
     # NOTE(review): assumes derrno errors are tuples, which is how
     # _manual_process_page recognises them - confirm.
     if isinstance(records, tuple):
         return records
     if len(records) == 0:
         return derrno.__err__(derrno.ENOREC)

     output = StringIO.StringIO()
     try:
         converter = Model2XMLConverter(stream=output)
         converter.convert(records)
         return output.getvalue()
     finally:
         # release the buffer even if the conversion fails
         output.close()
 def get_deliverables_XML(self):
     """Serialize the stored deliverables to XML and return the text.

     Returns:
         The output ``Model2XMLConverter`` wrote for the result of
         ``self.get_deliverables()``, or a derrno error value when no
         records are available.
     """
     # a single lookup serves both the emptiness check and the conversion
     records = self.get_deliverables()
     # An error from get_deliverables() must not reach the converter: that
     # method reports "no records" as a derrno error, not as an empty
     # container.
     # NOTE(review): relies on derrno errors being tuples, matching the
     # tuple test in _manual_process_page - confirm.
     if isinstance(records, tuple):
         return records
     if len(records) == 0:
         return derrno.__err__(derrno.ENOREC)

     output = StringIO.StringIO()
     try:
         Model2XMLConverter(stream=output).convert(records)
         result = output.getvalue()
     finally:
         # always close the in-memory stream, also on conversion errors
         output.close()
     return result
    def _manual_process_page(self, links, baseurl):
        """Collect data records from the regions found on the given pages.

        For each link the region is looked up with
        ``self.regionHandler.get_region(link, baseurl, 1)``. A region whose
        root tag is ``table`` or ``tbody`` is processed by
        ``_handle_table``, any other region by ``_handle_sequence``.

        Args:
            links: iterable of page links to process.
            baseurl: stored in ``self.baseUrl`` and passed to ``get_region``.

        Returns:
            The gathered records. When nothing was gathered, the last error
            returned by ``get_region`` or, if there was none,
            ``derrno.__err__(derrno.ENOREC)``.
        """
        _err = None
        recordlist = []
        self.baseUrl = baseurl

        for link in links:
            # find region with tolerance
            self.parentetree = self.regionHandler.get_region(link, baseurl, 1)
            if isinstance(self.parentetree, tuple):
                # error result: remember it and go on with the next link
                _err = self.parentetree
                continue

            # get the charset. We dont have etree in htmlHandler,
            # so we have to use the one from regionHandler
            self.formatter.set_charset(
                self.regionHandler.formatter.get_charset())

            self.__debug("*" * 100 + '\n' + "*" * 40 + " DATA REGION " +
                         "*" * 40)
            self.__debug(
                lxml.etree.tostring(self.parentetree, pretty_print=True))
            # get root tag; an object without getroot() is used as it is
            try:
                self.parentetree = self.parentetree.getroot()
            except AttributeError:
                pass

            if self.parentetree.tag in ('table', 'tbody'):
                # Parent tag is table
                self.__verbose("Handling table")

                _result = self._handle_table()
                # NOTE(review): recordlist starts as a list but is filled by
                # key here and extended in the other branch; presumably
                # _handle_table returns a dict - verify what happens when
                # table and non-table pages are mixed.
                if len(recordlist) > 0:
                    for key in _result:
                        recordlist[key] = _result[key]
                else:
                    recordlist = _result
            else:
                # Parent tag is not table
                self.__verbose("Handling sequences")

                _result = self._handle_sequence()
                recordlist.extend(_result)

        # no records found
        if len(recordlist) == 0:
            if _err is not None:
                return _err
            return derrno.__err__(derrno.ENOREC)
        self.__debug("DATA RECORDS: ")
        self.__debug(recordlist)
        return recordlist  # returns list of records
 def _manual_process_page(self, links, baseurl):
     """Get records from the regions located through the document links.

     Every link is resolved to a region with
     ``self.regionHandler.get_region(link, baseurl, 1)``. Regions rooted at
     a ``table`` or ``tbody`` tag are handled by ``_handle_table``, all
     others by ``_handle_sequence``.

     Args:
         links: iterable of page links to process.
         baseurl: saved as ``self.baseUrl`` and handed to ``get_region``.

     Returns:
         The collected records. If none were collected, the last error
         value from ``get_region`` or, failing that,
         ``derrno.__err__(derrno.ENOREC)``.
     """
     _err = None
     recordlist = []
     self.baseUrl = baseurl

     for link in links:
         # find region with tolerance
         self.parentetree = self.regionHandler.get_region(link, baseurl, 1)
         if isinstance(self.parentetree, tuple):
             # error result: keep it and continue with the next link
             _err = self.parentetree
             continue

         # get the charset. We dont have etree in htmlHandler,
         # so we have to use the one from regionHandler
         self.formatter.set_charset(self.regionHandler.formatter.get_charset())

         self.__debug("*"*100+'\n'+"*"*40+" DATA REGION "+"*"*40)
         self.__debug(lxml.etree.tostring(self.parentetree, pretty_print=True))
         # get root tag; whatever lacks getroot() is used unchanged
         try:
             self.parentetree = self.parentetree.getroot()
         except AttributeError:
             pass

         if self.parentetree.tag in ('table', 'tbody'):
             # Parent tag is table
             self.__verbose("Handling table")

             _result = self._handle_table()
             # NOTE(review): recordlist is created as a list, yet it is
             # indexed by key here and extended in the other branch;
             # presumably _handle_table returns a dict - check the case of
             # table and non-table pages in one run.
             if len(recordlist) > 0:
                 for key in _result:
                     recordlist[key] = _result[key]
             else:
                 recordlist = _result
         else:
             # Parent tag is not table
             self.__verbose("Handling sequences")

             _result = self._handle_sequence()
             recordlist.extend(_result)

     # no records found
     if len(recordlist) == 0:
         if _err is not None:
             return _err
         return derrno.__err__(derrno.ENOREC)
     self.__debug("DATA RECORDS: ")
     self.__debug(recordlist)
     return recordlist # returns list of records
    def get_deliverable_page(self):
        """Return the link of the page that lists the deliverable documents.

        Unvisited links are processed with ``_handle_unvis_links`` until it
        returns a false value or the link stack holds more than 10 entries.

        Returns:
            A one-element list with the link chosen by
            ``_get_highest_ranks_link``. The value of
            ``derrno.__err__(derrno.ELNOTFOUND)`` is returned instead when
            the stack holds a single entry, no link is chosen, or the third
            field of the chosen link's stack entry is 0.
        """
        # the main searching loop
        # while we have some unvisited links, search
        while self._handle_unvis_links():
            # safety cap so the search cannot grow without bound
            if len(self._link_stack) > 10:
                break
            self.__debug("Stack content: "+str(self._link_stack))
        if len(self._link_stack) == 1:
            return derrno.__err__(derrno.ELNOTFOUND)

        final_link = self._get_highest_ranks_link()
        if not final_link or self._link_stack[final_link][2] == 0:
            return derrno.__err__(derrno.ELNOTFOUND)
        self.__debug('#'*79)
        self.__debug("DELIVERABLE PAGE: "+final_link)
        return [final_link]
    def get_deliverable_page(self):
        """Search the link stack for the page holding the deliverables.

        ``_handle_unvis_links`` is called repeatedly; the loop ends when it
        returns a false value or once the link stack exceeds 10 entries.

        Returns:
            ``[link]`` where ``link`` comes from ``_get_highest_ranks_link``.
            ``derrno.__err__(derrno.ELNOTFOUND)`` is returned when the stack
            has exactly one entry, when no link is returned, or when index 2
            of that link's stack entry equals 0.
        """
        # the main searching loop: runs while unvisited links remain
        while self._handle_unvis_links():
            # security case: stop once the stack gets too large
            if len(self._link_stack) > 10:
                break
            self.__debug("Stack content: "+str(self._link_stack))
        if len(self._link_stack) == 1:
            return derrno.__err__(derrno.ELNOTFOUND)

        final_link = self._get_highest_ranks_link()
        if not final_link or self._link_stack[final_link][2] == 0:
            return derrno.__err__(derrno.ELNOTFOUND)
        self.__debug('#'*79)
        self.__debug("DELIVERABLE PAGE: "+final_link)
        return [final_link]
Esempio n. 12
0
 def get_deliverables(self):
     """Return the stored records, or an ENOREC error when there are none."""
     records = self._records
     if len(records) != 0:
         return records
     return derrno.__err__(derrno.ENOREC)
 def get_deliverables(self):
     """Hand back ``self._records``; report ENOREC if it holds nothing."""
     has_records = len(self._records) > 0
     return self._records if has_records else derrno.__err__(derrno.ENOREC)