def get_region(self, url, base, tolerance):
    """
    Get the data region of a page.

    Fetches ``url`` through ``self.htmlHandler``, collects the anchor
    elements linking to deliverables and narrows them down to the region
    of the page where the deliverables are stored.

    Parameters:
        url: address of the page to fetch.
        base: forwarded to ``find_anchor_elem`` as its ``base`` keyword.
        tolerance: forwarded unchanged to ``_get_common_parent``.

    Returns:
        * the result of ``derrno.__err__`` when the fetch fails, when no
          deliverable anchor is found (``ENODOC``) or when the anchors have
          no common parent (``ENOREG``);
        * an ``lxml.etree.ElementTree`` built from the only anchor, or from
          the common parent when ``_get_deliverable_region`` reports ``0``;
        * otherwise whatever ``_get_deliverable_region`` returned.
    """
    (gresult, errmsg) = self.htmlHandler.ghap(url)
    if gresult == -1:
        return derrno.__err__(errmsg)

    # initialize charset to encode the page
    self.formatter.set_charset(self.htmlHandler.get_charset())

    # get anchors carrying link to deliverable <a href="./deliverable.pdf">
    deliv_elements = self.htmlHandler.find_anchor_elem(base=base)
    if len(deliv_elements) == 0:
        return derrno.__err__(derrno.ENODOC, url)
    if len(deliv_elements) == 1:
        # a single anchor is its own region
        return lxml.etree.ElementTree(deliv_elements[0])

    # get parent tag of all deliverable anchors
    parent_element = self._get_common_parent(deliv_elements, tolerance)
    if parent_element is None:
        return derrno.__err__(derrno.ENOREG, "Parent element not found.")

    # get the region out of the parent element
    region = self._get_deliverable_region(parent_element)
    if region == 0:
        # the parent tag itself is the region
        return lxml.etree.ElementTree(parent_element)
    return region
def get_region(self, url, base, tolerance):
    """
    Get the data region of a page fetched through ``self.agent``.

    On a successful fetch the parsed page is stored in ``self._page``; the
    anchor elements linking to deliverables are then narrowed down to the
    region of the page that contains them.

    Parameters:
        url: address of the page to fetch.
        base: forwarded to ``find_anchor_elem`` as its ``base`` keyword.
        tolerance: forwarded unchanged to ``_get_common_parent``.

    Returns:
        * the result of ``derrno.__err__`` when the fetch yields nothing,
          when no deliverable anchor is found (``ENODOC``) or when the
          anchors have no common parent (``ENOREG``);
        * an ``lxml.etree.ElementTree`` built from the only anchor, or from
          the common parent when ``_get_deliverable_region`` reports ``0``;
        * otherwise whatever ``_get_deliverable_region`` returned.
    """
    _res = self.agent.ghap(url)
    if len(_res) == 0:
        # The previous code passed an undefined name here and therefore
        # raised NameError instead of reporting the failure.
        # NOTE(review): ENODOC is the closest code used in this method;
        # confirm whether a dedicated "fetch failed" code should be used.
        return derrno.__err__(derrno.ENODOC, url)
    self._page = self.agent.get_etree()

    deliv_elements = self.agent.find_anchor_elem(base=base)
    if len(deliv_elements) == 0:
        return derrno.__err__(derrno.ENODOC, url)
    if len(deliv_elements) == 1:
        # a single anchor is its own region
        return lxml.etree.ElementTree(deliv_elements[0])

    # get parent tag of all deliverable anchors
    parent_element = self._get_common_parent(deliv_elements, tolerance)
    if parent_element is None:
        return derrno.__err__(derrno.ENOREG, "Parent element not found.")

    # get the region out of the parent element
    region = self._get_deliverable_region(parent_element)
    if region == 0:
        # the parent tag itself is the region
        return lxml.etree.ElementTree(parent_element)
    return region
def get_deliverable_page(self):
    """
    Return the link to the page holding the deliverable documents.

    Keeps calling ``_handle_unvis_links`` while it reports more work, then
    picks the link chosen by ``_get_highest_ranks_link``.

    Returns:
        A one-element list ``[final_link]`` on success, otherwise the
        result of ``derrno.__err__(derrno.ELNOTFOUND)``.
    """
    # the main searching loop
    # while we have some unvisited links, search
    while self._handle_unvis_links():
        # security case: stop once the stack grows past 100 entries
        if len(self._link_stack) > 100:
            break
    self.__debug("Stack content: " + str(self._link_stack))

    # a stack holding a single entry is reported as "link not found"
    if len(self._link_stack) == 1:
        return derrno.__err__(derrno.ELNOTFOUND)

    final_link = self._get_highest_ranks_link()
    # element [2] of the stack entry must be non-zero for the link to count
    if not final_link or self._link_stack[final_link][2] == 0:
        return derrno.__err__(derrno.ELNOTFOUND)

    self.__debug("#" * 79)
    self.__debug("DELIVERABLE PAGE: " + final_link)
    # The old expansion through self._get_deliv_link_list() sat after this
    # return, could never run, and has been dropped.
    return [final_link]
def get_region(self, url, base, tolerance):
    """
    Locate the page region in which the deliverables are stored.

    The page at ``url`` is fetched via ``self.htmlHandler``; its charset is
    handed to ``self.formatter`` and the deliverable anchors are reduced to
    a single region.

    Parameters:
        url: address of the page to fetch.
        base: forwarded to ``find_anchor_elem`` as its ``base`` keyword.
        tolerance: forwarded unchanged to ``_get_common_parent``.

    Returns:
        * the result of ``derrno.__err__`` when the fetch fails, when no
          deliverable anchor is found (``ENODOC``) or when the anchors have
          no common parent (``ENOREG``);
        * an ``lxml.etree.ElementTree`` built from the only anchor, or from
          the common parent when ``_get_deliverable_region`` reports ``0``;
        * otherwise whatever ``_get_deliverable_region`` returned.
    """
    (gresult, errmsg) = self.htmlHandler.ghap(url)
    if gresult == -1:
        return derrno.__err__(errmsg)

    # initialize charset to encode the page
    self.formatter.set_charset(self.htmlHandler.get_charset())

    # get anchors carrying link to deliverable <a href="./deliverable.pdf">
    deliv_elements = self.htmlHandler.find_anchor_elem(base=base)
    anchor_count = len(deliv_elements)
    if anchor_count == 0:
        return derrno.__err__(derrno.ENODOC, url)
    if anchor_count == 1:
        # the lone anchor is returned wrapped in its own tree
        return lxml.etree.ElementTree(deliv_elements[0])

    # get parent tag of all deliverable anchors
    parent_element = self._get_common_parent(deliv_elements, tolerance)
    if parent_element is None:
        return derrno.__err__(derrno.ENOREG, "Parent element not found.")

    # get the region out of the parent element
    region = self._get_deliverable_region(parent_element)
    if region == 0:
        # 0 means the parent tag is the region: wrap it in an element tree
        return lxml.etree.ElementTree(parent_element)
    return region
def get_deliverables_XML(self):
    """
    Return information about the stored deliverables serialized as XML.

    Returns:
        The text produced by ``Model2XMLConverter`` for the deliverables,
        or a ``derrno`` error value when there is nothing to convert.
    """
    deliverables = self.get_deliverables()
    # get_deliverables() reports "no records" with a derrno error value
    # rather than an empty container. Errors are tuples (other code in
    # this file detects them with a tuple type check), so pass one through
    # instead of feeding it to the converter.
    if isinstance(deliverables, tuple):
        return deliverables
    if len(deliverables) == 0:
        return derrno.__err__(derrno.ENOREC)

    output = StringIO.StringIO()
    try:
        converter = Model2XMLConverter(stream=output)
        converter.convert(deliverables)
        return output.getvalue()
    finally:
        # release the buffer even when the conversion raises
        output.close()
def get_deliverables_XML(self):
    """
    Serialize the stored deliverables to XML and return the text.

    Returns:
        The XML written by ``Model2XMLConverter``, or a ``derrno`` error
        value when no deliverables are available.
    """
    records = self.get_deliverables()
    if isinstance(records, tuple):
        # get_deliverables() answers with a derrno error value (a tuple)
        # when it has no records; hand that error back unchanged rather
        # than converting it as if it were data.
        return records
    if len(records) == 0:
        return derrno.__err__(derrno.ENOREC)

    output = StringIO.StringIO()
    try:
        Model2XMLConverter(stream=output).convert(records)
        result = output.getvalue()
    finally:
        # always close the in-memory stream, including on conversion errors
        output.close()
    return result
def _manual_process_page(self, links, baseurl): _err = None recordlist = [] self.baseUrl = baseurl for link in links: # find region with tolerance self.parentetree = self.regionHandler.get_region(link, baseurl, 1) if type(self.parentetree) == tuple: # error _err = self.parentetree continue # get the charset. We dont have etree in htmlHandler, # so we have to use the one from regionHandler self.formatter.set_charset( self.regionHandler.formatter.get_charset()) self.__debug("*" * 100 + '\n' + "*" * 40 + " DATA REGION " + "*" * 40) self.__debug( lxml.etree.tostring(self.parentetree, pretty_print=True)) # get root tag try: self.parentetree = self.parentetree.getroot() except: pass # Parent tag is table # call _handle_table if self.parentetree.tag in ('table', 'tbody'): self.__verbose("Handling table") _result = self._handle_table() # if we had a dictionary, continue filling it if len(recordlist) > 0: for key in _result: recordlist[key] = _result[key] else: recordlist = _result # Parent tag is not table # call _handle_sequence else: self.__verbose("Handling sequences") _result = self._handle_sequence() recordlist.extend(_result) # no records found if len(recordlist) == 0: if not _err == None: return _err return derrno.__err__(derrno.ENOREC) self.__debug("DATA RECORDS: ") self.__debug(recordlist) return recordlist # returns list of records
def _manual_process_page(self, links, baseurl): """ Get records from region according document links """ _err = None recordlist = [] self.baseUrl = baseurl for link in links: # find region with tolerance self.parentetree = self.regionHandler.get_region(link, baseurl, 1) if type(self.parentetree) == tuple: # error _err = self.parentetree continue # get the charset. We dont have etree in htmlHandler, # so we have to use the one from regionHandler self.formatter.set_charset(self.regionHandler.formatter.get_charset()) self.__debug("*"*100+'\n'+"*"*40+" DATA REGION "+"*"*40) self.__debug(lxml.etree.tostring(self.parentetree, pretty_print=True)) # get root tag try: self.parentetree = self.parentetree.getroot() except: pass # Parent tag is table # call _handle_table if self.parentetree.tag in ('table','tbody'): self.__verbose("Handling table") _result = self._handle_table() # if we had a dictionary, continue filling it if len(recordlist) > 0: for key in _result: recordlist[key] = _result[key] else: recordlist = _result # Parent tag is not table # call _handle_sequence else: self.__verbose("Handling sequences") _result = self._handle_sequence() recordlist.extend(_result) # no records found if len(recordlist) == 0: if not _err == None: return _err return derrno.__err__(derrno.ENOREC) self.__debug("DATA RECORDS: ") self.__debug(recordlist) return recordlist # returns list of records
def get_deliverable_page(self):
    """
    Find the page that links to the deliverable documents.

    Runs ``_handle_unvis_links`` until it reports nothing more to do (or
    the link stack grows past 10 entries) and then evaluates the link
    returned by ``_get_highest_ranks_link``.

    Returns:
        ``[final_link]`` when a usable link was found, otherwise the result
        of ``derrno.__err__(derrno.ELNOTFOUND)``.
    """
    # the main searching loop
    # while we have some unvisited links, search
    while self._handle_unvis_links():
        # security case
        if len(self._link_stack) > 10:
            break
    self.__debug("Stack content: "+str(self._link_stack))

    # a stack holding a single entry is reported as "link not found"
    if len(self._link_stack) == 1:
        return derrno.__err__(derrno.ELNOTFOUND)

    final_link = self._get_highest_ranks_link()
    # reject a missing link or one whose stack entry has 0 at index 2
    if not final_link or self._link_stack[final_link][2] == 0:
        return derrno.__err__(derrno.ELNOTFOUND)

    self.__debug('#'*79)
    self.__debug("DELIVERABLE PAGE: "+final_link)
    # The disabled follow-up through self._get_deliv_link_list() came after
    # this return and was unreachable, so it has been removed.
    return [final_link]
def get_deliverables(self):
    """
    Return the objects holding information about the deliverables.

    Returns:
        ``self._records`` when it holds at least one item, otherwise the
        result of ``derrno.__err__(derrno.ENOREC)``.
    """
    records = self._records
    if len(records) != 0:
        return records
    # nothing stored: report "no records" through derrno
    return derrno.__err__(derrno.ENOREC)