# NOTE(review): this class is defined TWICE in this file -- an identical copy
# follows immediately below.  Because the later definition wins in Python,
# THIS first copy is shadowed dead code; one of the two should be deleted.
class GetDeliverableRegion:

    def __init__(self):
        # init agent for parsing html
        self.agent = GetHTMLAndParse()
        # format text
        self.formatter = TextFormatUtils()

    """ Get data region. Returns element tree with region where are
    deliverables stored """
    def get_region(self, url, base, tolerance):
        # url: page to download; base: base url for resolving anchors;
        # tolerance: differing tags allowed per level in common-parent search
        _res = self.agent.ghap(url)
        if len(_res) == 0:
            # BUG: 'errmsg' is not defined anywhere (NameError if reached).
            # Also, ghap() appears to return a (status, err) 2-tuple (see
            # GetDelivPage._handle_unvis_links), so len(_res) == 0 can never
            # be true -- the failure branch is unreachable.  TODO fix.
            return derrno.__err__(errmsg)
        else:
            self._page = self.agent.get_etree()
        deliv_elements = self.agent.find_anchor_elem(base=base)
        if len(deliv_elements) == 0:
            # no anchor on the page looks like a deliverable document
            return derrno.__err__(derrno.ENODOC, url)
        if len(deliv_elements) == 1:
            # a single anchor forms the region by itself
            return lxml.etree.ElementTree(deliv_elements[0])
        # get parent tag of all deliverable anchors
        parent_element = self._get_common_parent(deliv_elements, tolerance)
        if parent_element == None:
            return derrno.__err__(derrno.ENOREG, "Parent element not found.")
        # get the region out of the parent element
        region = self._get_deliverable_region(parent_element)
        # if parent tag is region
        if region == 0:
            # return element tree made from parent tag
            return lxml.etree.ElementTree(parent_element)
        else:
            # NOTE(review): bare 'print' is a debug leftover (blank line on py2)
            print
            #lxml.etree.ElementTree(region).write(sys.stdout,pretty_print=True)
            return region # else return region

    """ Stabile searching parent of all elements in elem_list using method of
    making element parent vectors and comparing them. Tolerance of n tags
    makes the region smaller if there are >>not deliverable<< pdfs in more
    regions on the page."""
    def _get_common_parent(self, elem_list, tolerance):
        # supporting method - kind of bigger lambda. Get minimal length of
        # inside lists.
        def _minlength(seq_list):
            return min([len(seq) for seq in seq_list])
        # next supporting method: check the elements in list.
        # if elements are the same, its common parent tag - return True.
        def _iscommon(elem_seq, tol):
            tol_list = []
            for elem in elem_seq:
                if not elem in tol_list:
                    tol_list.append(elem)
                if len(tol_list) > tol + 1:
                    return False
            # if only two anchors found then we have only two tags
            # and its pretty hard to use tolerance, so we omit it.
            if len(elem_seq) < 3 and len(tol_list) > 1:
                return False
            return True
        # get the most frequenced tag in list
        def _most_frequent(seq):
            suplist = []
            suplist_freq = []
            for el in seq:
                if not el in suplist:
                    suplist.append(el)
                    suplist_freq.append(int(1))
                else:
                    suplist_freq[suplist.index(el)] += 1
            ind = suplist_freq.index(max(suplist_freq))
            return suplist[ind]
        #
        # now continue with method _get_common_parent()
        #
        vectors = [] # here will be vectors stored - list of lists
        # NOTE(review): 'self.elem' / 'self.last_seq' are loop scratch leaked
        # onto the instance; plain locals would do.
        for self.elem in elem_list:
            _vector = []
            while 1:
                parent = self.elem.getparent() # exception possible here
                if parent == None:
                    break
                _vector.append(parent)
                self.elem = parent
            vectors.append(_vector)
        # We have parent vectors of all elements from elem_list stored in list
        # $vectors. Then zip the vector list and get sequences of parent tags
        # (and the other tags) sorted from the highest to the lowest parent
        # element.
        zipped = [[row[-i] for row in vectors]
                  for i in range(1, _minlength(vectors) + 1)]
        # now check all lists in list zipped. If these are filled with the same
        # elements, its a common parent. The last list before difference
        # contains the main parent tag.
        self.last_seq = []
        for zipvect in zipped:
            if not _iscommon(zipvect, tolerance):
                # return most frequented element in last vector
                return _most_frequent(self.last_seq)
            self.last_seq = zipvect
        return _most_frequent(self.last_seq)

    """ Get texts from element and his descendants. If string is True,
    returns texts as one string with spaces.
    elem: lxml element """
    def _get_element_texts(self, elem, string=True):
        texts = []
        for child in elem.iter():
            # isinstance(child.tag, basestring) skips comment/PI nodes whose
            # .tag is not a string.  NOTE(review): basestring is Python 2 only.
            if child.text and isinstance(child.tag, basestring):
                if re.search("[a-z0-9]", child.text, re.I):
                    texts.append(self.formatter.format(child.text))
        if string:
            return " ".join(texts)
        return texts

    """ Get deliverable region - returns etree with region. If 0 returned
    parent_tag is region, if -1 returned some error occured searching,
    if html string returned its a region. """
    def _get_deliverable_region(self, parent_tag):
        # serialize a subtree back to an html string
        def _convert_tag_to_html(tag):
            tag_html = lxml.etree.ElementTree(tag)
            return lxml.etree.tostring(tag_html)
        # list[0] = type, list[1] = atribute, list[2] = lxml tag element
        # in case of headers list[0] = element.tag, then [2] is element
        _reg_atr = ['', None, None]
        self._result_html_region = ''
        reg_flag = False # flag indicating that we are looping over region
        # get headers first
        headers = []
        #lxml.etree.ElementTree(parent_tag).write(sys.stdout,pretty_print=True)
        for i in range(1, 7):
            headers.extend(parent_tag.findall('.//h' + str(i)))
        children = parent_tag.getchildren()
        if len(headers) > 0:
            # header-driven variant: region starts at the h1..h6 whose text
            # matches "deliverables" and ends before the next same-level header
            for head in headers:
                text = self._get_element_texts(head)
                if text:
                    if re.search("deliverables", text, re.I):
                        _reg_atr[0] = head.tag
                        _reg_atr[2] = head
                        break
            if _reg_atr[2] == None:
                return 0
            # visit all tag in parent_tag
            for tag in parent_tag.iterdescendants():
                if tag.tag == 'img':
                    continue
                text = self._get_element_texts(tag)
                if tag.tag == 'a' and not tag.text:
                    if tag.find('img') is not None:
                        text = tag.find('img').tail
                    else:
                        text = ' '
                if text:
                    if re.search("deliverables", text, re.I) and \
                       tag.tag == _reg_atr[0]:
                        # "deliverable" title, BEGIN of region
                        reg_flag = True
                    elif not re.search("deliverables", text, re.I) and \
                         tag.tag == _reg_atr[0]:
                        # next similar title, END of region
                        if reg_flag:
                            break
                # region content
                if tag in children and reg_flag:
                    self._result_html_region += _convert_tag_to_html(tag)
                elif tag.getparent() in children and reg_flag:
                    self._result_html_region += _convert_tag_to_html(
                        tag.getparent())
                    children.remove(tag.getparent())
        # if we dont have headers, try to find other kind of header (title)
        # "Deliverables" and compare with other elements with the same class
        # or id.
        else:
            for tag in parent_tag.iter():
                if tag.text:
                    if re.search("deliverables", tag.text, re.I):
                        if tag.get("class"):
                            _reg_atr[0] = 'class'
                            _reg_atr[1] = tag.get("class")
                            _reg_atr[2] = tag
                            break
                        elif tag.get("id"):
                            _reg_atr[0] = 'id'
                            _reg_atr[1] = tag.get("id")
                            _reg_atr[2] = tag
                            break
                        elif tag.get("style"):
                            _reg_atr[0] = 'style'
                            _reg_atr[1] = tag.get("style")
                            _reg_atr[2] = tag
                            break
            # test _reg_atr. If there is no deliverable region, then all
            # documents make the region
            if _reg_atr[2] == None:
                return 0
            reg_flag = False
            # visit all tag in parent_tag
            for tag in parent_tag.iterdescendants():
                if tag.tag == 'a' and not tag.text:
                    if tag.find('img') is not None:
                        tag.text = tag.find('img').tail
                    else:
                        tag.text = ' '
                if tag.text:
                    if re.search("deliverables", tag.text, re.I) and \
                       tag.get(_reg_atr[0]) == _reg_atr[1]:
                        # "deliverable" title, BEGIN of region
                        reg_flag = True
                    elif not re.search("deliverables", tag.text, re.I) and \
                         tag.get(_reg_atr[0]) == _reg_atr[1]:
                        # next similar title, END of region
                        if reg_flag:
                            break
                # region content
                if tag in children and reg_flag:
                    self._result_html_region += _convert_tag_to_html(tag)
                    children.remove(tag)
                elif tag.getparent() in children and reg_flag:
                    self._result_html_region += _convert_tag_to_html(
                        tag.getparent())
                    children.remove(tag.getparent())
        if not self._result_html_region:
            return 0
        # create ElementTree from region
        try:
            return lxml.etree.fromstring(self._result_html_region)
        except:
            # fall back to the forgiving HTML parser on malformed markup
            try:
                parser = lxml.etree.HTMLParser()
                return lxml.etree.fromstring(self._result_html_region, parser)
            except lxml.etree.XMLSyntaxError:
                return 0
class GetDeliverableRegion: def __init__(self): # init agent for parsing html self.agent = GetHTMLAndParse() # format text self.formatter = TextFormatUtils() """ Get data region. Returns element tree with region where are deliverables stored """ def get_region(self, url, base, tolerance): _res = self.agent.ghap(url) if len(_res) == 0: return derrno.__err__(errmsg) else: self._page = self.agent.get_etree() deliv_elements = self.agent.find_anchor_elem(base=base) if len(deliv_elements) == 0: return derrno.__err__(derrno.ENODOC, url) if len(deliv_elements) == 1: return lxml.etree.ElementTree(deliv_elements[0]) # get parent tag of all deliverable anchors parent_element = self._get_common_parent(deliv_elements, tolerance) if parent_element == None: return derrno.__err__(derrno.ENOREG, "Parent element not found.") # get the region out of the parent element region = self._get_deliverable_region(parent_element) # if parent tag is region if region == 0: # return element tree made from parent tag return lxml.etree.ElementTree(parent_element) else: print #lxml.etree.ElementTree(region).write(sys.stdout,pretty_print=True) return region # else return region """ Stabile searching parent of all elements in elem_list using method of making element parent vectors and comparing them. Tolerance of n tags makes the region smaller if there are >>not deliverable<< pdfs in more regions on the page.""" def _get_common_parent(self, elem_list, tolerance): # supporting method - kind of bigger lambda. Get minimal length of # inside lists. def _minlength(seq_list): return min([len(seq) for seq in seq_list]) # next supporting method: check the elements in list. # if elements are the same, its common parent tag - return True. def _iscommon(elem_seq, tol): tol_list = [] for elem in elem_seq: if not elem in tol_list: tol_list.append(elem) if len(tol_list) > tol+1: return False # if only two anchors found then we have only two tags # and its pretty hard to use tolerance, so we omit it. 
if len(elem_seq) < 3 and len(tol_list) > 1: return False return True # get the most frequenced tag in list def _most_frequent(seq): suplist = [] suplist_freq = [] for el in seq: if not el in suplist: suplist.append(el) suplist_freq.append(int(1)) else: suplist_freq[suplist.index(el)] += 1 ind = suplist_freq.index(max(suplist_freq)) return suplist[ind] # # now continue with method _get_common_parent() # vectors = [] # here will be vectors stored - list of lists for self.elem in elem_list: _vector = [] while 1: parent = self.elem.getparent() # exception possible here if parent == None: break _vector.append(parent) self.elem = parent vectors.append(_vector) # We have parent vectors of all elements from elem_list stored in list # $vectors. Then zip the vector list and get sequences of parent tags (and the # other tags) sorted from the highest to the lowest parent element. zipped = [[row[-i] for row in vectors] for i in range(1, _minlength(vectors)+1)] # now check all lists in list zipped. If these are filled with the same # elements, its a common parent. The last list before difference contains # the main parent tag. self.last_seq = [] for zipvect in zipped: if not _iscommon(zipvect, tolerance): # return most frequented element in last vector return _most_frequent(self.last_seq) self.last_seq = zipvect return _most_frequent(self.last_seq) """ Get texts from element and his descendants. If string is True, returns texts as one string with spaces. elem: lxml element """ def _get_element_texts(self, elem, string=True): texts = [] for child in elem.iter(): if child.text and isinstance(child.tag, basestring): if re.search("[a-z0-9]", child.text, re.I): texts.append(self.formatter.format(child.text)) if string: return " ".join(texts) return texts """ Get deliverable region - returns etree with region. If 0 returned parent_tag is region, if -1 returned some error occured searching, if html string returned its a region. 
""" def _get_deliverable_region(self, parent_tag): def _convert_tag_to_html(tag): tag_html = lxml.etree.ElementTree(tag) return lxml.etree.tostring(tag_html) # list[0] = type, list[1] = atribute, list[2] = lxml tag element # in case of headers list[0] = element.tag, then [2] is element _reg_atr = ['',None,None] self._result_html_region = '' reg_flag = False # flag indicating that we are looping over region # get headers first headers = [] #lxml.etree.ElementTree(parent_tag).write(sys.stdout,pretty_print=True) for i in range(1,7): headers.extend(parent_tag.findall('.//h'+str(i))) children = parent_tag.getchildren() if len(headers) > 0: for head in headers: text = self._get_element_texts(head) if text: if re.search("deliverables", text, re.I): _reg_atr[0] = head.tag _reg_atr[2] = head break if _reg_atr[2] == None: return 0 # visit all tag in parent_tag for tag in parent_tag.iterdescendants(): if tag.tag == 'img': continue; text = self._get_element_texts(tag) if tag.tag == 'a' and not tag.text: if tag.find('img') is not None: text = tag.find('img').tail else: text = ' ' if text: if re.search("deliverables", text, re.I) and \ tag.tag == _reg_atr[0]: # "deliverable" title, BEGIN of region reg_flag = True elif not re.search("deliverables", text, re.I) and \ tag.tag == _reg_atr[0]: # next similar title, END of region if reg_flag: break # region content if tag in children and reg_flag: self._result_html_region += _convert_tag_to_html(tag) elif tag.getparent() in children and reg_flag: self._result_html_region+=_convert_tag_to_html(tag.getparent()) children.remove(tag.getparent()) # if we dont have headers, try to find other kind of header (title) # "Deliverables" and compare with other elements with the same class or id. 
else: for tag in parent_tag.iter(): if tag.text: if re.search("deliverables", tag.text, re.I): if tag.get("class"): _reg_atr[0] = 'class' _reg_atr[1] = tag.get("class") _reg_atr[2] = tag break elif tag.get("id"): _reg_atr[0] = 'id' _reg_atr[1] = tag.get("id") _reg_atr[2] = tag break elif tag.get("style"): _reg_atr[0] = 'style' _reg_atr[1] = tag.get("style") _reg_atr[2] = tag break # test _reg_atr. If there is no deliverable region, then all # documents make the region if _reg_atr[2] == None: return 0 reg_flag = False # visit all tag in parent_tag for tag in parent_tag.iterdescendants(): if tag.tag == 'a' and not tag.text: if tag.find('img') is not None: tag.text = tag.find('img').tail else: tag.text = ' ' if tag.text: if re.search("deliverables", tag.text, re.I) and \ tag.get(_reg_atr[0]) == _reg_atr[1]: # "deliverable" title, BEGIN of region reg_flag = True elif not re.search("deliverables", tag.text, re.I) and \ tag.get(_reg_atr[0]) == _reg_atr[1]: # next similar title, END of region if reg_flag: break # region content if tag in children and reg_flag: self._result_html_region += _convert_tag_to_html(tag) children.remove(tag) elif tag.getparent() in children and reg_flag: self._result_html_region+=_convert_tag_to_html(tag.getparent()) children.remove(tag.getparent()) if not self._result_html_region: return 0 # create ElementTree from region try: return lxml.etree.fromstring(self._result_html_region) except: try: parser = lxml.etree.HTMLParser() return lxml.etree.fromstring(self._result_html_region, parser) except lxml.etree.XMLSyntaxError: return 0
class GetDelivPage:
    """Crawl a project web site (breadth-limited) looking for the page that
    lists the project's deliverable documents.

    Links are kept in self._link_stack as
        { url : [Index/NoIndex/Frame, Visit/Visited, Rank] }
    where index = 0, noindex = 1, frame = 2, special/broken = 3,
    unvisited = 0, visited = 1, and Rank accumulates heuristic scores.
    """

    def __init__(self, url, verbose=False, debug=False, addkeyw=None):
        """url: start page; verbose/debug: console output flags;
        addkeyw: optional extra keyword pattern to search anchors for."""
        # keyword regexes used for document page search, most significant first
        self._sigwords = ["d((eliverables?)|[0-9])",
                          "documents?",
                          "reports?",
                          "public(ation)?s?",
                          "results?",
                          "presentations?",
                          "library",
                          #"projects?",
                          "outocomes?",
                          "downloads?",
                          "outputs?"]
        if addkeyw is not None:
            self._sigwords.append(addkeyw)
        # { url : [Index/NoIndex/Frame, Visit/Visited, Rank] } -- see class doc
        self._link_stack = {url: [0, 0, 0]}
        self.base_url = url  # save base (input) url
        # parsing agent used to fetch pages and extract data
        self.agent = GetHTMLAndParse()
        self._current_url = url
        # constant used to scale ranks by importance of the matched
        # expression in self._sigwords
        self.rank_const = len(self._sigwords)
        # named indexes into the link-stack value lists
        self.IND_FR = 0  # index/noindex/frame/special
        self.VISIT = 1   # unvisited/visited
        self.RANK = 2    # value of rank
        self.__verbose__ = verbose
        self.__dbg__ = debug
        # FIX: isinstance instead of type() == bool
        if not isinstance(self.__verbose__, bool):
            raise ValueError("Verbose flag has to be boolean.")

    def __verbose(self, msg):
        """Print msg when verbose mode is on; tolerate undecodable text."""
        _err = "cannot decode verbose message."
        if self.__verbose__:
            try:
                print(str(msg))
            except UnicodeError:
                print(_err)

    def __debug(self, msg):
        """Print msg when debug mode is on; tolerate undecodable text."""
        _err = "cannot decode debug info."
        if self.__dbg__:
            try:
                print("Debug message: " + str(msg))
            except UnicodeError:
                print(_err)

    def _link_item_init__(self, link, index=1, visit=0, rank=0):
        """Initialize a link-stack item; default is noindex/unvisited/rank 0.
        Existing entries are left untouched."""
        # FIX: dict.has_key() is Python-2-only; `in` works on both 2 and 3
        if link not in self._link_stack:
            self._link_stack[link] = [index, visit, rank]

    def _link_item_edit(self, link, index=None, visit=None, rank=None):
        """Edit an item in self._link_stack.  rank=0 resets the rank,
        any other rank is added to the current value."""
        if index is not None:
            self._link_stack[link][self.IND_FR] = index
        if visit is not None:
            self._link_stack[link][self.VISIT] = visit
        if rank is not None:
            if rank == 0:
                # null the rank when zero is passed
                self._link_stack[link][self.RANK] = 0
            else:
                # otherwise accumulate
                self._link_stack[link][self.RANK] += rank

    def _level_job(self, index=None):
        """One level of the cascade: collect links whose anchors match
        self._sigwords[index] (all links when index is None), rank the
        current page accordingly and push new page links on the stack.
        Returns the number of newly added links."""
        result = 0
        if index is not None:
            # search anchors matching a single keyword
            link_list = self.agent.get_all_links(
                regul=re.compile(self._sigwords[index], re.I),
                base=self._current_url)
        else:
            # no keyword: take every link on the page
            link_list = self.agent.get_all_links(base=self._current_url)
            index = self.rank_const
        if link_list:
            # rank contribution of this keyword level
            if index is None:
                rank = 0  # unreachable (index was reassigned above); kept as in original
            elif index == 0:
                rank = self.rank_const * 2
            else:
                rank = self.rank_const - index
            for link in link_list:
                # skip javascript/mailto pseudo-links
                if not link or "javascript:" in link or "mailto:" in link:
                    continue
                if "#" in link:
                    # strip fragment pointer
                    link = re.sub('#.*$', '', link)
                if len(link) > 200:
                    continue
                if self._link_stack.get(link):
                    # already known: rank the referrer only on first sight
                    if self._link_stack[link][self.VISIT] == 0:
                        self._link_item_edit(self._current_url, rank=rank)
                    continue
                if not self.agent.compare_domains(self.base_url, link):
                    continue
                split_link = re.sub("https?://.+?/", "", link)
                # check whether it is a document file or a page
                if self.agent.is_wanted_mime(link):
                    # some PDF or DOC found
                    self._link_item_edit(self._current_url, rank=10)
                    self.__debug("Added rank 10 to " + self._current_url)
                    # deliverable-looking file name scores much higher
                    if re.search("de?l?(iverable)?[0-9]+([\._-][0-9])?",
                                 split_link, re.I):
                        self.__debug("Type D on " + self._current_url)
                        self._link_item_edit(self._current_url, rank=100)
                    continue
                elif not self.agent.is_page(link):
                    continue
                self.__debug("UNWATED")
                # new page link: store it and rank the referrer
                self._link_item_init__(link)
                self._link_item_edit(self._current_url, rank=rank)
                result += 1
                self.__debug("ADD " + link[7:60])
                self.__debug("Rank " + str(rank) + " " + self._current_url)
        return result

    def _cascade_search(self):
        """Cascade search: try the most significant keywords first and stop
        as soon as one cascade yields links.  May improve script speed."""
        result = 0
        # first cascade - links containing "deliverables"
        result += self._level_job(0)
        if result != 0:
            return
        # second cascade - "documents" and "reports"
        result += self._level_job(1)
        result += self._level_job(2)
        if result != 0:
            return
        # last cascade - all the remaining keywords
        for i in range(3, self.rank_const):
            result += self._level_job(i)
        # check intro page (all links) only on the index page
        if result == 0 and self._link_stack[self._current_url][0] == 0:
            result += self._level_job()
        return

    def _repair_links(self, base=None):
        """Try to repair links; for now only appends '/' to the base and
        re-registers same-domain links as special (index=3) entries."""
        if base is None:
            base = self.base_url
        if re.match(".*[^/]$", base):
            base += "/"
        if self.agent.get_etree() == -1:
            return -1
        links = self.agent.get_all_links(base=base)
        for link in links:
            # only keep links on the same domain as the base url
            if not self.agent.compare_domains(self.base_url, link):
                continue
            link = re.sub("https?://.+?/", base, link)
            # save the rewritten link as a special case
            self._link_item_init__(link, index=3)

    def _check_intro(self):
        """Check an intro page (content-free page with only an Enter label)
        and store its same-domain links as normal pages."""
        links = self.agent.get_all_links(base=self._current_url)
        self.__debug("We've found intro links: " + str(links))
        for link in links:
            if not self.agent.compare_domains(self.base_url, link):
                continue
            self._link_item_init__(link, index=1)

    def _check_frames(self):
        """Look for frames on the page; store frame targets (index=2).
        Returns the frame count, or None when there are no frames."""
        frames = self.agent.look_for_frame(base=self._current_url)
        if not frames:
            return None
        fcount = len(frames)
        self.__debug("We've found frames (" + str(fcount) + ") on "
                     + self._current_url)
        for link in frames:
            if self.agent.compare_domains(self._current_url, link):
                self._link_item_init__(link, index=2)
        return fcount

    def _check_titles(self):
        """Check page headers against the keywords and rank the current url:
        a "deliverable" header scores rank_const*4, others rank_const-i."""
        for i in range(self.rank_const):
            hcount = self.agent.count_all_headers(
                re.compile(self._sigwords[i], re.I))
            if hcount != 0:
                if i == 0:
                    # "deliverable" match -- the highest rank
                    self.__debug("deliverable match"
                                 + str(self.rank_const * 4) + " "
                                 + self._current_url)
                    self._link_item_edit(self._current_url,
                                         rank=self.rank_const * 4)
                else:
                    # other keyword match -- unscaled rank
                    self.__debug("Rank " + str(self.rank_const - i) + " "
                                 + self._current_url)
                    self._link_item_edit(self._current_url,
                                         rank=self.rank_const - i)

    def _check_anchor(self):
        """Check the anchor text/title of the current url.  Returns 0 when
        no anchor is known, 1 on a "deliverable" match, None otherwise."""
        # tt is Text and Title
        tt = self.agent.get_anchor_from_link(self._current_url)
        if tt == 0:
            # no anchor found for this link
            return tt
        if re.search(self._sigwords[0], tt, re.I):
            self.__debug("Anchor matched " + self._current_url)
            return 1

    def _check_unvisited_links(self):
        """Return the list of links not visited yet.  Useful in a cycle."""
        unvisited = []
        for link in self._link_stack:
            if self._link_stack[link][self.VISIT] == 0:
                unvisited.append(link)
        return unvisited

    def _handle_unvis_links(self):
        """Apply all checks to every unvisited link -- the next level of the
        search.  Main private method; the only one that can decide the end
        of searching.  Returns None when nothing is left, 1 otherwise."""
        unvisLinks = self._check_unvisited_links()
        if not unvisLinks:
            return None  # end of searching
        for link in unvisLinks:
            # visit and parse the page
            self._link_item_edit(link, visit=1)
            (res, err) = self.agent.ghap(link)
            if res == -1:
                self.__debug(str(err) + " " + str(link))
                # if the link is not already a repaired one (IND_FR == 3),
                # try to repair the page's links
                if self._link_stack[link][self.IND_FR] != 3:
                    self._repair_links()
                continue
            # little hack with the error slot: on res == 2 it carries the
            # redirected URL, not an error
            if res == 2:
                self.base_url = err  # URL of the new base
            self.__debug("Getting url in ghap(): " + str(link))
            self.__verbose("Searching... URL: " + str(link))
            self._current_url = link
            if self._link_stack[link][self.IND_FR] == 2:
                # frame page: rebase on its domain
                dname = self.agent.get_domain_name(link)
                if dname is not None:
                    self.base_url = dname
            # frame check
            self._check_frames()
            # titles check (gives rank)
            self._check_titles()
            # anchor check (gives rank too)
            if self._check_anchor():
                self._link_item_edit(link, rank=10)
            # search for next links on this page
            self._cascade_search()
        return 1

    def _get_highest_ranks_link(self):
        """Return the link with the highest rank in self._link_stack.
        Called at the end of the process."""
        hRank = 0
        hLink = ""
        for link in self._link_stack:
            if self._link_stack[link][self.RANK] > hRank:
                hLink = link
                hRank = self._link_stack[link][self.RANK]
        return hLink  # WINNER

    def _get_deliv_link_list(self, first_link):
        """Follow pager links ("next pages: 1 2 3 4 ...") starting from
        first_link and return the list of all deliverable pages found."""
        final_list = []
        nonvisited = [first_link]
        current = nonvisited.pop()
        while current:
            if not current or "javascript:" in current or "mailto:" in current:
                # FIX: narrowed bare except -- list.pop() on an empty list
                # raises IndexError only
                try:
                    current = nonvisited.pop()
                except IndexError:
                    break
                continue
            if self.agent.ghap(current)[0] == -1:
                # CACHE ??? maybe
                try:
                    current = nonvisited.pop()
                except IndexError:
                    break
                continue
            nonvisited.extend(self.agent.get_pager_links(base=current))
            final_list.append(current)  # append only one link
            try:
                current = nonvisited.pop()
            except IndexError:
                break
        return final_list  # all pages with deliverables

    def get_deliverable_page(self):
        """Return a list with the link to the deliverable page, or a derrno
        error when none was found.  Only public method in the module."""
        # main searching loop: while there are unvisited links, search on
        while self._handle_unvis_links():
            # security stop so the crawl cannot grow without bound
            if len(self._link_stack) > 10:
                break
            self.__debug("Stack content: " + str(self._link_stack))
        if len(self._link_stack) == 1:
            return derrno.__err__(derrno.ELNOTFOUND)
        final_link = self._get_highest_ranks_link()
        if not final_link or self._link_stack[final_link][2] == 0:
            return derrno.__err__(derrno.ELNOTFOUND)
        self.__debug('#' * 79)
        self.__debug("DELIVERABLE PAGE: " + final_link)
        # FIX: removed the unreachable tail that called
        # _get_deliv_link_list() after this return (was marked "not in use")
        return [final_link]