class GetDelivRecords:
    """Extract project-deliverable records (title, description, document
    link) from web pages.

    Pages are crawled and run through HTMLSequenceWrapper; each wrapper
    entry is inspected for deliverable keywords in its texts and links.
    When that yields too few records, a manual fallback locates the data
    region itself and handles it either as a table or as a repeating tag
    sequence.  Results are RRSPublication objects.

    NOTE(review): the source file defined this class twice with identical
    bodies; the redundant first definition (which the second silently
    shadowed) has been removed.
    """

    def __init__(self, verbose=False, debug=False):
        """Set up crawler, wrapper, region handler and text formatter.

        verbose -- print progress messages
        debug   -- print detailed debug messages
        """
        self.__dbg__ = debug
        # BUGFIX: the flag was stored as self.__verbos but tested as
        # self.__verbose, which name-mangles to the bound __verbose()
        # method itself -- the comparison with True was always False and
        # verbose output never printed.  Stored under a non-colliding name.
        self.__verbose_flag = verbose
        self._crawler = Crawler()
        self._crawler.set_headers((
            ('User-Agent',
             'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.19) Gecko/2010040116 Ubuntu/9.04 (jaunty) Firefox/3.0.19'),
            ('Accept',
             'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
        ))
        self._wraper = HTMLSequenceWrapper(childcoef=5.0, headercoef=3.0,
                                           mintextlen=20)
        # titles consisting only of these strings are considered invalid
        self._unwanted_titles = ['Download here', 'PDF format']
        self._records = []
        ################################
        # manual processing
        self.agent = GetHTMLAndParse()
        # to get region where to search for records
        self.regionHandler = GetDeliverableRegion()
        # init text formatter (encoding, erasing white chars etc.)
        self.formatter = TextFormatUtils()
        # tags ignored by the tag-sequence recognizer
        self._omitted_tags = ('br', 'img', 'html', 'body')
        # tag tolerance
        self.tagtol = 1

    def __debug(self, msg):
        """Print msg when debug mode is on; swallow undecodable output."""
        _err = "cannot decode debug info."
        if self.__dbg__ == True:
            try:
                print("Debug message: " + str(msg))
            except UnicodeError:
                print(_err)

    def __verbose(self, msg):
        """Print msg when verbose mode is on; swallow undecodable output."""
        _err = "cannot decode verbose info."
        if self.__verbose_flag == True:
            try:
                print("Verbose: " + str(msg))
            except UnicodeError:
                print(_err)

    ######################## Processing sequencewrapper output ################

    def _make_deliv_record(self, entry):
        """Try to build a deliverable record from one sequence-wrapper entry.

        Harvests all texts and document links from the entry, then tries
        text-based and link-based recognition.  Returns True on success.
        """
        text = []
        links = []
        # harvest links and text from the entry
        for e in entry.iter():
            if e.text is not None:
                text.append(e.text)
            link = e.attrib.get("link")
            if link is not None:
                if self.agent.is_wanted_mime(link) and link not in links:
                    links.append(link)
        res = self._deliv_in_text(text, links)
        if isinstance(res, RRSPublication):
            self._entriesFoundInText.append(res)
            self.__debug("Record found cause of text")
            return True
        elif isinstance(res, list):
            # error list => probably several records packed into one entry
            if self._more_entry_in_record(entry) == True:
                self.__debug("")
                return True
            else:
                return False
        res = self._deliv_in_link(text, links, entry)
        if isinstance(res, RRSPublication):
            self._entriesFoundInLinks.append(res)
            self.__debug("Record found cause of link")
            return True
        return False

    def _deliv_in_text(self, text, links):
        """Look for deliverable keywords in the entry texts.

        Returns an RRSPublication on success, an ['-3', ...] error list
        when the entry probably contains several records, False otherwise.
        """
        _title = False
        _description = ""
        # "DELIVERABLE(S)" or a deliverable code such as D1 / D12.3
        # BUGFIX: the dot in the code pattern was unescaped and matched
        # any character.
        pattern = re.compile(r"(DELIVERABLES?)|(D[0-9][0-9]*(\.[0-9][0-9]*)?)",
                             re.I)
        # loop through texts looking for a title and a description
        for t in text:
            if _title == False:
                if pattern.search(t):
                    _title = t
            # the longest string becomes the description of the deliverable
            if len(_description) < len(t):
                _description = t
        if _title == _description:
            _description = ""
        _link = False
        if type(links) == str:
            if self.agent.is_wanted_mime(links):
                _link = links
        elif type(links) == list:
            for l in links:
                if self.agent.is_wanted_mime(l):
                    if _link == False:
                        _link = l
                    else:
                        # a link was already found; identical base names
                        # (same document, different extension) are fine,
                        # otherwise there are several records in the entry
                        if _link[:_link.rfind('.')] == l[:l.rfind('.')]:
                            break
                        else:
                            return ['-3', 'Probably more records in one entry']
        # create the publication object
        if _title:
            pub = RRSPublication(title=_title, abstract=_description)
            _typ = RRSPublication_type(type='techreport')
            pub['type'] = _typ
            self.__debug("*" * 40)
            self.__debug("Title: " + _title)
            self.__debug("Description: " + _description)
            if _link:
                self.__debug("Link: " + _link)
                l = RRSUrl(link=_link)
                pl_rel = RRSRelationshipPublicationUrl()
                pl_rel.set_entity(l)
                pub['url'] = pl_rel
            return pub
        else:
            # this entry is probably not a deliverable
            return False

    def _deliv_in_link(self, text, links, entry=False):
        """Look for deliverable keywords in the links.

        Returns an RRSPublication on success, an ['-3', ...] error list
        when several matching links are present, False otherwise.
        """
        _title = False
        _description = ""
        pattern = re.compile(r"(DELIVERABLES?)|(D[0-9][0-9]*(\.[0-9][0-9]*)?)",
                             re.I)
        _link = False
        for l in links:
            if pattern.search(l):
                if _link == False:
                    _link = l
                else:
                    return ['-3', 'Probably more records in one entry']
        # loop through texts looking for a title and a description
        for t in text:
            if _title == False:
                if len(t) > 10:
                    _title = t
            # the longest string becomes the description
            if len(_description) < len(t):
                _description = t
        if _title == _description:
            _description = ""
        # if the chosen title is not valid try to find a better one
        # in the parent entry
        if _title and not self._check_title(_title) and entry != False:
            _title = self._repair_title(entry)
        # create the publication object
        if _link:
            pub = RRSPublication(title=_title, abstract=_description)
            typ = RRSPublication_type(type='techreport')
            pub['type'] = typ
            self.__debug("*" * 40)
            self.__debug("Title: " + _title)
            self.__debug("Description: " + _description)
            self.__debug("Link: " + _link)
            l = RRSUrl(link=_link)
            pl_rel = RRSRelationshipPublicationUrl()
            pl_rel.set_entity(l)
            pub['url'] = pl_rel
            return pub
        else:
            # this entry is probably not a deliverable
            return False

    def _check_title(self, title, tolerance=10):
        """Return True when the title is OK, i.e. it does not consist only
        of an unwanted string (within some length tolerance)."""
        for t in self._unwanted_titles:
            # str methods instead of the deprecated string-module helpers
            if title.lower().find(t.lower()) != -1:
                if (len(t) + tolerance) > len(title):
                    return False
        return True

    def _repair_title(self, entry):
        """Look for the element with the highest visibility rank in the
        parent element; return its text, or False when none found."""
        parent = entry.getparent()
        visibility = 0
        title = ""
        for i in parent.iter():
            try:
                if i.attrib.get('visibility') > visibility:
                    visibility = i.attrib.get('visibility')
                    title = i.text
            except AttributeError:
                pass
        if title != "":
            return title
        else:
            return False

    def _more_entry_in_record(self, entry):
        """Create a deliverable record for every document chunk found in
        one entry of the xml tree.

        Returns True when at least one record was created.  BUGFIX: the
        original returned nothing, so its caller's `== True` test could
        never succeed even when records were added.
        """
        found = False
        for ch in entry.iter('chunk'):
            if ch.text is not None and ch.attrib.get("link") is not None:
                if self.agent.is_wanted_mime(ch.attrib.get("link")):
                    _pub = RRSPublication(title=ch.text)
                    typ = RRSPublication_type(type='techreport')
                    _pub['type'] = typ
                    _l = RRSUrl(link=ch.attrib.get("link"))
                    _rel = RRSRelationshipPublicationUrl()
                    _rel.set_entity(_l)
                    _pub['url'] = _rel
                    self._entriesFoundInLinks.append(_pub)
                    found = True
        return found

    def process_pages(self, pages):
        """Process the pages given by urls and collect deliverable records.

        pages -- list of urls to crawl and analyse
        """
        self._entriesFoundInText = []
        self._entriesFoundInLinks = []
        self._urls = pages
        self._pages = self._crawler.start(pages)
        # creates RRSPublication objects with information about deliverables
        for u in self._urls:
            self._wraper.wrap(self._pages[u], u)
            self._tree = self._wraper.get_etree()
            for entry in self._tree.iter("entry"):
                self._make_deliv_record(entry)
        # more than 3 hits of one kind => trust the sequencewrapper output,
        # otherwise fall back to manual processing
        if len(self._entriesFoundInText) > 3:
            self.__debug("Deliverbles descriptions content keywords")
            self.__debug("Found " + "{0}".format(len(self._entriesFoundInText))
                         + " deliv records")
            self._records = self._entriesFoundInText
        elif len(self._entriesFoundInLinks) > 3:
            self.__debug("Deliverbles links content keywords")
            self.__debug("Found " + "{0}".format(len(self._entriesFoundInLinks))
                         + " deliv records")
            self._records = self._entriesFoundInLinks
        else:
            self._manual_processing()

    def _manual_processing(self):
        """Fallback called when no records were found in the output of the
        sequencewrapper."""
        self._entriesFoundInLinks = []
        self._entriesFoundInText = []
        self._manual_process_page(self._urls, urlsplit(self._urls[0])[1])
        if len(self._entriesFoundInText) > 0:
            self.__debug("Deliverbles descriptions content keywords")
            self.__debug("Found " + "{0}".format(len(self._entriesFoundInText))
                         + " deliv records")
            self._records = self._entriesFoundInText
        elif len(self._entriesFoundInLinks) > 0:
            self.__debug("Deliverbles links content keywords")
            self.__debug("Found " + "{0}".format(len(self._entriesFoundInLinks))
                         + " deliv records")
            self._records = self._entriesFoundInLinks

    ########################### TABLE HANDLING METHODS ########################

    def _get_descendats_texts(self, elem, string=True):
        """Get texts from the element and its descendants.

        elem   -- lxml element
        string -- when set, return the texts joined into one string
        """
        texts = []
        for child in elem.iter():
            if child.text and isinstance(child.tag, basestring):
                if re.search("[a-z0-9]", child.text, re.I):
                    texts.append(self.formatter.format(child.text))
        if string:
            return " ".join(texts)
        return texts

    def _get_row_link(self, row):
        """Get the link from a table row: walk its anchors and return the
        first href leading to a deliverable document, or None."""
        # find all anchors below the row
        linkanch = row.findall('.//a[@href]')
        if len(linkanch) == 0:
            return None
        for link in linkanch:
            anchor_link = link.get('href')
            # check if it is a file we want
            if self.agent.is_wanted_mime(anchor_link):
                return anchor_link
        return None

    def _handle_table(self):
        """Handle the region as a table: walk its rows and try to build a
        deliverable record out of each row's texts and document link."""
        for row in self.parentetree:
            if not row.tag == 'tr':
                continue
            row_list = []
            _thislink = self._get_row_link(row)
            if _thislink is None:
                continue
            for column in row:
                text = self._get_descendats_texts(column)
                if not text:
                    continue
                row_list.append(text)
            res = self._deliv_in_text(row_list, [_thislink])
            if isinstance(res, RRSPublication):
                self._entriesFoundInText.append(res)
                self.__debug("Record found cause of text")
            else:
                res = self._deliv_in_link(row_list, [_thislink])
                if isinstance(res, RRSPublication):
                    self._entriesFoundInLinks.append(res)
                    self.__debug("Record found cause of link")
        return

    ######################## TAG SEQUENCE RECOGNIZING METHODS #################

    def _is_deliv_anch(self, tag):
        """Return True when the tag is an anchor whose href leads to a
        deliverable document."""
        if tag.tag == 'a':
            href = tag.get('href')
            if self.agent.is_wanted_mime(href):
                return True
        return False

    def _tagfilter(self, tag):
        """Filter useless and messy tags: False for omitted tags, True for
        a normal tag."""
        if tag.tag in self._omitted_tags:
            return False
        return True

    def _getdiff(self, reg, tol):
        """Get the typical tag distance between deliverable anchors.

        reg -- element tree region
        tol -- accepted tolerance of tags
        Returns the most frequent distance, or -1 when the variety of
        intervals is too high.
        """
        d = {}
        index = 0
        # fill the dictionary with differences and their occurrences
        for tag in reg.iter():
            if not self._tagfilter(tag):
                continue
            if self._is_deliv_anch(tag) and not index == 0:
                try:
                    d[index] += 1
                except KeyError:
                    d[index] = 1
                index = 0
            index += 1
        # check differences; reject when the variety exceeds the tolerance
        difflist = list(d.keys())
        self.__debug("difflist: " + str(difflist))
        if len(difflist) == 0:
            return -1
        _max = max(difflist)
        _min = min(difflist)
        dlen = len(difflist)
        if dlen == 1:
            return difflist[0]
        if dlen > ((2 * tol) + 1):  # tolerance to both sides
            return -1
        if (_max - _min) > 2 * tol:  # some acceptable tolerance
            return -1
        # get the most frequent difference
        most_freq = max(d.values())
        for key in d:
            if d[key] == most_freq:
                return key
        return -1

    def _get_anch_only(self):
        """Only anchors found -- no optional information available."""
        anchlist = self.agent.find_anchor_elem(self.baseUrl, self.parentetree)
        # we have to make a list of lists because of XMLOutput
        return [[anch] for anch in anchlist]

    def _get_tag_sequences(self, tag_tol=1):
        """Main method handling tag sequences and recognizing records.

        Returns a list of records, each a list of tags.
        """
        records = []
        self._rec = []
        if len(self.parentetree) == 0:
            return [[self.parentetree]]
        # get the interval between anchors, using tolerance self.tagtol
        self.difference = self._getdiff(self.parentetree, self.tagtol)
        while self.difference == -1:
            if self.tagtol > 5:
                self.__verbose("Variety of intervals between anchors is too huge. "
                               "Getting data out of anchors only")
                return self._get_anch_only()
            self.tagtol += 1
            self.difference = self._getdiff(self.parentetree, self.tagtol)
        # get the sequence of the first n tags, where n is the average
        # interval between anchors -- this could be the tag-sequence
        # describing all records in the region
        self.record_seq = []
        i = 0
        for tag in self.parentetree.iter():
            if not self._tagfilter(tag):
                continue
            if i >= self.difference:
                if not 'a' in self.record_seq:
                    # no anchor captured yet -- slide the window forward
                    del self.record_seq[0]
                else:
                    break
            self.record_seq.append(tag.tag)
            i += 1
        # counter indicates on which position in the tag sequence we are
        counter = 0
        # make the sequence of tags as they go
        regionlist = [tag for tag in self.parentetree.iter()
                      if self._tagfilter(tag)]
        recseqlen = len(self.record_seq)
        reglistlen = len(regionlist)
        # flag indicating begin of records -- at the beginning of the
        # region there can be some garbage
        self.begin = False
        # counts unpredictable separators between deliverable records
        self.separator = 0
        for i, tag in enumerate(regionlist):
            # sequence complete: save it and start a new one
            if counter > self.difference - 1:
                records.append(self._rec)
                self._rec = []
                counter = 0
            if not self.begin:
                if tag.tag != self.record_seq[0]:
                    continue
                else:
                    try:
                        if regionlist[i + 1].tag != self.record_seq[1]:
                            continue
                    except IndexError:
                        pass
                    self.begin = True
            # handle tolerances, try to compare sibling tags
            self.match = False
            # tolerance algorithm: walks the html and tries to pass
            # irregular tags in the sequence
            for tol in range(self.tagtol + 1):
                if tag.tag == self.record_seq[(counter + tol) % recseqlen] or \
                   regionlist[(i + tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter += tol + 1
                    break
                elif tag.tag == self.record_seq[(counter - tol) % recseqlen] or \
                     regionlist[(i - tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter -= tol
                    counter += 1
                    break
            # if nothing matched, it is probably out of tolerance
            if not self.match:
                self.separator += 1
                # tolerate 10 separators (tags between boxes or tables
                # of deliverables)
                if self.separator > 10:
                    self.__verbose("Tag sequence doesnt match, probably out of "
                                   "tolerance, getting data out of anchors only")
                    # we didnt catch the sequence with tolerance...
                    return self._get_anch_only()
        records.append(self._rec)
        return [rec for rec in records if self._validseq(rec)]

    def _validseq(self, rec):
        """Helper -- True when the sequence of tags rec contains an anchor
        leading to a deliverable document."""
        for _atr in rec:
            if self._is_deliv_anch(_atr):
                return True
        return False

    def _get_tag_content(self, tag):
        """Get the element's own texts and document links only (no
        descendants).  Returns [links, texts]."""
        links = []
        texts = []
        if tag.tag == 'a':
            href = tag.get('href')
            # if a link leading to a document is found, keep it
            if href is not None and self.agent.is_wanted_mime(href):
                links.append(self.formatter.format(href))
            title = tag.get('title')
            # if a title attribute is present, keep it
            if title:
                texts.append(self.formatter.format(title))
        # besides anchors, search text in tag.text
        if tag.text:
            if re.search("[a-z0-9]", tag.text, re.I):
                texts.append(self.formatter.format(tag.text))
        return [links, texts]

    def _harvest_text(self, record_tag_list):
        """Harvest texts out of the tag records and build publication
        objects; results go to the _entriesFound* lists."""
        self._records = []
        self._rec = []
        _links = []
        _texts = []
        # loop over records and gather all possibly useful texts
        for rec_list in record_tag_list:
            for tag in rec_list:
                harvested = self._get_tag_content(tag)
                _links.extend(harvested[0])
                _texts.extend(harvested[1])
            res = self._deliv_in_text(_texts, _links)
            if isinstance(res, RRSPublication):
                self._entriesFoundInText.append(res)
                self.__debug("Record found cause of text")
            else:
                res = self._deliv_in_link(_texts, _links)
                if isinstance(res, RRSPublication):
                    self._entriesFoundInLinks.append(res)
                    self.__debug("Record found cause of link")
            _links = []
            _texts = []
            self._rec = []
        return self._records

    def _handle_sequence(self):
        """Text harvesting for tag sequences."""
        seq = self._get_tag_sequences()
        return self._harvest_text(seq)

    def _manual_process_page(self, links, baseurl):
        """Get records from the data region according to document links;
        used when the sequencewrapper output yielded no records."""
        _err = None
        self.baseUrl = baseurl
        for link in links:
            # find region with tolerance
            self.parentetree = self.regionHandler.get_region(link, baseurl, 1)
            if type(self.parentetree) == tuple:
                # error
                _err = self.parentetree
                self.__debug(_err)
                continue
            # make all links absolute in the parent tree
            hrefs = self.parentetree.findall('.//a[@href]')
            for href in hrefs:
                href.make_links_absolute('http://' + urlsplit(link)[1] + '/')
            # get the charset; we dont have an etree in htmlHandler, so we
            # have to use the one from regionHandler
            self.formatter.set_charset(
                self.regionHandler.formatter.get_charset())
            self.__debug("*" * 100 + '\n' + "*" * 40 + " DATA REGION " + "*" * 40)
            self.__debug(
                lxml.etree.tostring(self.parentetree, pretty_print=True))
            # get the root tag (plain elements have no getroot())
            try:
                self.parentetree = self.parentetree.getroot()
            except AttributeError:
                pass
            # parent tag is a table -> call _handle_table
            if self.parentetree.tag in ('table', 'tbody'):
                self.__verbose("Handling table")
                self._handle_table()
            else:
                self.__verbose("Handling sequences")
                self._handle_sequence()

    ############# PUBLIC METHODS TO GET RESULTS

    def get_deliverables_XML(self):
        """Return information about deliverables stored in objects as xml."""
        # BUGFIX: check self._records directly -- get_deliverables()
        # returns an error value when empty, whose len() is generally
        # non-zero, so the original emptiness check never fired
        if len(self._records) == 0:
            return derrno.__err__(derrno.ENOREC)
        output = StringIO.StringIO()
        converter = Model2XMLConverter(stream=output)
        converter.convert(self.get_deliverables())
        result = output.getvalue()
        output.close()
        return result

    def get_deliverables(self):
        """Return the collected objects containing the information, or an
        error code when there are none."""
        if len(self._records) == 0:
            return derrno.__err__(derrno.ENOREC)
        else:
            return self._records
# --- Manual test driver -----------------------------------------------------
# NOTE(review): the original kept many alternate test URL sets commented out
# and also assigned a six-item `urls` list that was immediately overwritten by
# the single-item list below; the dead assignment and commented-out sets were
# removed, keeping only the effective value.
urls = ['http://www.fit.vutbr.cz/~smrz/pubs.php']

# Pretend to be a desktop Firefox so the crawler is served regular HTML.
c.set_headers((
    ('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.19) Gecko/2010040116 Ubuntu/9.04 (jaunty) Firefox/3.0.19'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
))

# Fetch all pages up front, then wrap each one and dump its XML.
pages = c.start(urls)
for u in urls:
    # wrap_h()'s return value was assigned to an unused variable `d` in the
    # original; the result is retrieved through sp.get_xml() instead.
    sp.wrap_h(pages[u], u)
    # Parenthesized print works identically in Python 2 (single argument)
    # and Python 3, replacing the py2-only `print sp.get_xml()` statement.
    print(sp.get_xml())
# --- Manual test driver (duplicate run) -------------------------------------
# NOTE(review): this section duplicates the driver above it in the file;
# consider consolidating.  As in the first copy, the original assigned a
# six-item `urls` list that was immediately overwritten and carried many
# commented-out alternate URL sets; only the effective assignment is kept.
urls = ['http://www.fit.vutbr.cz/~smrz/pubs.php']

# Browser-like headers so the crawler receives regular HTML.
c.set_headers((
    ('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.19) Gecko/2010040116 Ubuntu/9.04 (jaunty) Firefox/3.0.19'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
))

pages = c.start(urls)
for u in urls:
    # The original bound wrap_h()'s result to an unused `d`; the XML is
    # obtained via sp.get_xml(), so the assignment was dropped.
    sp.wrap_h(pages[u], u)
    # Parenthesized form is valid in both Python 2 and Python 3.
    print(sp.get_xml())