def run(self, page_dict):
    """Derive XPath templates for the article, comments and posts of a page.

    :param page_dict: dict describing the page; only the "content" key
        (raw HTML) is read here.
    :returns: tuple (article_xpath, comment_xpaths, post_xpaths); each item
        is None when the corresponding region type was not detected.
    """
    article_xpath = comment_xpaths = post_xpaths = None
    content = page_dict.get("content")
    sd = SDAlgorithm()
    sd.content = content
    # FIX: the result variable was named `type`, shadowing the builtin;
    # renamed to `page_type` (purely local, no interface change).
    page_type, article, comments, multiple = sd.analyze_page()
    if page_type in ['article', 'comment']:
        article_xpath = element2path(sd.tree, article.root_node)
        # create a few variants of the xpath
        # NOTE(review): article_xpaths is computed but never used or
        # returned -- kept for parity with the original; confirm whether
        # the standardized variants should be returned instead.
        article_xpaths = standardizexpath(sd.tree, article_xpath)
    if page_type in ['comment']:
        comment_root_paths = []
        for comment in comments:
            comment_root_paths.append(element2path(sd.tree, comment.root_node))
        # look for the regularity in the comments' paths
        comment_root_paths, common_beg, diff_middle, common_end = \
            stringops.find_difference_inside(comment_root_paths)
        if diff_middle.isdigit():
            # the varying middle is a numeric index -> build a template path
            commenttemplatepath = (common_beg +
                                   xpathops.STANDARD_REPLACEMENT_STRING +
                                   common_end)
            # create a few variants of the xpath
            comment_xpaths = findcommonstandardxpath(
                sd.tree, commenttemplatepath, comment_root_paths)
    if page_type in ['multiple']:
        posts_root_paths = []
        for post in multiple:
            posts_root_paths.append(element2path(sd.tree, post.root_node))
        # look for the regularity in the posts' paths
        posts_root_paths, common_beg, diff_middle, common_end = \
            stringops.find_difference_inside(posts_root_paths)
        if diff_middle.isdigit():
            posttemplatepath = (common_beg +
                                xpathops.STANDARD_REPLACEMENT_STRING +
                                common_end)
            # create a few variants of the xpath
            post_xpaths = findcommonstandardxpath(
                sd.tree, posttemplatepath, posts_root_paths)
    return article_xpath, comment_xpaths, post_xpaths
def find(self, params):
    """Find the most promising node(s) among all candidate page elements.

    Gathers scored (node, score) candidates from input, anchor, button and
    image nodes, then keeps every candidate tied for the top score.

    :param params: accepted for interface compatibility; not read here.
    :returns: list of xpaths of the top-scoring nodes, or None when there
        are no candidates at all. Side effects: sets self.nodepath,
        self.features and self.certainty.
    """
    tree = self.page_dict.get("tree")
    candidates = []
    candidates.extend(self.__searchInputNodes())
    candidates.extend(self.__searchAnchorNodes())
    candidates.extend(self.__searchButtonNodes())
    candidates.extend(self.__searchImageNodes())
    if not candidates:
        self.features = [0]
        self.certainty = 0
        self.nodepath = None
        return self.nodepath
    # sort best score first (score is element [1] of each pair)
    sorted_candidates = sorted(candidates, key=lambda x: -x[1])
    element = sorted_candidates[0]
    elements_list = [element]
    # BUG FIX: ties must be collected from the *sorted* list. The original
    # iterated the unsorted `candidates` list (from index 1, breaking on
    # the first mismatch), which could miss equally scored candidates or
    # stop prematurely.
    for i in range(1, len(sorted_candidates)):
        if element[1] == sorted_candidates[i][1]:
            elements_list.append(sorted_candidates[i])
        else:
            break
    self.nodepath = []
    for elem in elements_list:
        self.nodepath.append(xpathops.element2path(tree, elem[0]))
    self.features = [element[1]]
    # fully certain only when the best score reaches 1
    self.certainty = 1 if element[1] >= 1 else 0
    return self.nodepath
def run(self, page_dict):
    """Derive XPath templates for the article, comments and posts of a page.

    :param page_dict: dict describing the page; only the "content" key
        (raw HTML) is read here.
    :returns: tuple (article_xpath, comment_xpaths, post_xpaths); each item
        is None when the corresponding region type was not detected.
    """
    article_xpath = comment_xpaths = post_xpaths = None
    content = page_dict.get("content")
    sd = SDAlgorithm()
    sd.content = content
    # FIX: renamed `type` -> `page_type`; the original shadowed the builtin.
    page_type, article, comments, multiple = sd.analyze_page()
    if page_type in ['article', 'comment']:
        article_xpath = element2path(sd.tree, article.root_node)
        # create a few variants of the xpath
        # NOTE(review): article_xpaths is computed but never used or
        # returned -- confirm whether it should replace article_xpath.
        article_xpaths = standardizexpath(sd.tree, article_xpath)
    if page_type in ['comment']:
        comment_root_paths = []
        for comment in comments:
            comment_root_paths.append(element2path(sd.tree, comment.root_node))
        # look for the regularity in the comments' paths
        comment_root_paths, common_beg, diff_middle, common_end = \
            stringops.find_difference_inside(comment_root_paths)
        if diff_middle.isdigit():
            # numeric middle part -> the comments share an indexed template
            commenttemplatepath = (common_beg +
                                   xpathops.STANDARD_REPLACEMENT_STRING +
                                   common_end)
            # create a few variants of the xpath
            comment_xpaths = findcommonstandardxpath(
                sd.tree, commenttemplatepath, comment_root_paths)
    if page_type in ['multiple']:
        posts_root_paths = []
        for post in multiple:
            posts_root_paths.append(element2path(sd.tree, post.root_node))
        # look for the regularity in the posts' paths
        posts_root_paths, common_beg, diff_middle, common_end = \
            stringops.find_difference_inside(posts_root_paths)
        if diff_middle.isdigit():
            posttemplatepath = (common_beg +
                                xpathops.STANDARD_REPLACEMENT_STRING +
                                common_end)
            # create a few variants of the xpath
            post_xpaths = findcommonstandardxpath(
                sd.tree, posttemplatepath, posts_root_paths)
    return article_xpath, comment_xpaths, post_xpaths
def classify_page(self):
    """
    Characterize the page according to i) has main article (has_article()),
    ii) has main article with comments (is_full_article()), iii) has
    multiple opinions like a forum (is_discussion()).

    Returns a 3-tuple (article, comments, regions) where exactly one of
    the last two slots (or the first) is populated depending on the case.
    """
    validated = False
    # split the page into its biggest text regions and the comment-like
    # regions grouped (presumably by CSS class -- see class_name below)
    [biggest_regions, grouped_comments] = self.group_regions()
    [article_exists, article] = self.has_article(biggest_regions)
    if article_exists:
        # the comment group most likely to belong to this article
        max_group = self.get_candidate_article(article, grouped_comments)
        if grouped_comments.has_key(max_group):
            if grouped_comments != {}:
                validated = self.candidate_group_level_validated(
                    max_group, article, grouped_comments)
            context_validated = self.candidate_context_validated(
                article, grouped_comments, max_group)
            if self.big_areas_in_same_level(
                    article, grouped_comments, max_group) and not validated:
                # several large sibling regions that did not validate as
                # article+comments -> treat as a forum-like discussion page
                print Tcolors.INFO + " Multiple similar regions detected!"
                print "Class: "
                print Tcolors.RES + " " + grouped_comments[max_group][0].class_name
                print "Texts: "
                for reg in grouped_comments[max_group]:
                    print element2path(reg.tree, reg.root_node), reg.full_text
                return None, None, grouped_comments[max_group]
            elif not context_validated:
                # article found but the candidate comment group does not
                # fit its context -> plain article, no comments
                print self.print_article(article)
                print
                print Tcolors.INFO + " No comments found."
                return article, None, None
            elif context_validated:
                # article plus a validated comment group
                print
                print Tcolors.INFO + " Article with comments detected!"
                self.print_article(article)
                print
                print "Comment class:"
                print Tcolors.RES + " " + max_group
                print "Comments:"
                for com in grouped_comments[max_group]:
                    print element2path(com.tree, com.root_node), com.full_text
                return article, grouped_comments[max_group], None
        else:
            # NOTE(review): interpreted as the "no comment group found"
            # branch of the has_key() check -- confirm against the original
            # (pre-collapse) indentation.
            self.print_article(article)
            return article, None, None
    else:
        # no dominant article at all: report the biggest regions as
        # multiple similar regions (forum/listing page)
        print Tcolors.INFO + " Multiple similar regions detected!"
        print Tcolors.RES
        print "Texts: "
        for reg in biggest_regions:
            print element2path(reg.tree, reg.root_node), reg.full_text
        return None, None, biggest_regions
def process_multiple_details(self, multiple):
    """Collect per-region details for a page holding multiple articles.

    :param multiple: iterable of detected article regions.
    :returns: three parallel lists: xpaths of the region roots, text
        densities, and distances from the tree root.
    """
    root_paths, densities, distances = [], [], []
    # single pass over the regions, filling the three lists in lockstep
    for region in multiple:
        root_paths.append(element2path(self.sd.tree, region.root_node))
        densities.append(region.density)
        distances.append(region.distance_from_root)
    return root_paths, densities, distances
def process_comment_details(self, comments):
    """Collect per-comment details, mirroring process_multiple_details.

    :param comments: iterable of detected comment regions.
    :returns: three parallel lists: xpaths of the comment roots, text
        densities, and distances from the tree root.
    """
    comment_root_paths = []
    density_list = []
    distance_list = []
    for comment in comments:
        comment_root_paths.append(element2path(self.sd.tree, comment.root_node))
        density_list.append(comment.density)
        # BUG FIX: was `comment.disance_from_root` (typo), which would raise
        # AttributeError. The sibling process_multiple_details reads
        # `distance_from_root` on the same kind of region object -- confirm
        # the attribute name on the region class.
        distance_list.append(comment.distance_from_root)
    return comment_root_paths, density_list, distance_list
def verify_comments(self, comments, tree): comment_root_paths = [] for com in comments: comment_root_paths.append(element2path(tree, com.root_node)) comment_root_paths, common_beg, diff_middle, common_end = stringops.find_difference_inside(comment_root_paths) if diff_middle.isdigit(): print 'Comments: regularity found' return True else: return False
def verify_comments(self, comments, tree): comment_root_paths = [] for com in comments: comment_root_paths.append(element2path(tree, com.root_node)) comment_root_paths, common_beg, diff_middle, common_end = stringops.find_difference_inside( comment_root_paths) if diff_middle.isdigit(): print 'Comments: regularity found' return True else: return False
def verify_multiple_articles(self, mulart, url, tree, content):
    """Decide whether the detected regions really form a multi-article page.

    :param mulart: candidate article regions.
    :param url: page url, used by the href/text similarity checks.
    :param tree: parsed document tree.
    :param content: raw page content.
    :returns: True only when structure, hrefs and text-similarity checks
        all pass; False otherwise (including the single-root case, which
        actually means the page is one article).
    """
    root_paths = []
    texts = []
    for region in mulart:
        texts.append(region.full_text.encode('utf-8', 'ignore'))
        root_paths.append(element2path(tree, region.root_node))
    root_paths, texts = self.leave_roots_only(root_paths, texts)
    if len(root_paths) <= 1:
        # a single root actually means: that's an article, not a listing
        return False
    # all three checks are evaluated eagerly (no short-circuit), matching
    # the original flow in case the helpers have side effects
    struct_ok = self.verify_multiple_articles_pagetags_structure(root_paths)
    href_ok = self.verify_multiple_article_hrefs(url, content)
    text_ok = self.verify_similar_text(texts, url)
    return bool(struct_ok and href_ok and text_ok)
def verify_multiple_articles(self, mulart, url, tree, content):
    """Decide whether the detected regions really form a multi-article page.

    :param mulart: candidate article regions.
    :param url: page url, used by the href/text similarity checks.
    :param tree: parsed document tree.
    :param content: raw page content.
    :returns: True only when every verification check passes; False
        otherwise (a single surviving root means the page is one article).
    """
    mul_art_root_paths = []
    multiple_article_text = []
    for mul in mulart:
        multiple_article_text.append(mul.full_text.encode('utf-8', 'ignore'))
        mul_art_root_paths.append(element2path(tree, mul.root_node))
    mul_art_root_paths, multiple_article_text = self.leave_roots_only(
        mul_art_root_paths, multiple_article_text)
    if len(mul_art_root_paths) > 1:
        # run every check (eagerly, as the original did) and require all
        checks = [
            self.verify_multiple_articles_pagetags_structure(
                mul_art_root_paths),
            self.verify_multiple_article_hrefs(url, content),
            self.verify_similar_text(multiple_article_text, url),
        ]
        return all(checks)
    return False  # but actually it means: that's an article