def removetagswithfewwords(self, topnode):
    """Remove descendants of ``topnode`` that contain fewer than 25 words.

    The word count comes from ``self.texthandler.getwordscount`` applied to
    ``util.getinnertext(item, True)``. A short element is kept when it has an
    ``<object>`` or an ``<embed>`` descendant.

    :param topnode: element whose descendants are pruned in place; it must
        provide ``iterdescendants`` (presumably an lxml element).
    """
    minwords = 25
    # Walk a snapshot: removing elements while iterdescendants() is still
    # running can end the traversal early.
    for item in list(topnode.iterdescendants()):
        wordcount = self.texthandler.getwordscount(
            util.getinnertext(item, True))
        if wordcount >= minwords:
            continue
        # Either kind of embedded media is enough to keep the element. The
        # earlier chained next() calls removed it unless BOTH were present.
        hasmedia = (next(item.iterdescendants('object'), None) is not None
                    or next(item.iterdescendants('embed'), None) is not None)
        if not hasmedia:
            item.getparent().remove(item)
def removetagswithfewwords(self, topnode):
    """Remove descendants of ``topnode`` that contain fewer than 25 words.

    The word count comes from ``self.texthandler.getwordscount`` applied to
    ``util.getinnertext(item, True)``. A short element is kept when it has an
    ``<object>`` or an ``<embed>`` descendant.

    :param topnode: element whose descendants are pruned in place; it must
        provide ``iterdescendants`` (presumably an lxml element).
    """
    minwords = 25
    # Walk a snapshot: removing elements while iterdescendants() is still
    # running can end the traversal early.
    for item in list(topnode.iterdescendants()):
        wordcount = self.texthandler.getwordscount(
            util.getinnertext(item, True))
        if wordcount >= minwords:
            continue
        # Either kind of embedded media is enough to keep the element. The
        # earlier chained next() calls removed it unless BOTH were present.
        hasmedia = (next(item.iterdescendants('object'), None) is not None
                    or next(item.iterdescendants('embed'), None) is not None)
        if not hasmedia:
            item.getparent().remove(item)
def getsiblingcontent(self, currentsibling, basescore):
    """Return the HTML worth keeping from a sibling element, or None.

    A ``<p>`` sibling with text is returned as its outer HTML. Otherwise each
    ``<p>`` descendant whose stop-word count is greater than 30% of
    ``basescore`` is re-wrapped in ``<p>`` tags and the pieces are joined
    with a single space.

    :param currentsibling: sibling element to inspect.
    :param basescore: baseline score the paragraphs are compared against.
    :returns: an HTML string, or None when nothing qualifies.
    """
    if currentsibling.tag == 'p':
        # NOTE(review): the text is fetched twice with different arguments;
        # the meaning of the second argument is not visible here.
        checkedtext = util.getinnertext(currentsibling, True)
        if checkedtext is not None:
            if len(util.getinnertext(currentsibling)) > 0:
                return util.getouterhtml(currentsibling)

    fragments = []
    for para in currentsibling.iterdescendants('p'):
        paratext = util.getinnertext(para)
        if not (paratext and len(paratext) > 0):
            continue
        stats = self.texthandler.getstopwordscount(paratext)
        if basescore * 0.30 < stats.stopwordcount:
            fragments.append("<p>" + paratext + "</p>")

    if len(fragments) > 0:
        return " ".join(fragments)
    return None
def getsiblingcontent(self, currentsibling, basescore):
    """Return the HTML worth keeping from a sibling element, or None.

    A ``<p>`` sibling with text is returned as its outer HTML. Otherwise each
    ``<p>`` descendant whose stop-word count is greater than 30% of
    ``basescore`` is re-wrapped in ``<p>`` tags and the pieces are joined
    with a single space.

    :param currentsibling: sibling element to inspect.
    :param basescore: baseline score the paragraphs are compared against.
    :returns: an HTML string, or None when nothing qualifies.
    """
    if currentsibling.tag == 'p':
        # NOTE(review): the text is fetched twice with different arguments;
        # the meaning of the second argument is not visible here.
        checkedtext = util.getinnertext(currentsibling, True)
        if checkedtext is not None:
            if len(util.getinnertext(currentsibling)) > 0:
                return util.getouterhtml(currentsibling)

    fragments = []
    for para in currentsibling.iterdescendants('p'):
        paratext = util.getinnertext(para)
        if not (paratext and len(paratext) > 0):
            continue
        stats = self.texthandler.getstopwordscount(paratext)
        if basescore * 0.30 < stats.stopwordcount:
            fragments.append("<p>" + paratext + "</p>")

    if len(fragments) > 0:
        return " ".join(fragments)
    return None
def totext(self, topnode):
    """Join the inner text of the direct children of ``topnode``.

    Children whose inner text is empty or None are skipped; the remaining
    texts are separated by a blank line.

    :param topnode: element whose children are rendered.
    :returns: the joined text, or None when no child produced any text.
    """
    pieces = []
    for child in topnode.iterchildren():
        banner = "\n ==== tag = %s id = %s class = %s " % (
            child.tag, child.get('id'), child.get('class'))
        logging.debug(banner)
        childtext = util.getinnertext(child, True)
        logging.debug(childtext)
        if childtext:
            pieces.append(childtext)
    if not len(pieces) > 0:
        return None
    return "\n\n".join(pieces)
def removetagswithfewwords(self, topnode):
    """tags with fewer words than a threshold could be noise

    Removes, in place, every descendant of ``topnode`` whose text scores
    below ``self.texthandler.getcutoff()``, unless it has an ``<object>`` or
    an ``<embed>`` descendant. Elements for which ``util.getinnertext``
    returns None are left untouched.

    :param topnode: element whose descendants are pruned; it must provide
        ``iterdescendants`` (presumably an lxml element).
    """
    # Walk a snapshot: removing elements while iterdescendants() is still
    # running can end the traversal early.
    for item in list(topnode.iterdescendants()):
        itemtext = util.getinnertext(item, True)
        if itemtext is None:
            continue
        if not (self.texthandler.gettextscore(itemtext)
                < self.texthandler.getcutoff()):
            continue
        # Either kind of embedded media is enough to keep the element. The
        # earlier chained next() calls removed it unless BOTH were present.
        hasmedia = (next(item.iterdescendants('object'), None) is not None
                    or next(item.iterdescendants('embed'), None) is not None)
        if not hasmedia:
            logging.debug("remove fewwordpara %s: %s ", item.tag, item.text)
            item.getparent().remove(item)
def totext(self, topnode):
    """Join the inner text of the direct children of ``topnode``.

    Children whose inner text is empty or None are skipped; the remaining
    texts are separated by a blank line.

    :param topnode: element whose children are rendered.
    :returns: the joined text, or None when no child produced any text.
    """
    pieces = []
    for child in topnode.iterchildren():
        banner = "\n ==== tag = %s id = %s class = %s " % (
            child.tag, child.get('id'), child.get('class'))
        logging.debug(banner)
        childtext = util.getinnertext(child, True)
        logging.debug(childtext)
        if childtext:
            pieces.append(childtext)
    if not len(pieces) > 0:
        return None
    return "\n\n".join(pieces)
def removetagswithfewwords(self, topnode):
    """tags with fewer words than a threshold could be noise

    Removes, in place, every descendant of ``topnode`` whose text scores
    below ``self.texthandler.getcutoff()``, unless it has an ``<object>`` or
    an ``<embed>`` descendant. Elements for which ``util.getinnertext``
    returns None are left untouched.

    :param topnode: element whose descendants are pruned; it must provide
        ``iterdescendants`` (presumably an lxml element).
    """
    # Walk a snapshot: removing elements while iterdescendants() is still
    # running can end the traversal early.
    for item in list(topnode.iterdescendants()):
        itemtext = util.getinnertext(item, True)
        if itemtext is None:
            continue
        if not (self.texthandler.gettextscore(itemtext)
                < self.texthandler.getcutoff()):
            continue
        # Either kind of embedded media is enough to keep the element. The
        # earlier chained next() calls removed it unless BOTH were present.
        hasmedia = (next(item.iterdescendants('object'), None) is not None
                    or next(item.iterdescendants('embed'), None) is not None)
        if not hasmedia:
            logging.debug("remove fewwordpara %s: %s ", item.tag, item.text)
            item.getparent().remove(item)
def main(url=None):
    """Crawl a single article and print its title.

    Builds a Configuration that uses StandardContentExtractor and
    LengthbsdTextHandler, crawls ``url`` and logs the inner text of the
    article's top node at debug level.

    :param url: address of the article to crawl. When omitted, the sample
        vietnamnet.vn article that used to be hard-coded here is used.
    """
    if url is None:
        url = 'http://vietnamnet.vn/vn/van-hoa/84115/xoa-an-cam-bieu-dien-voi-trong-tan-anh-tho.html'
    config = Configuration()
    # NOTE(review): the original author questioned whether the config has
    # to be passed to CrawlCandidate as well as to Crawler.
    config.contentextractor = StandardContentExtractor
    config.texthandler = LengthbsdTextHandler
    crawlcandidate = CrawlCandidate(config, url)
    crawler = Crawler(config)
    article = crawler.crawl(crawlcandidate)
    logging.debug(getinnertext(article.topnode, True))
    print(article.title)
def main(url=None):
    """Crawl a single article and print its title.

    Builds a Configuration that uses StandardContentExtractor and
    LengthbsdTextHandler, crawls ``url`` and logs the inner text of the
    article's top node at debug level.

    :param url: address of the article to crawl. When omitted, the sample
        vietnamnet.vn article that used to be hard-coded here is used.
    """
    if url is None:
        url = 'http://vietnamnet.vn/vn/van-hoa/84115/xoa-an-cam-bieu-dien-voi-trong-tan-anh-tho.html'
    config = Configuration()
    # NOTE(review): the original author questioned whether the config has
    # to be passed to CrawlCandidate as well as to Crawler.
    config.contentextractor = StandardContentExtractor
    config.texthandler = LengthbsdTextHandler
    crawlcandidate = CrawlCandidate(config, url)
    crawler = Crawler(config)
    article = crawler.crawl(crawlcandidate)
    logging.debug(getinnertext(article.topnode, True))
    print(article.title)
def istablenopara(self, node):
    """Strip short paragraphs from ``node`` and report whether none remain.

    Every ``<p>`` descendant whose inner text is None or shorter than 25
    characters is removed from its parent. The answer is then based on
    whether any ``<p>`` descendant is left.

    :param node: element to clean up; modified in place.
    :returns: True when no ``<p>`` descendant remains and ``node`` is not a
        ``<td>``; False otherwise.
    """
    minlength = 25
    # Walk a snapshot so removals cannot disturb the running traversal.
    for subpara in list(node.iterdescendants('p')):
        paratext = util.getinnertext(subpara, True)
        if paratext is None or len(paratext) < minlength:
            parent = subpara.getparent()
            # Compare with None explicitly: truth-testing an element
            # (assuming lxml) reflects its child count, not its existence.
            if parent is not None:
                logging.debug("removing node %s", subpara.tag)
                parent.remove(subpara)

    if next(node.iterdescendants('p'), None) is not None:
        return False
    logging.debug("YES")
    # A <td> without paragraphs used to fall through and return None;
    # return an explicit boolean instead.
    return node.tag != 'td'
def istablenopara(self, node):
    """Strip short paragraphs from ``node`` and report whether none remain.

    Every ``<p>`` descendant whose inner text is None or shorter than 25
    characters is removed from its parent. The answer is then based on
    whether any ``<p>`` descendant is left.

    :param node: element to clean up; modified in place.
    :returns: True when no ``<p>`` descendant remains and ``node`` is not a
        ``<td>``; False otherwise.
    """
    minlength = 25
    # Walk a snapshot so removals cannot disturb the running traversal.
    for subpara in list(node.iterdescendants('p')):
        paratext = util.getinnertext(subpara, True)
        if paratext is None or len(paratext) < minlength:
            parent = subpara.getparent()
            # Compare with None explicitly: truth-testing an element
            # (assuming lxml) reflects its child count, not its existence.
            if parent is not None:
                logging.debug("removing node %s", subpara.tag)
                parent.remove(subpara)

    if next(node.iterdescendants('p'), None) is not None:
        return False
    logging.debug("YES")
    # A <td> without paragraphs used to fall through and return None;
    # return an explicit boolean instead.
    return node.tag != 'td'
def isboostable(self, node):
    """Tell whether ``node`` follows a paragraph that scores well.

    Walks the siblings that precede ``node`` and considers ``<p>`` siblings
    only. Returns True as soon as one of them has a relevance score above
    50. Gives up with False once three ``<p>`` siblings have been checked
    without success, or when there are no more siblings.

    :param node: element whose preceding siblings are inspected.
    :returns: True when the node should be boosted, else False.
    """
    minscoretoboost = 50
    maxstepsaway = 3
    stepsaway = 0
    for sib in node.itersiblings(preceding=True):
        if sib.tag != 'p':
            continue
        if stepsaway >= maxstepsaway:
            logging.debug("Next paragraph is too farway, not boost")
            return False
        paratext = util.getinnertext(sib)
        # Identity test is the idiomatic (and safe) comparison with None.
        if paratext is not None:
            if self.getrelevancescore(paratext) > minscoretoboost:
                logging.debug("Boosting this node")
                return True
        stepsaway += 1
    return False
def getbaselinescoreforsiblings(self, topnode):
    """get base score against average scoring of paragraphs within topnodes.
    Siblings must have higher score than baseline

    Only ``<p>`` descendants with text, a relevance score above
    ``self.getcutoffscore()`` and no high link density count towards the
    average.

    :param topnode: element whose ``<p>`` descendants are averaged.
    :returns: the average relevance score of the qualifying paragraphs, or
        100000 when no paragraph qualifies.
    """
    base = 100000
    numparas = 0
    scoreparas = 0
    for node in topnode.iterdescendants('p'):
        nodetext = util.getinnertext(node)
        if not nodetext:
            continue
        relscore = self.getrelevancescore(nodetext)
        # Judge link density on the paragraph itself, as
        # getbestnodes_bsdoncluster does, instead of re-testing the whole
        # topnode on every iteration.
        if (relscore > self.getcutoffscore()
                and not self.ishighlinkdensity(node)):
            numparas += 1
            scoreparas += relscore
    if numparas > 0:
        base = scoreparas / numparas
    return base
def getbaselinescoreforsiblings(self, topnode):
    """get base score against average scoring of paragraphs within topnodes.
    Siblings must have higher score than baseline

    Only ``<p>`` descendants with text, a relevance score above
    ``self.getcutoffscore()`` and no high link density count towards the
    average.

    :param topnode: element whose ``<p>`` descendants are averaged.
    :returns: the average relevance score of the qualifying paragraphs, or
        100000 when no paragraph qualifies.
    """
    base = 100000
    numparas = 0
    scoreparas = 0
    for node in topnode.iterdescendants('p'):
        nodetext = util.getinnertext(node)
        if not nodetext:
            continue
        relscore = self.getrelevancescore(nodetext)
        # Judge link density on the paragraph itself, as
        # getbestnodes_bsdoncluster does, instead of re-testing the whole
        # topnode on every iteration.
        if (relscore > self.getcutoffscore()
                and not self.ishighlinkdensity(node)):
            numparas += 1
            scoreparas += relscore
    if numparas > 0:
        base = scoreparas / numparas
    return base
def isboostable(self, node):
    """Tell whether ``node`` follows a paragraph that scores well.

    Walks the siblings that precede ``node`` and considers ``<p>`` siblings
    only. Returns True as soon as one of them has a relevance score above
    50. Gives up with False once three ``<p>`` siblings have been checked
    without success, or when there are no more siblings.

    :param node: element whose preceding siblings are inspected.
    :returns: True when the node should be boosted, else False.
    """
    minscoretoboost = 50
    maxstepsaway = 3
    stepsaway = 0
    for sib in node.itersiblings(preceding=True):
        if sib.tag != 'p':
            continue
        if stepsaway >= maxstepsaway:
            logging.debug("Next paragraph is too farway, not boost")
            return False
        paratext = util.getinnertext(sib)
        # Identity test is the idiomatic (and safe) comparison with None.
        if paratext is not None:
            if self.getrelevancescore(paratext) > minscoretoboost:
                logging.debug("Boosting this node")
                return True
        stepsaway += 1
    return False
def ishighlinkdensity(self, node):
    """check if a node contains lots of links

    The score is the number of words found in the text of ``<a>``
    descendants divided by the number of words in the node's own text,
    multiplied by the number of links that have text.

    :param node: element to inspect.
    :returns: True when the score is greater than 1; False otherwise,
        including when the node has no text, no words or no link text.
    """
    text = util.getinnertext(node, True)
    if not text:
        return False
    words = self.texthandler.splittext(text)
    if not words:
        # Nothing to measure against; also avoids dividing by zero below.
        return False
    linkbuffer = [link.text for link in node.iterdescendants('a')
                  if link.text is not None]
    if not linkbuffer:
        return False
    linkwords = self.texthandler.splittext(' '.join(linkbuffer))
    # float() keeps this a real ratio even where "/" on two ints truncates.
    linkdivisor = float(len(linkwords)) / len(words)
    score = linkdivisor * len(linkbuffer)
    logging.debug("Link density score is %f for node %s",
                  score, self._getshorttext(node))
    return score > 1
def ishighlinkdensity(self, node):
    """check if a node contains lots of links

    The score is the number of words found in the text of ``<a>``
    descendants divided by the number of words in the node's own text,
    multiplied by the number of links that have text.

    :param node: element to inspect.
    :returns: True when the score is greater than 1; False otherwise,
        including when the node has no text, no words or no link text.
    """
    text = util.getinnertext(node, True)
    if not text:
        return False
    words = self.texthandler.splittext(text)
    if not words:
        # Nothing to measure against; also avoids dividing by zero below.
        return False
    linkbuffer = [link.text for link in node.iterdescendants('a')
                  if link.text is not None]
    if not linkbuffer:
        return False
    linkwords = self.texthandler.splittext(' '.join(linkbuffer))
    # float() keeps this a real ratio even where "/" on two ints truncates.
    linkdivisor = float(len(linkwords)) / len(words)
    score = linkdivisor * len(linkbuffer)
    logging.debug("Link density score is %f for node %s",
                  score, self._getshorttext(node))
    return score > 1
def getbestnodes_bsdoncluster(self, doc):
    """Choose the parent or grandparent element with the highest score.

    Nodes from ``self.getnodestocheck(doc)`` are kept when they have text,
    score above ``self.getcutoffscore()`` and are not link-dense. Each kept
    node passes its relevance score plus a location boost to its parent,
    and half of that to its grandparent, through ``self._score``.

    :param doc: document handed to ``self.getnodestocheck``.
    :returns: the scored ancestor with the highest ``self._score`` value; the
        first scored ancestor when none scores above 0; None when no node
        was kept.
    """
    candidates = []
    for node in self.getnodestocheck(doc):
        nodetext = util.getinnertext(node)
        if nodetext is None:
            continue
        linkdense = self.ishighlinkdensity(node)
        if (self.getrelevancescore(nodetext) > self.getcutoffscore()
                and not linkdense):
            candidates.append(node)

    total = len(candidates)
    logging.debug("To inspect %d nodes with text " % total)

    scoredancestors = []
    boostdivisor = 1.0
    negativescore = 0  # never updated; kept so the arithmetic is unchanged
    bottomshare = total * 0.25
    for position, node in enumerate(candidates):
        boostscore = 0
        if self.isboostable(node):
            boostscore = (1.0 / boostdivisor * 50)
            boostdivisor += 1
        if total > 15:
            # for nodes that fall in bottom 25%
            remaining = total - position
            if remaining <= bottomshare:
                booster = bottomshare - remaining
                boostscore = -math.pow(booster, 2)
                negscore = math.fabs(boostscore) + negativescore
                if negscore > 40:
                    boostscore = 5

        parent = node.getparent()
        logging.debug(
            "Location boost score %d on iteration %d id='%s' class='%s' tag='%s'"
            % (boostscore, position, parent.get('id'), parent.get('class'),
               parent.tag))
        upscore = self.getrelevancescore(util.getinnertext(node)) + boostscore
        logging.debug("total upscore = %f " % upscore)

        grandpar = parent.getparent()
        self._score(parent, upscore)
        self._score(grandpar, upscore / 2)
        self._nodecount(parent, 1)
        self._nodecount(grandpar, 1)
        # Remember each ancestor once, in first-seen order.
        for ancestor in (parent, grandpar):
            if ancestor not in scoredancestors:
                scoredancestors.append(ancestor)

    bestnode = None
    bestscore = 0
    for ancestor in scoredancestors:
        logging.debug(
            "Parent Node: score=%s nodeCount=%s id=%s class=%s tag=%s"
            % (self._score(ancestor), self._nodecount(ancestor),
               ancestor.get('id'), ancestor.get('class'), ancestor.tag))
        score = self._score(ancestor)
        if score > bestscore:
            bestnode = ancestor
            bestscore = score
        # Fall back to the first ancestor until something beats a score of 0.
        if bestnode is None:
            bestnode = ancestor
    return bestnode
def getbestnodes_bsdoncluster(self, doc):
    """Choose the parent or grandparent element with the highest score.

    Nodes from ``self.getnodestocheck(doc)`` are kept when they have text,
    score above ``self.getcutoffscore()`` and are not link-dense. Each kept
    node passes its relevance score plus a location boost to its parent,
    and half of that to its grandparent, through ``self._score``.

    :param doc: document handed to ``self.getnodestocheck``.
    :returns: the scored ancestor with the highest ``self._score`` value; the
        first scored ancestor when none scores above 0; None when no node
        was kept.
    """
    candidates = []
    for node in self.getnodestocheck(doc):
        nodetext = util.getinnertext(node)
        if nodetext is None:
            continue
        linkdense = self.ishighlinkdensity(node)
        if (self.getrelevancescore(nodetext) > self.getcutoffscore()
                and not linkdense):
            candidates.append(node)

    total = len(candidates)
    logging.debug("To inspect %d nodes with text " % total)

    scoredancestors = []
    boostdivisor = 1.0
    negativescore = 0  # never updated; kept so the arithmetic is unchanged
    bottomshare = total * 0.25
    for position, node in enumerate(candidates):
        boostscore = 0
        if self.isboostable(node):
            boostscore = (1.0 / boostdivisor * 50)
            boostdivisor += 1
        if total > 15:
            # for nodes that fall in bottom 25%
            remaining = total - position
            if remaining <= bottomshare:
                booster = bottomshare - remaining
                boostscore = -math.pow(booster, 2)
                negscore = math.fabs(boostscore) + negativescore
                if negscore > 40:
                    boostscore = 5

        parent = node.getparent()
        logging.debug(
            "Location boost score %d on iteration %d id='%s' class='%s' tag='%s'"
            % (boostscore, position, parent.get('id'), parent.get('class'),
               parent.tag))
        upscore = self.getrelevancescore(util.getinnertext(node)) + boostscore
        logging.debug("total upscore = %f " % upscore)

        grandpar = parent.getparent()
        self._score(parent, upscore)
        self._score(grandpar, upscore / 2)
        self._nodecount(parent, 1)
        self._nodecount(grandpar, 1)
        # Remember each ancestor once, in first-seen order.
        for ancestor in (parent, grandpar):
            if ancestor not in scoredancestors:
                scoredancestors.append(ancestor)

    bestnode = None
    bestscore = 0
    for ancestor in scoredancestors:
        logging.debug(
            "Parent Node: score=%s nodeCount=%s id=%s class=%s tag=%s"
            % (self._score(ancestor), self._nodecount(ancestor),
               ancestor.get('id'), ancestor.get('class'), ancestor.tag))
        score = self._score(ancestor)
        if score > bestscore:
            bestnode = ancestor
            bestscore = score
        # Fall back to the first ancestor until something beats a score of 0.
        if bestnode is None:
            bestnode = ancestor
    return bestnode