コード例 #1
0
ファイル: text.py プロジェクト: vuamitom/pyGoose
 def removetagswithfewwords(self, topnode):
     """Remove descendants of ``topnode`` that contain fewer than 25 words.

     An element is kept, whatever its word count, when it has an
     ``<object>`` or ``<embed>`` descendant. ``topnode`` is modified in
     place and nothing is returned.
     """
     # Work on a snapshot of the descendants so that detaching an element
     # cannot disturb the traversal that is still in progress.
     for item in list(topnode.iterdescendants()):
         itemtext = util.getinnertext(item, True)
         wordcount = self.texthandler.getwordscount(itemtext)
         if wordcount < 25:
             # Either media tag is enough to keep the element. The former
             # chained next() calls only kept it when BOTH were present.
             has_media = any(
                 next(item.iterdescendants(tag), None) is not None
                 for tag in ('object', 'embed'))
             if not has_media:
                 item.getparent().remove(item)
コード例 #2
0
ファイル: text.py プロジェクト: kingsir2011/pyGoose
 def removetagswithfewwords(self, topnode):
     """Remove descendants of ``topnode`` that contain fewer than 25 words.

     An element is kept, whatever its word count, when it has an
     ``<object>`` or ``<embed>`` descendant. ``topnode`` is modified in
     place and nothing is returned.
     """
     # Work on a snapshot of the descendants so that detaching an element
     # cannot disturb the traversal that is still in progress.
     for item in list(topnode.iterdescendants()):
         itemtext = util.getinnertext(item, True)
         wordcount = self.texthandler.getwordscount(itemtext)
         if wordcount < 25:
             # Either media tag is enough to keep the element. The former
             # chained next() calls only kept it when BOTH were present.
             has_media = any(
                 next(item.iterdescendants(tag), None) is not None
                 for tag in ('object', 'embed'))
             if not has_media:
                 item.getparent().remove(item)
コード例 #3
0
ファイル: extractor.py プロジェクト: vuamitom/pyGoose
    def getsiblingcontent(self, currentsibling, basescore):
        """Return the HTML worth keeping from ``currentsibling``.

        A ``<p>`` sibling that has text is returned as its own outer HTML.
        Otherwise every descendant ``<p>`` whose stop-word count exceeds 30%
        of ``basescore`` is wrapped in ``<p>`` tags again and the pieces are
        joined with single spaces. Returns ``None`` when nothing qualifies.
        """
        if currentsibling.tag == 'p':
            siblingtext = util.getinnertext(currentsibling, True)
            # getinnertext may yield None/empty (the loop below guards for
            # it too); a truthiness test avoids calling len() on None.
            if siblingtext is not None and util.getinnertext(currentsibling):
                return util.getouterhtml(currentsibling)

        alltext = []
        for para in currentsibling.iterdescendants('p'):
            text = util.getinnertext(para)
            if not text:
                continue
            ws = self.texthandler.getstopwordscount(text)
            if basescore * 0.30 < ws.stopwordcount:
                # NOTE(review): text is embedded without HTML escaping --
                # confirm getinnertext output is safe to place in markup.
                alltext.append("<p>" + text + "</p>")

        return " ".join(alltext) if alltext else None
コード例 #4
0
ファイル: extractor.py プロジェクト: kingsir2011/pyGoose
    def getsiblingcontent(self, currentsibling, basescore):
        """Return the HTML worth keeping from ``currentsibling``.

        A ``<p>`` sibling that has text is returned as its own outer HTML.
        Otherwise every descendant ``<p>`` whose stop-word count exceeds 30%
        of ``basescore`` is wrapped in ``<p>`` tags again and the pieces are
        joined with single spaces. Returns ``None`` when nothing qualifies.
        """
        if currentsibling.tag == 'p':
            siblingtext = util.getinnertext(currentsibling, True)
            # getinnertext may yield None/empty (the loop below guards for
            # it too); a truthiness test avoids calling len() on None.
            if siblingtext is not None and util.getinnertext(currentsibling):
                return util.getouterhtml(currentsibling)

        alltext = []
        for para in currentsibling.iterdescendants('p'):
            text = util.getinnertext(para)
            if not text:
                continue
            ws = self.texthandler.getstopwordscount(text)
            if basescore * 0.30 < ws.stopwordcount:
                # NOTE(review): text is embedded without HTML escaping --
                # confirm getinnertext output is safe to place in markup.
                alltext.append("<p>" + text + "</p>")

        return " ".join(alltext) if alltext else None
コード例 #5
0
ファイル: text.py プロジェクト: vuamitom/pyGoose
    def totext(self,topnode):
        """Join the inner text of ``topnode``'s direct children.

        Children without text are skipped. The remaining pieces are separated
        by a blank line; ``None`` is returned when no child produced text.
        """
        pieces = []
        for element in topnode.iterchildren():
            logging.debug(
                "\n ==== tag = %s id = %s class = %s ",
                element.tag, element.get('id'), element.get('class'))
            innertext = util.getinnertext(element, True)
            logging.debug(innertext)
            if not innertext:
                continue
            pieces.append(innertext)

        if not pieces:
            return None
        return "\n\n".join(pieces)
コード例 #6
0
ファイル: text.py プロジェクト: vuamitom/pyGoose
 def removetagswithfewwords(self, topnode):
     """Remove descendants of ``topnode`` whose text scores below the cutoff.

     Elements without text are left alone. A low-scoring element is still
     kept when it has an ``<object>`` or ``<embed>`` descendant. ``topnode``
     is modified in place and nothing is returned.
     """
     # Work on a snapshot of the descendants so that detaching an element
     # cannot disturb the traversal that is still in progress.
     for item in list(topnode.iterdescendants()):
         itemtext = util.getinnertext(item, True)
         if itemtext is None:
             continue
         if self.texthandler.gettextscore(itemtext) < self.texthandler.getcutoff():
             # Either media tag is enough to keep the element. The former
             # chained next() calls only kept it when BOTH were present.
             has_media = any(
                 next(item.iterdescendants(tag), None) is not None
                 for tag in ('object', 'embed'))
             if not has_media:
                 logging.debug("remove fewwordpara %s: %s "%(item.tag, item.text))
                 item.getparent().remove(item)
コード例 #7
0
ファイル: text.py プロジェクト: kingsir2011/pyGoose
    def totext(self, topnode):
        """Join the inner text of ``topnode``'s direct children.

        Children without text are skipped. The remaining pieces are separated
        by a blank line; ``None`` is returned when no child produced text.
        """
        pieces = []
        for element in topnode.iterchildren():
            logging.debug(
                "\n ==== tag = %s id = %s class = %s ",
                element.tag, element.get('id'), element.get('class'))
            innertext = util.getinnertext(element, True)
            logging.debug(innertext)
            if not innertext:
                continue
            pieces.append(innertext)

        if not pieces:
            return None
        return "\n\n".join(pieces)
コード例 #8
0
ファイル: text.py プロジェクト: kingsir2011/pyGoose
 def removetagswithfewwords(self, topnode):
     """Remove descendants of ``topnode`` whose text scores below the cutoff.

     Elements without text are left alone. A low-scoring element is still
     kept when it has an ``<object>`` or ``<embed>`` descendant. ``topnode``
     is modified in place and nothing is returned.
     """
     # Work on a snapshot of the descendants so that detaching an element
     # cannot disturb the traversal that is still in progress.
     for item in list(topnode.iterdescendants()):
         itemtext = util.getinnertext(item, True)
         if itemtext is None:
             continue
         if self.texthandler.gettextscore(
                 itemtext) < self.texthandler.getcutoff():
             # Either media tag is enough to keep the element. The former
             # chained next() calls only kept it when BOTH were present.
             has_media = any(
                 next(item.iterdescendants(tag), None) is not None
                 for tag in ('object', 'embed'))
             if not has_media:
                 logging.debug("remove fewwordpara %s: %s " %
                               (item.tag, item.text))
                 item.getparent().remove(item)
コード例 #9
0
ファイル: testrun.py プロジェクト: vuamitom/pyGoose
def main():
    """Crawl one hard-coded article URL and print the article title.

    The configuration uses StandardContentExtractor and LengthbsdTextHandler.
    The inner text of the article's top node is written to the debug log.
    """
    # Other targets used during development:
    #   http://localhost/projects/pyGoose/target.html
    #   http://www.google.co.in
    url = 'http://vietnamnet.vn/vn/van-hoa/84115/xoa-an-cam-bieu-dien-voi-trong-tan-anh-tho.html'

    settings = Configuration()
    # Alternatives that were tried: ContentExtractor, LengthbsdFormatter.
    settings.contentextractor = StandardContentExtractor
    settings.texthandler = LengthbsdTextHandler

    # NOTE(review): the config is handed to both the candidate and the
    # crawler; the original comment hints the former may be unnecessary.
    candidate = CrawlCandidate(settings, url)
    article = Crawler(settings).crawl(candidate)

    logging.debug(getinnertext(article.topnode, True))
    print(article.title)
コード例 #10
0
ファイル: testrun.py プロジェクト: kingsir2011/pyGoose
def main():
    """Crawl one hard-coded article URL and print the article title.

    The configuration uses StandardContentExtractor and LengthbsdTextHandler.
    The inner text of the article's top node is written to the debug log.
    """
    # Other targets used during development:
    #   http://localhost/projects/pyGoose/target.html
    #   http://www.google.co.in
    url = 'http://vietnamnet.vn/vn/van-hoa/84115/xoa-an-cam-bieu-dien-voi-trong-tan-anh-tho.html'

    settings = Configuration()
    # Alternatives that were tried: ContentExtractor, LengthbsdFormatter.
    settings.contentextractor = StandardContentExtractor
    settings.texthandler = LengthbsdTextHandler

    # NOTE(review): the config is handed to both the candidate and the
    # crawler; the original comment hints the former may be unnecessary.
    candidate = CrawlCandidate(settings, url)
    article = Crawler(settings).crawl(candidate)

    logging.debug(getinnertext(article.topnode, True))
    print(article.title)
コード例 #11
0
ファイル: extractor.py プロジェクト: vuamitom/pyGoose
    def istablenopara(self, node):
        """Prune short paragraphs under ``node`` and report whether none remain.

        Every descendant ``<p>`` whose inner text is missing or shorter than
        25 characters is removed from the tree. Returns ``True`` when no
        ``<p>`` is left afterwards and ``node`` is not a ``<td>``; ``False``
        otherwise (the ``<td>`` case used to fall through and return ``None``).
        """
        # Snapshot the paragraphs so removals cannot disturb the traversal.
        for subpara in list(node.iterdescendants('p')):
            paratext = util.getinnertext(subpara, True)
            if paratext is None or len(paratext) < 25:
                parent = subpara.getparent()
                # Compare against None explicitly: truth-testing an lxml
                # element checks for children and is deprecated.
                if parent is not None:
                    logging.debug("removing node %s" % subpara.tag)
                    parent.remove(subpara)

        if next(node.iterdescendants('p'), None) is not None:
            return False

        logging.debug("YES")
        return node.tag != 'td'
コード例 #12
0
ファイル: extractor.py プロジェクト: kingsir2011/pyGoose
    def istablenopara(self, node):
        """Prune short paragraphs under ``node`` and report whether none remain.

        Every descendant ``<p>`` whose inner text is missing or shorter than
        25 characters is removed from the tree. Returns ``True`` when no
        ``<p>`` is left afterwards and ``node`` is not a ``<td>``; ``False``
        otherwise (the ``<td>`` case used to fall through and return ``None``).
        """
        # Snapshot the paragraphs so removals cannot disturb the traversal.
        for subpara in list(node.iterdescendants('p')):
            paratext = util.getinnertext(subpara, True)
            if paratext is None or len(paratext) < 25:
                parent = subpara.getparent()
                # Compare against None explicitly: truth-testing an lxml
                # element checks for children and is deprecated.
                if parent is not None:
                    logging.debug("removing node %s" % subpara.tag)
                    parent.remove(subpara)

        if next(node.iterdescendants('p'), None) is not None:
            return False

        logging.debug("YES")
        return node.tag != 'td'
コード例 #13
0
ファイル: extractor.py プロジェクト: vuamitom/pyGoose
    def isboostable(self, node):
        """Tell whether a nearby preceding ``<p>`` sibling justifies a boost.

        Preceding siblings of ``node`` are walked and only ``<p>`` siblings
        are considered. Returns ``True`` at the first one whose relevance
        score exceeds 50. Returns ``False`` once three ``<p>`` siblings have
        been examined without success, or when the siblings run out.
        """
        maxstepsaway = 3
        minscoretoboost = 50
        examined = 0
        for sibling in node.itersiblings(preceding=True):
            if sibling.tag != 'p':
                continue
            if examined >= maxstepsaway:
                logging.debug("Next paragraph is too farway, not boost")
                return False
            siblingtext = util.getinnertext(sibling)
            if (siblingtext is not None
                    and self.getrelevancescore(siblingtext) > minscoretoboost):
                logging.debug("Boosting this node")
                return True
            examined += 1

        return False
コード例 #14
0
ファイル: extractor.py プロジェクト: vuamitom/pyGoose
    def getbaselinescoreforsiblings(self, topnode):
        """Return the average relevance score of good paragraphs in ``topnode``.

        A descendant ``<p>`` counts when it has text, its relevance score is
        above the cutoff and the paragraph itself is not link-dense. When no
        paragraph counts, the default of 100000 is returned.
        """
        base = 100000
        numparas = 0
        scoreparas = 0
        for node in topnode.iterdescendants('p'):
            nodetext = util.getinnertext(node)
            if not nodetext:
                continue
            relscore = self.getrelevancescore(nodetext)
            # Link density is judged per paragraph. The earlier code tested
            # topnode on every iteration, so one verdict for the whole
            # container was applied to all of its paragraphs.
            linkdense = self.ishighlinkdensity(node)
            if relscore > self.getcutoffscore() and not linkdense:
                numparas += 1
                scoreparas += relscore

        if numparas > 0:
            base = scoreparas / numparas
        return base
コード例 #15
0
ファイル: extractor.py プロジェクト: kingsir2011/pyGoose
    def getbaselinescoreforsiblings(self, topnode):
        """Return the average relevance score of good paragraphs in ``topnode``.

        A descendant ``<p>`` counts when it has text, its relevance score is
        above the cutoff and the paragraph itself is not link-dense. When no
        paragraph counts, the default of 100000 is returned.
        """
        base = 100000
        numparas = 0
        scoreparas = 0
        for node in topnode.iterdescendants('p'):
            nodetext = util.getinnertext(node)
            if not nodetext:
                continue
            relscore = self.getrelevancescore(nodetext)
            # Link density is judged per paragraph. The earlier code tested
            # topnode on every iteration, so one verdict for the whole
            # container was applied to all of its paragraphs.
            linkdense = self.ishighlinkdensity(node)
            if relscore > self.getcutoffscore() and not linkdense:
                numparas += 1
                scoreparas += relscore

        if numparas > 0:
            base = scoreparas / numparas
        return base
コード例 #16
0
ファイル: extractor.py プロジェクト: kingsir2011/pyGoose
    def isboostable(self, node):
        """Tell whether a nearby preceding ``<p>`` sibling justifies a boost.

        Preceding siblings of ``node`` are walked and only ``<p>`` siblings
        are considered. Returns ``True`` at the first one whose relevance
        score exceeds 50. Returns ``False`` once three ``<p>`` siblings have
        been examined without success, or when the siblings run out.
        """
        maxstepsaway = 3
        minscoretoboost = 50
        examined = 0
        for sibling in node.itersiblings(preceding=True):
            if sibling.tag != 'p':
                continue
            if examined >= maxstepsaway:
                logging.debug("Next paragraph is too farway, not boost")
                return False
            siblingtext = util.getinnertext(sibling)
            if (siblingtext is not None
                    and self.getrelevancescore(siblingtext) > minscoretoboost):
                logging.debug("Boosting this node")
                return True
            examined += 1

        return False
コード例 #17
0
ファイル: extractor.py プロジェクト: vuamitom/pyGoose
    def ishighlinkdensity(self, node):
        """Check whether ``node`` consists mostly of link text.

        The score is (words in link text / words in all text) multiplied by
        the number of ``<a>`` descendants that have text. The node counts as
        link-dense when that score exceeds 1. Returns ``False`` when the node
        has no text, no words, or no links with text.
        """
        text = util.getinnertext(node, True)
        if not text:
            return False
        words = self.texthandler.splittext(text)
        if len(words) == 0:
            # Non-empty text can still split into zero words; bail out
            # instead of dividing by zero below.
            return False

        linkbuffer = [
            link.text for link in node.iterdescendants('a')
            if link.text is not None
        ]
        if len(linkbuffer) == 0:
            return False

        linkwords = self.texthandler.splittext(' '.join(linkbuffer))

        linkdivisor = len(linkwords)/len(words)
        score = linkdivisor * len(linkbuffer)

        logging.debug("Link density score is %f for node %s"%(score, self._getshorttext(node)))
        return score > 1
コード例 #18
0
ファイル: extractor.py プロジェクト: kingsir2011/pyGoose
    def ishighlinkdensity(self, node):
        """Check whether ``node`` consists mostly of link text.

        The score is (words in link text / words in all text) multiplied by
        the number of ``<a>`` descendants that have text. The node counts as
        link-dense when that score exceeds 1. Returns ``False`` when the node
        has no text, no words, or no links with text.
        """
        text = util.getinnertext(node, True)
        if not text:
            return False
        words = self.texthandler.splittext(text)
        if len(words) == 0:
            # Non-empty text can still split into zero words; bail out
            # instead of dividing by zero below.
            return False

        linkbuffer = [
            link.text for link in node.iterdescendants('a')
            if link.text is not None
        ]
        if len(linkbuffer) == 0:
            return False

        linkwords = self.texthandler.splittext(' '.join(linkbuffer))

        linkdivisor = len(linkwords) / len(words)
        score = linkdivisor * len(linkbuffer)

        logging.debug("Link density score is %f for node %s" %
                      (score, self._getshorttext(node)))
        return score > 1
コード例 #19
0
ファイル: extractor.py プロジェクト: vuamitom/pyGoose
    def getbestnodes_bsdoncluster(self, doc):
        """Choose the ancestor element that gathers the highest text score.

        Candidates from ``getnodestocheck(doc)`` are kept when they have
        text, their relevance score is above the cutoff and they are not
        link-dense. For each kept node its relevance score plus a
        position-based boost is passed to ``_score`` for the parent, and half
        of it for the grandparent when there is one. The collected ancestor
        with the highest ``_score`` is returned. If none scores above zero
        the first collected ancestor is returned, and ``None`` when no
        candidate was kept at all.
        """
        import math  # local import keeps this block self-contained

        nodeswithtext = []
        for node in self.getnodestocheck(doc):
            nodetext = util.getinnertext(node)
            if nodetext is None:
                continue
            linkdense = self.ishighlinkdensity(node)
            if self.getrelevancescore(nodetext) > self.getcutoffscore() and not linkdense:
                nodeswithtext.append(node)

        total = len(nodeswithtext)
        logging.debug("To inspect %d nodes with text " % total)

        parentnodes = []
        startboost = 1.0
        # NOTE(review): negativescore is never increased, so negscore below
        # is simply the magnitude of the current penalty.
        negativescore = 0
        bottomnode_for_negativescore = total * 0.25
        for i, node in enumerate(nodeswithtext):
            boostscore = 0
            if self.isboostable(node):
                # Each further boosted node gets a smaller bonus: 50, 25, ...
                boostscore = (1.0 / startboost * 50)
                startboost += 1
            if total > 15 and (total - i) <= bottomnode_for_negativescore:
                # Nodes in the last quarter are penalised quadratically; a
                # penalty larger than 40 is replaced by a flat +5.
                booster = bottomnode_for_negativescore - (total - i)
                boostscore = -math.pow(booster, 2)
                negscore = math.fabs(boostscore) + negativescore
                if negscore > 40:
                    boostscore = 5

            parent = node.getparent()
            logging.debug("Location boost score %d on iteration %d id='%s' class='%s' tag='%s'" % (boostscore, i, parent.get('id'), parent.get('class'), parent.tag))

            nodetext = util.getinnertext(node)
            upscore = self.getrelevancescore(nodetext) + boostscore
            logging.debug("total upscore = %f " % upscore)

            # The parent may be the tree root, in which case there is no
            # grandparent; skipping it avoids collecting None, which would
            # break the attribute lookups in the ranking loop below.
            grandpar = parent.getparent()
            self._score(parent, upscore)
            if grandpar is not None:
                self._score(grandpar, upscore / 2)
            self._nodecount(parent, 1)
            if grandpar is not None:
                self._nodecount(grandpar, 1)

            if parent not in parentnodes:
                parentnodes.append(parent)
            if grandpar is not None and grandpar not in parentnodes:
                parentnodes.append(grandpar)

        topnodescore = 0
        topnode = None
        for node in parentnodes:
            logging.debug("Parent Node: score=%s nodeCount=%s id=%s class=%s tag=%s" % (self._score(node),self._nodecount(node),node.get('id'),node.get('class'), node.tag))
            score = self._score(node)
            if score > topnodescore:
                topnode = node
                topnodescore = score

            # Fall back to the first ancestor when nothing scored above zero.
            if topnode is None:
                topnode = node
        return topnode
コード例 #20
0
ファイル: extractor.py プロジェクト: kingsir2011/pyGoose
    def getbestnodes_bsdoncluster(self, doc):
        """Choose the ancestor element that gathers the highest text score.

        Candidates from ``getnodestocheck(doc)`` are kept when they have
        text, their relevance score is above the cutoff and they are not
        link-dense. For each kept node its relevance score plus a
        position-based boost is passed to ``_score`` for the parent, and half
        of it for the grandparent when there is one. The collected ancestor
        with the highest ``_score`` is returned. If none scores above zero
        the first collected ancestor is returned, and ``None`` when no
        candidate was kept at all.
        """
        import math  # local import keeps this block self-contained

        nodeswithtext = []
        for node in self.getnodestocheck(doc):
            nodetext = util.getinnertext(node)
            if nodetext is None:
                continue
            linkdense = self.ishighlinkdensity(node)
            if (self.getrelevancescore(nodetext) > self.getcutoffscore()
                    and not linkdense):
                nodeswithtext.append(node)

        total = len(nodeswithtext)
        logging.debug("To inspect %d nodes with text " % total)

        parentnodes = []
        startboost = 1.0
        # NOTE(review): negativescore is never increased, so negscore below
        # is simply the magnitude of the current penalty.
        negativescore = 0
        bottomnode_for_negativescore = total * 0.25
        for i, node in enumerate(nodeswithtext):
            boostscore = 0
            if self.isboostable(node):
                # Each further boosted node gets a smaller bonus: 50, 25, ...
                boostscore = (1.0 / startboost * 50)
                startboost += 1
            if total > 15 and (total - i) <= bottomnode_for_negativescore:
                # Nodes in the last quarter are penalised quadratically; a
                # penalty larger than 40 is replaced by a flat +5.
                booster = bottomnode_for_negativescore - (total - i)
                boostscore = -math.pow(booster, 2)
                negscore = math.fabs(boostscore) + negativescore
                if negscore > 40:
                    boostscore = 5

            parent = node.getparent()
            logging.debug(
                "Location boost score %d on iteration %d id='%s' class='%s' tag='%s'"
                % (boostscore, i, parent.get('id'), parent.get('class'),
                   parent.tag))

            nodetext = util.getinnertext(node)
            upscore = self.getrelevancescore(nodetext) + boostscore
            logging.debug("total upscore = %f " % upscore)

            # The parent may be the tree root, in which case there is no
            # grandparent; skipping it avoids collecting None, which would
            # break the attribute lookups in the ranking loop below.
            grandpar = parent.getparent()
            self._score(parent, upscore)
            if grandpar is not None:
                self._score(grandpar, upscore / 2)
            self._nodecount(parent, 1)
            if grandpar is not None:
                self._nodecount(grandpar, 1)

            if parent not in parentnodes:
                parentnodes.append(parent)
            if grandpar is not None and grandpar not in parentnodes:
                parentnodes.append(grandpar)

        topnodescore = 0
        topnode = None
        for node in parentnodes:
            logging.debug(
                "Parent Node: score=%s nodeCount=%s id=%s class=%s tag=%s" %
                (self._score(node), self._nodecount(node), node.get('id'),
                 node.get('class'), node.tag))
            score = self._score(node)
            if score > topnodescore:
                topnode = node
                topnodescore = score

            # Fall back to the first ancestor when nothing scored above zero.
            if topnode is None:
                topnode = node
        return topnode