コード例 #1
0
ファイル: wordweight.py プロジェクト: piaofu110/docclustering
class WordWeightEvaluation:
    """Pick the top keywords of a document with a TF-IDF style score.

    Input text is expected as whitespace-separated ``word/tag`` tokens.
    Only tokens tagged ``n`` or ``v`` contribute to the term frequency;
    document frequencies come from the ``WordDF`` store opened in
    ``__init__``.
    """

    # Term-frequency contribution per tag; tokens with any other tag are
    # ignored entirely.
    _TAG_WEIGHTS = {'n': 2, 'v': 1}

    def __init__(self, kw_num=20, worddfdir='../data/worddf'):
        """Open the document-frequency store and remember the keyword limit.

        kw_num    -- maximum number of keywords returned by ``extract_kw``.
        worddfdir -- location handed to ``WordDF`` (constructed with ``'r'``).
        """
        self.worddf = WordDF('r', worddfdir)
        self.docsize = self.worddf.doc_size()
        self.kwnum = kw_num

    def close(self):
        """Close the underlying document-frequency store."""
        self.worddf.close()

    def extract_kw(self, title, content, seg_text=False):
        """Return ``[(word, weight), ...]`` for the top ``kwnum`` keywords.

        Given the title and content of one document, score every counted
        word as ``tf * log2(docsize / (df + 1))`` and keep the highest
        scoring words. Title occurrences count twice as much as content
        occurrences. Weights are normalised so that the returned weights
        sum to 1; when the selected scores sum to zero the raw scores are
        returned unnormalised instead of dividing by zero.

        If ``seg_text`` is true the text has not been word-segmented yet and
        is first passed through ``self.seg_text``.

        Raises ``ValueError`` (from ``math.log``) if ``docsize / (df + 1)``
        is not positive.
        """
        if seg_text:
            if title:
                title = self.seg_text(title)
            content = self.seg_text(content)

        word_dict = {}
        if title:
            self.__stats_tf(word_dict, title, coef=2.0)
        self.__stats_tf(word_dict, content, coef=1.0)

        # items() (not the Python-2-only iteritems()) keeps this runnable on
        # both Python 2 and Python 3.
        scored = []
        for word, tf in word_dict.items():
            # +1.0 avoids a zero denominator and forces float division.
            df = self.worddf.df(word) + 1.0
            idf = math.log(self.docsize / df, 2)
            scored.append((word, tf * idf))

        top = heapq.nlargest(self.kwnum, scored, key=lambda pair: pair[1])
        total = sum(weight for _, weight in top)
        if total == 0:
            # Nothing selected, or the scores cancel out: normalising is
            # undefined, so hand back the scores untouched.
            return top
        return [(word, weight / total) for word, weight in top]

    def __stats_tf(self, word_dict, wordtagstr, coef=1.0):
        """Accumulate tag-weighted term frequencies into ``word_dict``.

        ``wordtagstr`` is split on whitespace and each token on ``/``; the
        first piece is the word and the second its tag. Each counted token
        adds ``coef * tag_weight`` to the word's entry.
        """
        for token in wordtagstr.split():
            parts = token.split('/')
            if len(parts) < 2:
                # Token carries no tag, so there is nothing to weight it by.
                continue
            weight = self._TAG_WEIGHTS.get(parts[1])
            if weight is None:
                continue
            word = parts[0]
            word_dict[word] = word_dict.get(word, 0) + coef * weight

    def seg_text(self, text):
        """Word-segment ``text``; must be provided by a subclass."""
        raise NotImplementedError()
コード例 #2
0
ファイル: wordweight.py プロジェクト: Calvin-he/docclustering
class WordWeightEvaluation:
    """Pick the top keywords of a document with a TF-IDF style score.

    Input text is expected as whitespace-separated ``word/tag`` tokens.
    Only tokens tagged ``n`` or ``v`` contribute to the term frequency;
    document frequencies come from the ``WordDF`` store opened in
    ``__init__``.
    """

    # Term-frequency contribution per tag; tokens with any other tag are
    # ignored entirely.
    _TAG_WEIGHTS = {'n': 2, 'v': 1}

    def __init__(self, kw_num=20, worddfdir='../data/worddf'):
        """Open the document-frequency store and remember the keyword limit.

        kw_num    -- maximum number of keywords returned by ``extract_kw``.
        worddfdir -- location handed to ``WordDF`` (constructed with ``'r'``).
        """
        self.worddf = WordDF('r', worddfdir)
        self.docsize = self.worddf.doc_size()
        self.kwnum = kw_num

    def close(self):
        """Close the underlying document-frequency store."""
        self.worddf.close()

    def extract_kw(self, title, content, seg_text=False):
        """Return ``[(word, weight), ...]`` for the top ``kwnum`` keywords.

        Given the title and content of one document, score every counted
        word as ``tf * log2(docsize / (df + 1))`` and keep the highest
        scoring words. Title occurrences count twice as much as content
        occurrences. Weights are normalised so that the returned weights
        sum to 1; when the selected scores sum to zero the raw scores are
        returned unnormalised instead of dividing by zero.

        If ``seg_text`` is true the text has not been word-segmented yet and
        is first passed through ``self.seg_text``.

        Raises ``ValueError`` (from ``math.log``) if ``docsize / (df + 1)``
        is not positive.
        """
        if seg_text:
            if title:
                title = self.seg_text(title)
            content = self.seg_text(content)

        word_dict = {}
        if title:
            self.__stats_tf(word_dict, title, coef=2.0)
        self.__stats_tf(word_dict, content, coef=1.0)

        # items() (not the Python-2-only iteritems()) keeps this runnable on
        # both Python 2 and Python 3.
        scored = []
        for word, tf in word_dict.items():
            # +1.0 avoids a zero denominator and forces float division.
            df = self.worddf.df(word) + 1.0
            idf = math.log(self.docsize / df, 2)
            scored.append((word, tf * idf))

        top = heapq.nlargest(self.kwnum, scored, key=lambda pair: pair[1])
        total = sum(weight for _, weight in top)
        if total == 0:
            # Nothing selected, or the scores cancel out: normalising is
            # undefined, so hand back the scores untouched.
            return top
        return [(word, weight / total) for word, weight in top]

    def __stats_tf(self, word_dict, wordtagstr, coef=1.0):
        """Accumulate tag-weighted term frequencies into ``word_dict``.

        ``wordtagstr`` is split on whitespace and each token on ``/``; the
        first piece is the word and the second its tag. Each counted token
        adds ``coef * tag_weight`` to the word's entry.
        """
        for token in wordtagstr.split():
            parts = token.split('/')
            if len(parts) < 2:
                # Token carries no tag, so there is nothing to weight it by.
                continue
            weight = self._TAG_WEIGHTS.get(parts[1])
            if weight is None:
                continue
            word = parts[0]
            word_dict[word] = word_dict.get(word, 0) + coef * weight

    def seg_text(self, text):
        """Word-segment ``text``; must be provided by a subclass."""
        raise NotImplementedError()
コード例 #3
0
ファイル: wordweight.py プロジェクト: piaofu110/docclustering
 def __init__(self, kw_num=20, worddfdir='../data/worddf'):
     """Set up the extractor's document-frequency store and keyword limit.

     Builds a ``WordDF`` from ``worddfdir`` (first argument ``'r'``), stores
     it as ``self.worddf``, records the result of its ``doc_size()`` call as
     ``self.docsize`` and keeps ``kw_num`` as ``self.kwnum``.
     """
     df_store = WordDF('r', worddfdir)
     self.worddf = df_store
     self.docsize = df_store.doc_size()
     self.kwnum = kw_num
コード例 #4
0
ファイル: wordweight.py プロジェクト: Calvin-he/docclustering
 def __init__(self, kw_num=20, worddfdir='../data/worddf'):
     """Set up the extractor's document-frequency store and keyword limit.

     Builds a ``WordDF`` from ``worddfdir`` (first argument ``'r'``), stores
     it as ``self.worddf``, records the result of its ``doc_size()`` call as
     ``self.docsize`` and keeps ``kw_num`` as ``self.kwnum``.
     """
     df_store = WordDF('r', worddfdir)
     self.worddf = df_store
     self.docsize = df_store.doc_size()
     self.kwnum = kw_num