Code example #1
    def __init__(self, artical_directory=None):
        self.docs_words = []
        self.artical_directory = artical_directory
        self.artical_handler = ArticalHandler(
            artical_directory=self.artical_directory)
        self.word_segmenter = WordSegmenter()
        self.get_docs_words()
Code example #2
import unittest

# WordSegmenter is a project-local class; its import path is not shown here.


class WordSegmenterTest(unittest.TestCase):

    def setUp(self):
        self.word_segmenter = WordSegmenter()
        self.word_segmenter.load('data')

    def call_segment(self, text):
        # Print the segmented words of one line, separated by '/' and a tab.
        for word in self.word_segmenter.segment(text):
            print(word + '/\t', end='')
        print('')

    def test_segment(self):
        # The test data encoding is assumed to be UTF-8.
        with open('core/testdata/document.dat', 'r', encoding='utf-8') as fp:
            for text in fp:
                self.call_segment(text.strip())

    def call_segment_with_pos(self, text):
        # Same as call_segment, but each word carries its part-of-speech tag.
        for word in self.word_segmenter.segment_with_pos(text):
            print(word + '/\t', end='')
        print('')

    def test_segment_with_pos(self):
        with open('core/testdata/document.dat', 'r', encoding='utf-8') as fp:
            for text in fp:
                self.call_segment_with_pos(text.strip())
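
A standard unittest entry point can run this file directly (a minimal sketch; the source does not show how the suite is invoked):

if __name__ == '__main__':
    unittest.main()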
Code example #3
from gensim import corpora, models, similarities

# ArticalHandler and WordSegmenter are project-local classes; their import
# paths are not shown here.


class SimilarityCalculator(object):
    def __init__(self, artical_directory=None):
        self.docs_words = []
        self.artical_directory = artical_directory
        self.artical_handler = ArticalHandler(
            artical_directory=self.artical_directory)
        self.word_segmenter = WordSegmenter()
        self.get_docs_words()

    # Read the segmented word lists of all documents (from local disk during
    # development, from the redis database in deployment).
    def get_docs_words(self):
        self.docs_words = []
        for artical in self.artical_handler.get_artical_generators():
            artical_separated = self.word_segmenter.separate_artical_for_calculate(
                artical)  # dev
            # artical_separated = self.word_segmenter.read_from_redis_for_calculate()           # prod
            self.docs_words.append(artical_separated)
        return self.docs_words

    # Build the corpus feature index dictionary and yield each document as a
    # bag-of-words vector.
    def get_docs_corpus(self):
        dictionary = corpora.Dictionary(self.docs_words)
        for doc_words in self.docs_words:
            yield dictionary.doc2bow(doc_words)

    # Build the TF-IDF model over all documents.
    def get_docs_TFIDF_model(self):
        TFIDF_model = models.TfidfModel(self.get_docs_corpus())
        return TFIDF_model

    # Build the LSI model over all documents (currently unused).
    def get_docs_LSI_model(self):
        LSI_model = models.LsiModel(corpus=self.get_docs_corpus(),
                                    id2word=corpora.Dictionary(
                                        self.docs_words),
                                    num_topics=2)
        return LSI_model

    # Get each document's TF-IDF similarity to all other documents.
    def get_docs_TFIDF_similarities(self):
        # Keep the bound method so each pass below gets a fresh corpus generator.
        docs_corpus = self.get_docs_corpus
        TFIDF_model = self.get_docs_TFIDF_model()
        # Initialize a similarity object [it can be serialized to disk with save()].
        TFIDF_similarity_calculator = similarities.MatrixSimilarity(
            corpus=list(TFIDF_model[docs_corpus()]))
        for doc_vectors in docs_corpus():
            doc_similarities = list(
                enumerate(
                    TFIDF_similarity_calculator[TFIDF_model[doc_vectors]]))
            yield self.prettify(doc_similarities)

    # Get each document's LSI similarity to all other documents (not implemented).
    def get_docs_LSI_similarities(self):
        pass

    @staticmethod
    def prettify(doc_similarities):
        # Convert (index, score) pairs into dicts, rendering the score as a
        # two-decimal percentage string.
        pretty_doc_similarities = []
        for each_similarity in doc_similarities:
            data = {
                'index': each_similarity[0],
                'similarity': '%.2f' % (each_similarity[1] * 100)
            }
            pretty_doc_similarities.append(data)
        return pretty_doc_similarities
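
A usage sketch of this class (the directory name and the printing loop are illustrative assumptions, not part of the source):

if __name__ == '__main__':
    # 'articals' is a hypothetical directory of documents to compare.
    calculator = SimilarityCalculator(artical_directory='articals')
    for doc_similarities in calculator.get_docs_TFIDF_similarities():
        # Each item is the prettify() output for one document: a list of
        # {'index': ..., 'similarity': ...} dicts.
        print(doc_similarities)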
Code example #4
def word_segmenter():
    # Factory returning a fresh WordSegmenter instance.
    return WordSegmenter()
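
If this factory is meant as a pytest fixture (an assumption; no decorator appears in the source), registering it would look like:

import pytest


@pytest.fixture
def word_segmenter():
    return WordSegmenter()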
Code example #5
    def setUp(self):
        self.word_segmenter = WordSegmenter()
        self.word_segmenter.load('data')