def make_dictionary(datafile, savefile, filter={}):
    # Build a gensim dictionary from the tokenized lines of datafile,
    # drop too-rare and too-frequent tokens, and save it as text.
    dictionary = corpora.Dictionary(myutil.tokenize_file(datafile, filter=filter))
    min_count = filter.get("min_count", 5)
    max_rate = filter.get("max_rate", 0.3)
    dictionary.filter_extremes(no_below=min_count, no_above=max_rate)
    #print(myutil.pp(dictionary.token2id))
    print(datafile + ": " + str(len(dictionary)) + " tokens")
    dictionary.save_as_text(savefile)
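Every snippet on this page leans on the project-local helper myutil.tokenize_file, which is not shown here. Judging from how it is called (one token list per line, or (tokens, raw_line) pairs when include_line=True), a minimal sketch might look like the following; the whitespace tokenizer, the unused filter argument, and the byte-string lines are assumptions, and the real helper presumably runs a Japanese morphological analyzer such as MeCab:

def tokenize_file(datafile, filter={}, include_line=False):
    # Hypothetical sketch inferred from the call sites on this page;
    # the real implementation is not shown.
    with open(datafile, "rb") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            # Assumption: plain whitespace splitting stands in for the
            # project's real (likely MeCab-based) tokenizer.
            tokens = line.decode("utf-8").split()
            yield (tokens, line) if include_line else tokens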
Example #2
def _init_model(self, datafiles):
    # Convert every tokenized line to a bag-of-words vector and fit a
    # TF-IDF model over the whole corpus.
    corpus = []
    for datafile in datafiles:
        corpus.extend([
            self._dictionary.doc2bow(line)
            for line in myutil.tokenize_file(datafile)
        ])
    return models.TfidfModel(corpus)
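The to_feature_vec method used by the classification helpers further down is also not shown on this page. With a gensim TfidfModel like the one returned above, one plausible (entirely hypothetical) implementation densifies the weighted bag-of-words vector; the self._model and self._dictionary attributes are assumed:

from gensim import matutils

def to_feature_vec(self, tokens):
    # Hypothetical sketch: weight the bag-of-words vector with the fitted
    # TF-IDF model and expand it into a dense per-term weight list.
    bow = self._dictionary.doc2bow(tokens)
    return matutils.sparse2full(self._model[bow], len(self._dictionary))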
Example #4
def _init_model(self, datafiles):
    # Build a bag-of-words corpus from every datafile and train an LDA
    # topic model on it.
    corpus = []
    for datafile in datafiles:
        corpus.extend([
            self._dictionary.doc2bow(line)
            for line in myutil.tokenize_file(datafile)
        ])
    return models.LdaModel(corpus=corpus,
                           id2word=self._dictionary,
                           num_topics=self._num_topics)
Example #5
def classify_hcluster(datafiles, model, num_disp=-1):
    # Hierarchically cluster every line of the given files by its feature
    # vector and display the result as a dendrogram.
    feature_vecs = []
    lines = []
    for datafile in datafiles:
        for (tokens, line) in myutil.tokenize_file(datafile,
                                                   include_line=True):
            feature_vec = model.to_feature_vec(tokens)
            feature_vecs.append(feature_vec)
            lines.append(line.decode("utf-8"))
    # A negative num_disp means "cluster and label every line".
    limit = num_disp if num_disp >= 0 else None
    result = linkage(feature_vecs[:limit],
                     metric="chebyshev",
                     method="average")
    #print(result)
    dendrogram(result, labels=lines[:limit])
    show()
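Example #5 assumes SciPy's hierarchical-clustering helpers and a Matplotlib-style show() are already in scope; a plausible set of imports (not shown in the original snippet) would be:

from scipy.cluster.hierarchy import linkage, dendrogram
from matplotlib.pyplot import show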
Example #6
def classify_best(datafiles, model, map):
    # Assign each line to the category whose feature weight is highest.
    for datafile in datafiles:
        for (tokens, line) in myutil.tokenize_file(datafile,
                                                   include_line=True):
            feature_vec = model.to_feature_vec(tokens)
            category_candidate = -1
            best_score = 0
            for i, score in enumerate(feature_vec):
                if score > best_score:
                    best_score = score
                    category_candidate = i
            # Group the raw lines in map, keyed by the winning category.
            map.setdefault(category_candidate, []).append(line)
Example #7
def classify_kmeans(datafiles, model, map, num_categories):
    # Cluster every line of the given files into num_categories groups
    # by running k-means over the feature vectors.
    feature_vecs = []
    lines = []
    for datafile in datafiles:
        for (tokens, line) in myutil.tokenize_file(datafile,
                                                   include_line=True):
            feature_vec = model.to_feature_vec(tokens)
            feature_vecs.append(feature_vec)
            lines.append(line)
    features = np.array(feature_vecs)
    kmeans_model = KMeans(n_clusters=num_categories,
                          random_state=10).fit(features)
    labels = kmeans_model.labels_
    # Group the raw lines in map, keyed by their cluster label.
    for label, line in zip(labels, lines):
        map.setdefault(label, []).append(line)
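Like Example #5, this snippet relies on imports that the listing does not show, presumably something along the lines of:

import numpy as np
from sklearn.cluster import KMeans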
    print("num_categories=%d, dictionary=%s, infile=%s, outfile=%s" %
          (num_categories, options.dict_file, options.infile, options.outfile))

    # Word dictionary
    dictionary = corpora.Dictionary.load_from_text(options.dict_file)

    # Training/evaluation dataset list
    datasets = [options.infile]

    # Feature-extraction model
    # model = HDPModel(dictionary, datasets, num_categories)
    # model = LDAModel(dictionary, datasets, num_categories)
    # model = LSIModel(dictionary, datasets, num_categories)
    model = TFIDFModel(dictionary, datasets)
    # model = BoWModel(dictionary)

    # Show the features (or topic contents)
    model.show_topics()
    tokens = myutil.tokenize("ジョブズが最新の携帯モデルを発表する")  # "Jobs announces the latest phone model"

    print(len(model.to_feature_vec(tokens)))
    print(model.to_feature_vec(tokens))

    tokens_line = []
    for (tokens, line) in myutil.tokenize_file("data/it1.txt",
                                               include_line=True):
        tokens_line.extend(tokens)

    print(len(model.to_feature_vec(tokens_line)))
    print(model.to_feature_vec(tokens_line))
Example #9
def _init_model(self, datafiles):
    # Build a BoW corpus, weight it with TF-IDF, then train an LSI model.
    corpus = []
    for datafile in datafiles:
        corpus.extend([self._dictionary.doc2bow(line)
                       for line in myutil.tokenize_file(datafile)])
    tfidf = models.TfidfModel(corpus)
    return models.LsiModel(corpus=tfidf[corpus],
                           id2word=self._dictionary,
                           num_topics=self._num_topics)
Example #10
    optParser.add_option("-o", dest="outfile", default="work/out.tsv");
    optParser.add_option("-c", dest="num_categories", default="2");
    optParser.add_option("-d", dest="dict_file", default="work/dictionary");
    (options, args) = optParser.parse_args();
    num_categories = int(options.num_categories);
    print("num_categories=%d, dictionary=%s, infile=%s, outfile=%s" % (num_categories, options.dict_file, options.infile, options.outfile));
    
    # ワード辞書
    dictionary = corpora.Dictionary.load_from_text(options.dict_file);
    # 学習/評価用データセットリスト
    datasets = [options.infile]
    model = TFIDFModel(dictionary, datasets)

    feature_vec_list = {}
    tokens_line = []
    for (tokens, line) in myutil.tokenize_file("data/it1.txt", include_line=True):
        tokens_line.extend(tokens)
    feature_vec_list.update({"it1":model.to_feature_vec(tokens_line)})

    tokens_line = []
    for (tokens, line) in myutil.tokenize_file("data/it2.txt", include_line=True):
        tokens_line.extend(tokens)
    feature_vec_list.update({"it2":model.to_feature_vec(tokens_line)})

    tokens_line = []
    for (tokens, line) in myutil.tokenize_file("data/it3.txt", include_line=True):
        tokens_line.extend(tokens)
    feature_vec_list.update({"it3":model.to_feature_vec(tokens_line)})

    tokens_line = []
    for (tokens, line) in myutil.tokenize_file("data/it4.txt", include_line=True):