def test_transformer_idf_setter():
    """Check that assigning ``idf_`` reproduces a fitted transformer's output.

    A ``TfidfTransformer`` is fitted on the term counts of ``JUNK_FOOD_DOCS``;
    its ``idf_`` is then assigned to a second, unfitted transformer, and both
    are required to transform the same counts into equal dense arrays.
    """
    counts = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
    fitted = TfidfTransformer().fit(counts)

    # The second transformer is never fitted; it only receives idf_.
    assigned = TfidfTransformer()
    assigned.idf_ = fitted.idf_

    from_assigned = assigned.transform(counts).toarray()
    from_fitted = fitted.transform(counts).toarray()
    assert_array_equal(from_assigned, from_fitted)
def test_transformer_idf_setter():
    """Verify that a transformer given ``idf_`` by assignment matches a fitted one.

    Term counts are built from ``JUNK_FOOD_DOCS``, one ``TfidfTransformer`` is
    fitted on them, and its ``idf_`` is assigned to a fresh transformer. The
    dense outputs of both transformers on the same counts must be equal.
    """
    term_counts = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
    reference = TfidfTransformer().fit(term_counts)

    # Populate the fresh transformer through the idf_ setter instead of fit().
    duplicate = TfidfTransformer()
    duplicate.idf_ = reference.idf_

    duplicate_dense = duplicate.transform(term_counts).toarray()
    reference_dense = reference.transform(term_counts).toarray()
    assert_array_equal(duplicate_dense, reference_dense)
def tfidf_test(data, tf_vocab, idf_diag):
    """Transform *data* into a dense TF-IDF array using preset vocabulary and idf.

    Args:
        data: The input sentences (per the original note), passed to
            ``CountVectorizer.transform``.
        tf_vocab: Vocabulary handed to ``CountVectorizer(vocabulary=...)``.
        idf_diag: Value assigned to the ``idf_`` attribute of a new
            ``TfidfTransformer``.
            NOTE(review): the name suggests a diagonal matrix; confirm the
            caller passes what the ``idf_`` setter expects.

    Returns:
        The result of ``TfidfTransformer.transform`` on the term counts,
        converted with ``toarray()``.
    """
    # The vectorizer's transform yields a sparse representation.
    term_counts = CountVectorizer(vocabulary=tf_vocab).transform(data)

    weighter = TfidfTransformer()
    weighter.idf_ = idf_diag

    return weighter.transform(term_counts).toarray()
def _eval_model_file(path):
    """Read the UTF-8 text file at *path* and return ``eval`` of its contents."""
    with open(path, 'r', encoding='UTF-8') as handle:
        text = handle.read()
    # SECURITY: eval() executes whatever expression the file contains. It is
    # kept because the on-disk format is not visible from here; only point
    # this at trusted files.
    return eval(text)


def model_forTypeFinal(tags_final):
    """Predict a label for a "、"-separated tag string using the saved model.

    The tags are split on "、", restricted to those present in the saved
    feature-name list, joined with spaces into a single document, counted
    with the saved vocabulary, weighted with the saved idf values, and
    passed to the pickled classifier. The first prediction is looked up in
    ``id2c_mapping``.

    Args:
        tags_final: String of tags separated by "、".

    Returns:
        ``id2c_mapping[...]`` entry for the first predicted value.

    Raises:
        OSError: If any of the model files under ``/home/stu/model/`` cannot
            be opened.
        KeyError / IndexError: Propagated from the ``id2c_mapping`` lookup,
            depending on that object's type.
    """
    known_features = set(
        _eval_model_file('/home/stu/model/new_feature_names1.txt'))

    # Keep only the tags the model knows about; duplicates collapse and the
    # resulting order is whatever the set intersection yields.
    tags_final_forModel = tags_final.split("、")
    tags_setted = list(set(tags_final_forModel) & known_features)
    x_test = [' '.join(tags_setted)]

    vocab = _eval_model_file('/home/stu/model/new_vocabulary.txt')
    idf_all = np.asarray(_eval_model_file('/home/stu/model/new_idf_all.txt'))

    count_v2 = CountVectorizer(vocabulary=vocab)
    counts_test = count_v2.transform(x_test)

    # Assign the stored idf values instead of fitting the transformer.
    tfidftransformer = TfidfTransformer()
    tfidftransformer.idf_ = idf_all
    x_test = tfidftransformer.transform(counts_test)

    model_path = '/home/stu/model/new_clf.pickle'
    # SECURITY: pickle.load can execute arbitrary code; the model file must
    # come from a trusted source.
    with open(model_path, "rb") as model_file:
        model = pickle.load(model_file)

    y_pred = model.predict(x_test)
    preds = y_pred.tolist()
    return id2c_mapping[preds[0]]