def analyse_doc(self, testFile):
    # Build the filename for the cleaned/segmented document, then clean and word-segment it
    targetfilename1 = testFile.split('.')[0] + '_cut_split.txt'
    preprocess.preprocess_doc(testFile, targetfilename1)
    # Build the filename for the stopword-filtered document, then remove stopwords
    targetfilename2 = targetfilename1.split('.')[0] + '_stop.txt'
    stopwords.stopWord_doc(targetfilename1, targetfilename2)
    # Collect the preprocessed sentences and run prediction
    sentences = []
    with open(targetfilename2, "r") as f:
        for line in f:
            sentences.append(line.strip())  # drop trailing newlines before prediction
    ans = self.model.predict(sentences)
    return ans
def test_doc(self, testFile, category=None, process=True):
    '''
    :param testFile: document to test on
    :param category: category the document belongs to, chosen from cate_dict
    :param process: whether the input document needs preprocessing
    :return: test result
    '''
    if process:
        # Build the filename for the cleaned/segmented document, then clean and word-segment it
        targetfilename1 = testFile.split('.')[0] + '_cut_split.txt'
        preprocess.preprocess_doc(testFile, targetfilename1)
        # Build the filename for the stopword-filtered document, then remove stopwords
        targetfilename2 = targetfilename1.split('.')[0] + '_stop.txt'
        stopwords.stopWord_doc(targetfilename1, targetfilename2)
        # Attach the category label to the preprocessed document
        testFile = targetfilename1.split('.')[0] + '_labeled.txt'
        prepare_data.prepare_data(targetfilename2, testFile, category)
    # Run the test
    ans = self.model.test(testFile)
    return ans
def train(self, trainFile, category, process=True):
    '''
    :param trainFile: document to train on
    :param category: category the document belongs to, chosen from cate_dict
    :param process: whether the input document needs preprocessing
    :return:
    '''
    if process:
        # Build the filename for the cleaned/segmented document, then clean and word-segment it
        targetfilename1 = trainFile.split('.')[0] + '_cut_split.txt'
        preprocess.preprocess_doc(trainFile, targetfilename1)
        # Build the filename for the stopword-filtered document, then remove stopwords
        targetfilename2 = targetfilename1.split('.')[0] + '_stop.txt'
        stopwords.stopWord_doc(targetfilename1, targetfilename2)
        # Attach the category label to the preprocessed document
        trainFile = targetfilename1.split('.')[0] + '_labeled.txt'
        prepare_data.prepare_data(targetfilename2, trainFile, category)
    # Train a supervised fastText classifier and save it to disk
    classifier = fasttext.supervised(trainFile, 'model/classifier2.model', label_prefix='__label__')
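# Minimal usage sketch for the fastText-based methods above. The class name
# `FastTextClassifier`, the file paths, and the category 'sports' are assumptions for
# illustration only; loading the saved 'model/classifier2.model' back into self.model
# is assumed to happen elsewhere in the class.
if __name__ == '__main__':
    clf = FastTextClassifier()
    # preprocess the raw corpus, label it, and train the supervised fastText model
    clf.train('data/sports_corpus.txt', category='sports')
    # evaluate on a held-out labeled document
    result = clf.test_doc('data/sports_heldout.txt', category='sports')
    # predict labels for an unseen raw document
    labels = clf.analyse_doc('data/unknown_doc.txt')
    print(result, labels)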
def categorize(self, raw_doc):
    # preprocess the raw document and transform it into the padded embedding input
    doc = preprocess_doc(raw_doc[0], self.stop_words)
    doc_input = embedding_lookup([doc], self.vectorizer.w2v_embeddings,
                                 self.seq_length, self.embed_size)
    with tf.Session(graph=self.graph) as session:
        # restore the trained weights from the last checkpoint
        self.saver.restore(
            session,
            paths.checkpoint + "/" + self.model.name + "-" + str(self.num_epochs))
        # run the network on the document
        [predicted_label] = session.run([self.model.prediction],
                                        {self.model.inputs: doc_input})
    # return the predicted class
    return classes[predicted_label[0]]
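import numpy as np

# A rough sketch of what embedding_lookup is assumed to do above: map each token of a
# document to its pretrained word2vec vector and pad/truncate to seq_length. This is an
# assumption for illustration (including the dict-like w2v_embeddings mapping); the
# project's own embedding_lookup may differ.
def embedding_lookup_sketch(docs, w2v_embeddings, seq_length, embed_size):
    batch = np.zeros((len(docs), seq_length, embed_size), dtype=np.float32)
    for i, doc in enumerate(docs):
        for j, token in enumerate(doc[:seq_length]):
            if token in w2v_embeddings:
                batch[i, j] = w2v_embeddings[token]  # unknown tokens stay as zero vectors
    return batch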
user_input = input("Make your query:\n")
# Ask which preprocessing steps were applied during training so the query gets the same treatment
rm_sw = input("Stopwords were removed during training? [Yes/No]: \n").lower() != 'no'
stem = input("Stemming was applied during training? [Yes/No]: \n").lower() != 'no'
if rm_sw:
    filter_funcs.append(pp.remove_stopwords)
if stem:
    filter_funcs.append(pp.stem_text)
query = preprocess_doc(string_to_dict(user_input), filter_funcs=filter_funcs)['words']
print(query)

# Collect the inference hyperparameters to try and the number of recommendations to return
alphas_user = input("Try different starting learning rates (comma separated):\n")
alphas = [float(a) for a in alphas_user.split(',')]
steps_user = input("Try different numbers of steps for inference (comma separated): \n")
steps = [int(s) for s in steps_user.split(',')]
top_k = int(input("How many recommendations? : \n"))
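# A sketch of how the collected parameters are presumably used: infer a vector for the
# preprocessed query with each (alpha, steps) combination and look up the top_k most
# similar training documents. The model path and the pre-4.0 gensim Doc2Vec API
# (infer_vector with `steps`, docvecs.most_similar) are assumptions, not taken from the
# original script.
from gensim.models import Doc2Vec

model = Doc2Vec.load('doc2vec.model')  # hypothetical model path
for alpha in alphas:
    for n_steps in steps:
        query_vec = model.infer_vector(query, alpha=alpha, steps=n_steps)
        recommendations = model.docvecs.most_similar(positive=[query_vec], topn=top_k)
        print("alpha={}, steps={}:".format(alpha, n_steps))
        for doc_tag, similarity in recommendations:
            print("  {}: {:.3f}".format(doc_tag, similarity))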