def fit(self):
    '''
    Train the model and save it to the output path.
    :return:
    '''
    fastText.train_supervised(self.train_input, label="__label__").save_model(self.output)
def test_fasttext():
    import os
    from fastText import train_supervised

    data_dir = "/home/davidyu/stock/scripts/davidyu_stock/scripts/analysis/news_report"
    train_data = os.path.join(data_dir, "train_text.txt")
    valid_data = os.path.join(data_dir, "valid_text.txt")
    # the first model is discarded; only the softmax variant is evaluated
    model = train_supervised(input=train_data, epoch=50, lr=0.5, wordNgrams=2,
                             dim=100, verbose=2, minCount=1, thread=1)
    model = train_supervised(input=train_data, epoch=50, lr=0.5, wordNgrams=2,
                             dim=100, verbose=2, minCount=1, thread=1, loss="softmax")
    print_results(*model.test(valid_data))
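# Several of these snippets call a print_results helper without defining it.
# The standalone example near the end of this section defines one; its
# definition is reproduced here for reference:
def print_results(N, p, r):
    # N is the number of evaluated examples; p and r are precision@1 and recall@1
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(1, p))
    print("R@{}\t{:.3f}".format(1, r))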
def train_model(fname_prefix: str, out_fname: str, label_prefix: str = "__label__", **kwargs):
    # Train the model
    import fastText
    import logging

    params = {
        "dim": kwargs.get("dim", 300),
        "epoch": kwargs.get("epoch", 1000),
        "wordNgrams": kwargs.get("wordNgrams", 2),
        "verbose": kwargs.get("verbose", 2),
        "minCount": kwargs.get("minCount", 15),
        "minCountLabel": kwargs.get("minCountLabel", 5),
        "lr": kwargs.get("lr", 0.1),
        "neg": kwargs.get("neg", 10),
        "thread": kwargs.get("thread", 16),
        "loss": kwargs.get("loss", "ns"),
        "t": kwargs.get("t", 1e-5),
    }
    logging.info("Training fastText model", extra={"params": params})
    model = fastText.train_supervised(input="%s.train" % fname_prefix, label=label_prefix, **params)
    logging.info("Writing model to disk", extra={"output_file": out_fname})
    model.save_model(out_fname)
    return model
def train_sup(mode=fasttextConfig.create_data_word):
    # train supervised model
    print('start train supervised fasttext model')
    # resolve input/output paths for word- or char-level data
    if mode == fasttextConfig.create_data_word:
        input_path = fileConfig.dir_fasttext + fileConfig.file_fasttext_sup_train_word_data
        output_path = fileConfig.dir_fasttext + fileConfig.file_fasttext_sup_word_model
        test_path = fileConfig.dir_fasttext + fileConfig.file_fasttext_sup_test_word_data
    elif mode == fasttextConfig.create_data_char:
        input_path = fileConfig.dir_fasttext + fileConfig.file_fasttext_sup_train_char_data
        output_path = fileConfig.dir_fasttext + fileConfig.file_fasttext_sup_char_model
        test_path = fileConfig.dir_fasttext + fileConfig.file_fasttext_sup_test_char_data
    else:
        raise ValueError("unknown mode: %s" % mode)
    # train, evaluate, and save the model
    model = fastText.train_supervised(input=input_path, dim=200, epoch=100, lr=1.0,
                                      wordNgrams=3, ws=7, verbose=2, minCount=1,
                                      thread=8, loss='hs')
    print_results(*model.test(test_path))
    model.save_model(output_path)
    print("train sup fasttext finish")
def fit(session, uid, path):
    labeled_text = dal.get_text_labeled_text(session, uid)
    # not sure if this is the right prerequisite for fastText
    if len(labeled_text['targets']) < 2:
        return
    # preprocess training data one line at a time in order to limit pipe buffer issues
    preprocessed_data = [preprocess(datum) for datum in labeled_text['data']]
    # create a new temporary training data file
    fd, train_path = tempfile.mkstemp()
    # close the temporary training data file descriptor as we don't need it
    os.close(fd)
    # fill the temporary training data file
    with open(train_path, 'w') as f:
        for (target, datum) in zip(labeled_text['targets'], preprocessed_data):
            f.write(label_prefix + target + " " + datum + "\n")
    # train the fasttext model
    model = fastText.train_supervised(input=train_path)
    # compress the fasttext model to save space
    # (disabled for now because it requires at least 256 rows)
    # model.quantize(input=train_path)
    # delete the temporary training data file
    os.unlink(train_path)
    # serialize the model out to the temporary model file
    model.save_model(path)
def fasttext_train(input, output):
    import fastText as ft

    # dim must match the dimensionality of the pretrained vectors (300 here)
    model = ft.train_supervised(
        input=input, dim=300,
        pretrainedVectors='wiki-news-300d-1M-subword.vec')
    model.save_model(output)
def fit(self, X, y):
    if not isinstance(X, np.ndarray):
        X = np.array(X)
    if not isinstance(y, np.ndarray):
        y = np.array(y)
    path = data_to_temp(X, self.label, y)
    self._num_classes = len(np.unique(y))
    self._model = fastText.train_supervised(path,
                                            lr=self.lr,
                                            dim=self.dim,
                                            ws=self.ws,
                                            epoch=self.epoch,
                                            minCount=self.minCount,
                                            minCountLabel=self.minCountLabel,
                                            minn=self.minn,
                                            maxn=self.maxn,
                                            neg=self.neg,
                                            wordNgrams=self.wordNgrams,
                                            loss=self.loss,
                                            bucket=self.bucket,
                                            thread=self.thread,
                                            lrUpdateRate=self.lrUpdateRate,
                                            t=self.t,
                                            label=self.label,
                                            verbose=self.verbose)
    os.remove(path)
    # dump the output-layer (softmax) matrix and keep it as per-class embeddings
    fd, path = tempfile.mkstemp()
    os.close(fd)  # close the descriptor; we only need the path
    self._model.save_softmax(path)
    self.class_embeddings = pd.read_csv(
        path, skiprows=[0], delimiter=' ').dropna(axis=1)
    os.remove(path)
    return self
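# The data_to_temp helper used by the sklearn-style wrappers above is not shown.
# A minimal sketch of what it presumably does -- write one
# "<label_prefix><y> <text>" line per sample to a temporary file and return the
# path; the exact signature and file format are assumptions:
import os
import tempfile


def data_to_temp(X, label_prefix, y):
    fd, path = tempfile.mkstemp()
    with os.fdopen(fd, 'w') as f:
        for text, target in zip(X, y):
            f.write("%s%s %s\n" % (label_prefix, target, text))
    return path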
def trainFastText(self, sents, labls, argsOut):
    if argsOut.load:
        # load given model instead of training
        print("Loading fastText model {0} ..".format(argsOut.load))
        return load_model(argsOut.load)
    outFile = "ft_runTrain{0}.txt".format(time.time())
    labls = self.roundLabelsRegress(labls)
    # prefix each sentence with its label; sentences are assumed to end with a newline
    for i in range(0, len(sents)):
        sents[i] = "__label__{0} {1}".format(labls[i], sents[i])
    with open(outFile, mode="w", encoding="utf-8") as file:
        file.writelines(sents)
    # Get model
    model = train_supervised(input=outFile, epoch=argsOut.epochs, dim=argsOut.dim,
                             bucket=10000000, lr=argsOut.lr,
                             wordNgrams=argsOut.wordNgrams, verbose=2, minCount=1)
    os.remove(outFile)
    return model
def train():
    if FLAGS.word_level:
        train_file = 'data/train_w.fast'
        dev_file = 'data/dev_w.fast'
        model_file = 'temp/ml/fast_model_w.bin'
    else:
        train_file = 'data/train_c.fast'
        dev_file = 'data/dev_c.fast'
        model_file = 'temp/ml/fast_model_c.bin'
    model = fastText.train_supervised(input=train_file, dim=100, epoch=15, thread=40,
                                      minCount=10, loss='softmax', wordNgrams=2)
    model.save_model(model_file)
    # model.test returns (number of examples, precision@1, recall@1)
    n_examples, p, r = model.test(train_file)
    f1 = (p * r * 2) / (p + r)
    print('Train: n:%d p:%.5f r:%.5f f1:%.5f' % (n_examples, p, r, f1))
    n_examples, p, r = model.test(dev_file)
    f1 = (p * r * 2) / (p + r)
    print('Dev: n:%d p:%.5f r:%.5f f1:%.5f' % (n_examples, p, r, f1))
def fit(self, X, y):
    '''
    Train the fastText classifier on the provided dataset. If a file with a
    serialized model trained on the exact same dataset exists, just load the
    model; otherwise train the classifier and save the model.
    :param X: the set of instances in the dataset
    :param y: the set of labels in the dataset
    :return: None
    '''
    file_identifier = self._get_identifier_for_model_file(X, y)
    serialized_file = os.path.join(self.folder, file_identifier)
    if os.path.isfile(serialized_file):
        print("Loading the model from file " + str(serialized_file))
        # the file already exists, so fitting means just loading the model
        self.model = load_model(serialized_file)
    else:
        print("Training the classifier " + str(self.name))
        # the file does not exist, so we have to train the model
        with open(SWAP_FILE, 'w', encoding='utf8') as f:
            for tweet, label in zip(X, y):
                f.write(LABEL_IDENTIFIER + str(label) + SEPARATOR + tweet.strip() + '\n')
        self.model = train_supervised(SWAP_FILE, **self.params)
        os.remove(SWAP_FILE)  # delete the swap file after use
        # lastly, save the model; create the classifier's folder first if needed
        os.makedirs(self.folder, exist_ok=True)
        self.model.save_model(serialized_file)
def fit(self, X, y):
    # Fit model
    if not isinstance(X, np.ndarray):
        X = np.array(X)
    if not isinstance(y, np.ndarray):
        y = np.array(y)
    path = data_to_temp(X, self.label, y)
    self._num_classes = len(np.unique(y))
    self._model = fastText.train_supervised(path,
                                            lr=self.lr,
                                            dim=self.dim,
                                            ws=self.ws,
                                            epoch=self.epoch,
                                            minCount=self.minCount,
                                            minCountLabel=self.minCountLabel,
                                            minn=self.minn,
                                            maxn=self.maxn,
                                            neg=self.neg,
                                            wordNgrams=self.wordNgrams,
                                            loss=self.loss,
                                            bucket=self.bucket,
                                            thread=self.thread,
                                            lrUpdateRate=self.lrUpdateRate,
                                            t=self.t,
                                            label=self.label,
                                            verbose=self.verbose)
    os.remove(path)
    return self
def _train_tagging_model(self, output, quantization=False):
    """Train a tagging model using an annotated file and save it to output.

    Pre-processing of the input file must have been done prior to training.
    The training file should be utf-8. The model can be quantized to reduce
    memory usage (but quantization is quite expensive).
    """
    log.info('Start training model.')
    new_model = fastText.train_supervised(input=self.tempfilename,
                                          epoch=self.epoch,
                                          lr=self.lr,
                                          wordNgrams=self.wordNgrams,
                                          minCount=self.minCount,
                                          dim=self.dim,
                                          loss=self.loss,
                                          thread=self.thread,
                                          neg=self.neg)
    log.info('Training model done.')
    if quantization:
        log.info('Start quantization.')
        new_model.quantize(thread=self.thread, retrain=False)
        log.info('Quantization done.')
    model_file = '{}/{}.bin'.format(resources_path, output)
    new_model.save_model(model_file)
    log.info('Model saved at {}.'.format(model_file))
    return model_file
def get_predict(train_path, test_path):
    """
    :param train_path: preprocessed data ready for training
    :param test_path: preprocessed, unlabeled data to predict on
    """
    import fastText

    classifier = fastText.train_supervised(
        train_path,
        lr=0.3,
        dim=200,
        epoch=8,
        minn=1,
        maxn=4,
        wordNgrams=2,
        loss='hs',
    )
    test_data = np.load(test_path)
    li = []
    for i in test_data:
        pre = classifier.predict(i[:-1], k=10)
        # each sentence maps to a 2-D array of rows like [['价格', '0', '0.65184933'], ...]
        result = handle_ft_predict_result(pre)
        li.append(result)
    loc_predict_probability = rename(test_path, 'probability_fastT_subOnly.npy')
    np.save(loc_predict_probability, li)
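# handle_ft_predict_result is not defined in these snippets. Judging from the
# comment above, a plausible sketch; the exact output layout, including the
# meaning of the middle column, is an assumption:
def handle_ft_predict_result(pre):
    labels, probs = pre  # fastText's predict returns (labels, probabilities)
    # strip the "__label__" prefix; the middle column mirrors the rank index
    return [[label.replace("__label__", ""), str(idx), str(prob)]
            for idx, (label, prob) in enumerate(zip(labels, probs))]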
def process(self, data_file):
    json_object = self.load_json(data_file)
    json_object = self.replace_entities(json_object)
    self.load_answers(json_object)
    self.create_labeled_questions_file(json_object, "fasttext_data.txt")
    self.fast_text_model = fastText.train_supervised("fasttext_data.txt", epoch=50)
def train(self, data, labels, dim=100, ng=2, epoch=10):
    if len(data) != len(labels):
        raise ValueError("Length of data (" + str(len(data)) +
                         ") does not match length of labels (" + str(len(labels)) + ")")
    # Convert training data into strings for fastText
    mapped_report_strs = []
    for i in range(len(data)):
        report_string = data[i].replace("\n", " ")
        label = " __label__" + str(labels[i])
        mapped_report_strs.append(report_string + label)
    shuffle(mapped_report_strs)
    # Write strings to a temp file
    train_path = "./MODEL_TRAIN_TEMP.bin"
    with open(train_path, 'w') as outfile:
        for mrs in mapped_report_strs:
            outfile.write(mrs)
            outfile.write("\n")
    # Train fastText model
    self.model = fastText.train_supervised(train_path, dim=dim, epoch=epoch,
                                           thread=4, wordNgrams=ng)
    # Delete temp file
    os.remove(train_path)
def fasttext_classify(data, extra_params={}):
    class_to_predict = 'type'  # product importance
    data[class_to_predict] = data[class_to_predict].map(lambda s: s.replace(" ", ""))
    data_for_fasttext = data['text'] + ' __label__' + data[class_to_predict]
    data_for_fasttext = shuffle(data_for_fasttext, random_state=77)
    num_records = len(data_for_fasttext)
    data_train = data_for_fasttext[:int(0.85 * num_records)]
    data_test = data_for_fasttext[int(0.85 * num_records):]
    data_train.to_csv(TRAIN_PATH, sep='\t', header=False, index=False)
    data_test.to_csv(TEST_PATH, sep='\t', header=False, index=False)
    model = fastText.train_supervised(TRAIN_PATH, **extra_params)
    # model.save_model(MODEL_PATH)
    # model.test returns (N, precision@1, recall@1)
    print('Training accuracy:')
    train_accuracy = model.test(TRAIN_PATH)
    print(train_accuracy[-1])
    print('Test accuracy:')
    test_accuracy = model.test(TEST_PATH)
    print(test_accuracy[-1])
    y_pred = []
    y_true = []
    for test_item in data_test:
        test_text, test_label = test_item.split('__label__')
        # predict returns (labels, probabilities); take the top label string
        y_pred.append(model.predict(test_text)[0][0])
        y_true.append('__label__' + test_label)
    print('F1 score: ' + str(f1_score(y_true, y_pred, average='weighted')))
    return test_accuracy[-1]  # last element of the (N, p, r) tuple
def train_model(self, traverse_list):
    """
    Used for the LR and FT models, as each step of the model outputs a probability.
    :param traverse_list:
    :return: None
    """
    for i, node in enumerate(traverse_list):
        print("Node key: " + node.key)
        print("Child keys:")
        print(node.child.keys())
        print("Trained {0} models".format(i))
        if len(node.child.keys()) <= 2:
            continue
        if node.key == "*":
            continue
        if not node.child:
            continue
        train_data = self.get_train_data(node)
        if self.tree_type == "FT":
            node.classifier = train_supervised(input=train_data)
        elif self.tree_type == "LR":
            node.classifier = LogisticRegression(multi_class='multinomial', solver='newton-cg')
            node.classifier.fit(train_data[0], train_data[1])
        elif self.tree_type == "SVM":
            node.classifier = LinearSVC()
            node.classifier.fit(train_data[0], train_data[1])
def train(self):
    master = master_file.MasterFile()
    data = master.get_training_data()
    traning = traning_file.TraningFile()
    traning.reset()
    hasText = False
    for uuid in data:
        for name in data[uuid]:
            traning.append('__label__' + uuid + ' , ' +
                           app.get_wakati(data[uuid][name]['title']))
            hasText = True
    if hasText is False:
        return
    self._model = train_supervised(input=app.get_output_train_text(),
                                   epoch=200, lr=0.7, wordNgrams=2, loss="hs", dim=100)
    with self._lock:
        self.__print_results(*self._model.test(app.get_output_train_text()))
        self._model.save_model(app.get_output_fasttext_model())
def trainModel(request):
    if request.method == 'POST':
        epoch = int(request.POST['epochValue'])
        lr = float(request.POST['learningRateValue'])
        user = request.session['userEmail']
        user = user.split('@')
        # Train the model; pass lr and epoch as keyword arguments so that epoch
        # is not silently consumed as the positional dim parameter
        model = fastText.train_supervised(
            settings.BASE_DIR + '/media/temp/' + user[0] + 'trainData.txt',
            lr=lr, epoch=epoch)
        # Evaluate on the held-out test file
        result = model.test(settings.BASE_DIR + '/media/temp/' + user[0] + 'testData.txt')
        precision = float(result[1])
        recall = float(result[2])
        # Compute the F-measure (harmonic mean of precision and recall)
        Fmeasure = 2 * ((precision * recall) / (precision + recall))
        Fmeasure = "{:.2%}".format(Fmeasure)
        # Save model
        model.save_model(settings.BASE_DIR + '/media/temp/' + user[0] + 'trainedModel.bin')
        data = {'status': Fmeasure}
        return JsonResponse(data)
def run(ps, i):
    # sample random hyperparameters
    lr = random.uniform(0, 1)
    epoch = round(random.uniform(5, 50))
    wordNgrams = round(random.uniform(1, 5))
    minCount = round(random.uniform(1, 10))
    model = fastText.train_supervised(input=ps[0], lr=lr, epoch=epoch,
                                      wordNgrams=wordNgrams, minCount=minCount)
    # done this way because we want the per-example predictions,
    # not just an aggregate metric from the model
    preds = Path(ps[1]).read_text().split('\n')
    truth = []
    output = []
    for p in preds:
        label = p[:10]  # assumes labels are exactly "__label__X" (10 characters)
        text = p[11:]
        truth.append(label)
        output.append(model.predict(text)[0][0])
    rpt = sklearn.metrics.classification_report(truth, output, output_dict=True)
    rpt['lr'] = lr
    rpt['epoch'] = epoch
    rpt['wordNgrams'] = wordNgrams
    rpt['minCount'] = minCount
    rpt['kappa'] = sklearn.metrics.cohen_kappa_score(truth, output)
    Path(ps[0].replace('train.csv', f"{i}_results.json")).write_text(json.dumps(rpt))
def get_predict(train_path, test_path):
    """
    :param train_path: preprocessed data ready for training
    :param test_path: preprocessed, unlabeled data to predict on
    """
    import fastText

    classifier = fastText.train_supervised(
        train_path,
        lr=0.3,
        dim=10,
        epoch=10,
        minn=1,
        maxn=4,
        wordNgrams=1,
        loss='hs',
    )
    test_data = np.load(test_path)
    li = []
    for i in test_data:
        pre = classifier.predict(i[:-1], k=10)
        # each sentence maps to a 2-D array of rows like [['价格', '0', '0.65184933'], ...]
        result = handle_ft_predict_result(pre)
        li.append(result)
    loc_predict_probability = rename(test_path, '/probability_fastT_multiLabel.npy')
    np.save(loc_predict_probability, li)
def fasttext_train_valid(train_X, train_y, model_dir, model_name, **kwargs):
    """
    :param train_X: training data file [type: txt]
    :param train_y: validation data file [type: txt]
    :param model_dir: directory the model is saved to [type: string]
    :param model_name: name for the model
    :param **kwargs: training parameters for the model
    :return clf, precision, recall: the classifier and its precision and recall scores
    """
    clf = train_supervised(input=train_X, **kwargs)
    result = clf.test(train_y)
    precision = result[1]
    recall = result[2]
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    __model = model_dir + model_name
    clf.save_model('%s.bin' % __model)
    __record = {
        'train_X': train_X,
        'train_y': train_y,
        'model_path': __model,
        'training_parameter': kwargs,
        'precision': precision,
        'recall': recall
    }
    pprint.pprint(__record, width=1)
    return clf, precision, recall
def model():
    # model = fastText.train_supervised(path + 'train_set.txt', label='__myprefix__',
    #                                   bucket=400000, wordNgrams=2, minCount=3,
    #                                   lr=1, lrUpdateRate=0)
    model = fastText.train_supervised(path + 't_train_set.txt', label='__myprefix__',
                                      bucket=39759, wordNgrams=3, minCount=3,
                                      lr=1, lrUpdateRate=200, dim=128)
    result = model.test(path + 't_test_set.txt')
    print(result)
    # model.save_model(model_path + 'model')
    true_labels = []
    all_words = []
    f = open(path + 't_test_set.txt', 'r')
    for line in f:
        words, labels = model.get_line(line.strip())
        if len(labels) == 0:
            continue
        all_words.append(" ".join(words))
        true_labels += [labels]
    predictions, _ = model.predict(all_words)
    # predictions are tuples while the true labels are lists, so compare
    # the top label of each rather than the containers themselves
    n = 0
    for i in range(len(true_labels)):
        if predictions[i][0] == true_labels[i][0]:
            n += 1
    print(n / len(true_labels))
def train():
    # (the training corpus must be prepared beforehand)
    ftrain = 'reviews_fasttext_train.txt'
    ftest = 'reviews_fasttext_test.txt'
    # train the model
    classifier = fastText.train_supervised(ftrain, label="__label__")
    classifier.save_model("reviews_fasttext.bin")
def retrain(self):
    if len(self.data) < self.config['models']['fast_text']['min_train_data']:
        return
    self.log.info('Retraining bilinear model, explore_rate = %.2f' % self.explore_rate)
    train_file = tempfile.NamedTemporaryFile(mode='w')
    # first pass: for every (file, tactic-prefix) pair, remember the tactic
    # that solved it with the lowest rlimit
    best_tactic = {}
    for scored_candidate, status in self.data:
        smt_file = scored_candidate.benchmarks[0].file
        tactics = get_tactics(scored_candidate.t)
        if status != ScoredCandidateStatus.SOLVED:
            continue
        for i, tactic in enumerate(tactics):
            fast_text_line = self.encode(tactics[:i])
            key = (smt_file, fast_text_line)
            if key not in best_tactic or best_tactic[key][1] > scored_candidate.rlimit:
                best_tactic[key] = (tactic.s, scored_candidate.rlimit)
            entry = '__label__%s %s' % (tactic.s, fast_text_line)
            train_file.write(entry + '\n')
    # second pass: emit discretized-parameter labels for the best candidates
    for scored_candidate, status in self.data:
        smt_file = scored_candidate.benchmarks[0].file
        tactics = get_tactics(scored_candidate.t)
        if status != ScoredCandidateStatus.SOLVED:
            continue
        for i, tactic in enumerate(tactics):
            params = self.strategy_enum.extract_params([tactic])[0]
            disc_idx = self.map_to_discretized(tactic.s, params)
            fast_text_line = self.encode(tactics[:i])
            key = (smt_file, fast_text_line)
            if best_tactic[key][1] == scored_candidate.rlimit:
                entry = '__label__%s_%d %s' % (tactic.s, disc_idx, fast_text_line)
                train_file.write(entry + '\n')
    train_file.flush()
    os.fsync(train_file.fileno())
    self.log.info('Created dataset of %d entries' % len(best_tactic))
    self.bilinear_model = fastText.train_supervised(
        input=train_file.name,
        epoch=self.config['models']['fast_text']['epoch'],
        lr=self.config['models']['fast_text']['lr'],
        wordNgrams=self.config['models']['fast_text']['ngrams'],
        verbose=self.config['models']['fast_text']['verbose'],
        minCount=self.config['models']['fast_text']['min_count'],
        dim=self.config['models']['fast_text']['dim'],
    )
def train():
    model = fastText.train_supervised(path + "/materials/fastTexttrain.input",
                                      dim=108, epoch=100, lr=0.001,
                                      pretrainedVectors=path + "/materials/umls.embeddings")
    model.save_model(path + "/materials/fastTexttrain.model")
    print_results(*model.test(path + "/materials/fastTextval.input"))
def fasttext(input, epoch=25, wordNgrams=2):
    model = train_supervised(input=input, epoch=epoch, lr=0.9,
                             wordNgrams=wordNgrams, verbose=2, minCount=1)
    return model
def _create_model(self):
    self.info('creating fastText model')
    trainpath = os.path.join(self.datadir, self.TRAIN_FILE)
    modelpath = os.path.join(self.datadir, self.MODEL_FILE)
    # keep only recognized fastText parameters, coercing each to its expected type
    params = {param: self.FASTTEXT_PARAMS[param](val)
              for param, val in self.params.items()
              if param in self.FASTTEXT_PARAMS}
    self._model = fastText.train_supervised(trainpath, **params)
    self._model.save_model(modelpath)
def build_supervised_model(data, kwargs):
    kwargs = default_kwargs(kwargs)
    with tempfile.NamedTemporaryFile(delete=False) as tmpf:
        for line in data:
            line = "__label__" + line.strip() + "\n"
            tmpf.write(line.encode("UTF-8"))
        tmpf.flush()
        model = train_supervised(input=tmpf.name, **kwargs)
    return model
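# A small usage sketch for the helper above, assuming each input line starts
# with its label token (so the "__label__" prefix lands on the label) and that
# default_kwargs fills in any missing training parameters:
data = [
    "positive great service and friendly staff",
    "negative slow shipping and a damaged box",
]
model = build_supervised_model(data, {"epoch": 25, "lr": 1.0})
print(model.predict("friendly staff and fast shipping"))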
def main(argv):
    input_file = argv[0]
    output_file = argv[1]
    model = ft.train_supervised(input=input_file, dim=200, epoch=30, lr=0.1,
                                label='__label__', thread=8)
    model.save_model(output_file)
def sup_test(self):
    def check(output_local, test_local, n_local, p1_local, r1_local, size_local, lessthan):
        test_args = self.default_test_args(output_local, test_local)
        test_output = self.get_test_output(test_args)
        self.assertEqual(
            str(test_output[0]), str(n_local),
            "N: Want: " + str(n_local) + " Is: " + str(test_output[0])
        )
        self.assertTrue(
            float(test_output[1]) >= float(p1_local),
            "p1: Want: " + str(p1_local) + " Is: " + str(test_output[1])
        )
        self.assertTrue(
            float(test_output[2]) >= float(r1_local),
            "r1: Want: " + str(r1_local) + " Is: " + str(test_output[2])
        )
        path_size = self.get_path_size(output_local)
        if lessthan:
            self.assertTrue(
                path_size <= size_local,
                "Size: Want at most: " + str(size_local) + " Is: " + str(path_size)
            )
        else:
            self.assertTrue(
                path_size == size_local,
                "Size: Want: " + str(size_local) + " Is: " + str(path_size)
            )

    train, test, output = self.build_paths(dataset + ".train", dataset + ".test", dataset)
    model = train_supervised(input=train, dim=10, lr=lr, wordNgrams=2, minCount=1,
                             bucket=10000000, epoch=5, thread=self.num_thread())
    model.save_model(output)
    check(output, test, n, p1, r1, size, False)
    # Exercising
    model.predict("hello world")
    model.quantize(input=train, retrain=True, cutoff=100000, qnorm=True)
    model.save_model(output + ".ftz")
    # Exercising
    model.predict("hello world")
    check(output + ".ftz", test, n, p1_q, r1_q, quant_size, True)
def sup_test(self):
    def get_path_size(path):
        path_size = subprocess.check_output(["stat", "-c", "%s", path]).decode('utf-8')
        return int(path_size)

    def check(model, model_filename, test, lessthan, msg_prefix=""):
        lines, labels = read_labels(test["data"])
        predictions = []
        for line in lines:
            pred_label, _ = model.predict(line)
            predictions.append(pred_label)
        p1_local_out, r1_local_out = util.test(predictions, labels)
        self.assertEqual(
            len(predictions), test["n"],
            msg_prefix + "N: Want: " + str(test["n"]) + " Is: " + str(len(predictions))
        )
        self.assertTrue(
            p1_local_out >= test["p1"],
            msg_prefix + "p1: Want: " + str(test["p1"]) + " Is: " + str(p1_local_out)
        )
        self.assertTrue(
            r1_local_out >= test["r1"],
            msg_prefix + "r1: Want: " + str(test["r1"]) + " Is: " + str(r1_local_out)
        )
        path_size = get_path_size(model_filename)
        size_msg = str(test["size"]) + " Is: " + str(path_size)
        if lessthan:
            self.assertTrue(path_size <= test["size"],
                            msg_prefix + "Size: Want at most: " + size_msg)
        else:
            self.assertTrue(path_size == test["size"],
                            msg_prefix + "Size: Want: " + size_msg)

    output = os.path.join(tempfile.mkdtemp(), configuration["dataset"])
    model = train_supervised(**configuration["train_args"])
    model.save_model(output + ".bin")
    check(model, output + ".bin", configuration["test"], False)
    model.quantize(**configuration["quant_args"])
    model.save_model(output + ".ftz")
    check(model, output + ".ftz", configuration["quant_test"], True, "Quant: ")
def do_fasttext(text, stars):
    # remove common words
    text_cleaned = []
    list_stopWords = list(set(stopwords.words('english')))
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']',
                            '&', '!', '*', '@', '#', '$', '%']
    d = enchant.Dict("en_US")
    for line in text:
        # tokenize
        list_words = word_tokenize(line.lower())
        # remove punctuation
        list_words = [word for word in list_words if word not in english_punctuations]
        # drop words that are not common English words (enchant dictionary check)
        # list_words = [word for word in list_words if wordnet.synsets(word)]
        list_words = [word for word in list_words if d.check(word)]
        # filter stopwords
        filtered_words = [w for w in list_words if w not in list_stopWords]
        text_cleaned.append(" ".join(filtered_words))
    # split into training and test sets; the test set is 20%
    # x_train, x_test, y_train, y_test = train_test_split(text, stars, test_size=0.2)
    x_train, x_test, y_train, y_test = train_test_split(text_cleaned, stars, test_size=0.2)
    # generate training and test data in the format fastText expects
    dump_file(x_train, y_train, "yelp_train.txt")
    dump_file(x_test, y_test, "yelp_test.txt")
    model = train_supervised(
        input="yelp_train.txt", epoch=20, lr=0.6, wordNgrams=2, verbose=2, minCount=1
    )

    def print_results(N, p, r):
        print("N\t" + str(N))
        print("P@{}\t{:.3f}".format(1, p))
        print("R@{}\t{:.3f}".format(1, r))

    print_results(*model.test("yelp_test.txt"))
def check(data):
    third = int(len(data) / 3)
    train_data = data[:2 * third]
    valid_data = data[third:]
    with tempfile.NamedTemporaryFile(delete=False) as tmpf, \
            tempfile.NamedTemporaryFile(delete=False) as tmpf2:
        for line in train_data:
            tmpf.write(("__label__" + line.strip() + "\n").encode("UTF-8"))
        tmpf.flush()
        for line in valid_data:
            tmpf2.write(("__label__" + line.strip() + "\n").encode("UTF-8"))
        tmpf2.flush()
        model = train_supervised(input=tmpf.name, **kwargs)
        true_labels = []
        all_words = []
        with open(tmpf2.name, 'r') as fid:
            for line in fid:
                if sys.version_info < (3, 0):
                    line = line.decode("UTF-8")
                if len(line.strip()) == 0:
                    continue
                words, labels = model.get_line(line.strip())
                if len(labels) == 0:
                    continue
                all_words.append(" ".join(words))
                true_labels += [labels]
        # per-example predictions should agree with model.test's aggregates
        predictions, _ = model.predict(all_words)
        p, r = util.test(predictions, true_labels)
        N = len(predictions)
        Nt, pt, rt = model.test(tmpf2.name)
        self.assertEqual(N, Nt)
        self.assertEqual(p, pt)
        self.assertEqual(r, rt)
def fit(self, X, y): """Fits the classifier Parameters ---------- X : array-like, shape = [n_samples, n_features] The training input samples. y : array-like, shape = [n_samples] The target values. An array of int. Returns ------- self : object Returns self. """ # Check that X and y have correct shape self._validate_x(X) y = self._validate_y(y) # Store the classes seen during fit self.classes_ = unique_labels(y) self.num_classes_ = len(self.classes_) self.class_labels_ = [ '__label__{}'.format(lbl) for lbl in self.classes_] # Dump training set to a fasttext-compatible file temp_trainset_fpath = temp_dataset_fpath() input_col = self._input_col(X) dump_xy_to_fasttext_format(input_col, y, temp_trainset_fpath) # train self.model = train_supervised( input=temp_trainset_fpath, **self.kwargs) # Return the classifier try: os.remove(temp_trainset_fpath) except FileNotFoundError: # pragma: no cover pass return self
def sup_test(self):
    def get_path_size(path):
        path_size = subprocess.check_output(["stat", "-c", "%s", path]).decode('utf-8')
        return int(path_size)

    def check(model, model_filename, test, lessthan, msg_prefix=""):
        N_local_out, p1_local_out, r1_local_out = model.test(test["data"])
        self.assertEqual(
            N_local_out, test["n"],
            msg_prefix + "N: Want: " + str(test["n"]) + " Is: " + str(N_local_out)
        )
        self.assertTrue(
            p1_local_out >= test["p1"],
            msg_prefix + "p1: Want: " + str(test["p1"]) + " Is: " + str(p1_local_out)
        )
        self.assertTrue(
            r1_local_out >= test["r1"],
            msg_prefix + "r1: Want: " + str(test["r1"]) + " Is: " + str(r1_local_out)
        )
        path_size = get_path_size(model_filename)
        size_msg = str(test["size"]) + " Is: " + str(path_size)
        if lessthan:
            self.assertTrue(path_size <= test["size"],
                            msg_prefix + "Size: Want at most: " + size_msg)
        else:
            self.assertTrue(path_size == test["size"],
                            msg_prefix + "Size: Want: " + size_msg)

    configuration["args"]["input"] = os.path.join(data_dir, configuration["args"]["input"])
    configuration["quant_args"]["input"] = configuration["args"]["input"]
    configuration["test"]["data"] = os.path.join(data_dir, configuration["test"]["data"])
    configuration["quant_test"]["data"] = configuration["test"]["data"]
    output = os.path.join(tempfile.mkdtemp(), configuration["dataset"])
    print()
    model = train_supervised(**configuration["args"])
    model.save_model(output + ".bin")
    check(model, output + ".bin", configuration["test"], False, msg_prefix="Supervised: ")
    print()
    model.quantize(**configuration["quant_args"])
    model.save_model(output + ".ftz")
    check(model, output + ".ftz", configuration["quant_test"], True, msg_prefix="Quantized: ")
import os

from fastText import train_supervised


def print_results(N, p, r):
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(1, p))
    print("R@{}\t{:.3f}".format(1, r))


if __name__ == "__main__":
    train_data = os.path.join(os.getenv("DATADIR", ''), 'cooking.train')
    valid_data = os.path.join(os.getenv("DATADIR", ''), 'cooking.valid')

    # train_supervised uses the same arguments and defaults as the fastText cli
    model = train_supervised(
        input=train_data, epoch=25, lr=1.0, wordNgrams=2, verbose=2, minCount=1
    )
    print_results(*model.test(valid_data))

    model = train_supervised(
        input=train_data, epoch=25, lr=1.0, wordNgrams=2, verbose=2, minCount=1,
        loss="hs"
    )
    print_results(*model.test(valid_data))
    model.save_model("cooking.bin")

    model.quantize(input=train_data, qnorm=True, retrain=True, cutoff=100000)
    print_results(*model.test(valid_data))
    model.save_model("cooking.ftz")
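# Once saved, either artifact can be loaded back and queried. A short sketch;
# the query sentence is made up for illustration:
from fastText import load_model

model = load_model("cooking.bin")
labels, probs = model.predict("Which baking dish is best for banana bread ?", k=3)
print(labels, probs)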
print("f1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) scores = cross_val_score(clf, x, y, cv = 5,scoring='accuracy') #print scores print("accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) if __name__ == '__main__': SogouTCE_kv=load_SogouTCE() #labels=load_url(SogouTCE_kv) x,y=load_selecteddata(SogouTCE_kv) stopwords=load_stopwords() #切割token x=[ [word for word in line.split() if word not in stopwords] for line in x] # 分割训练集和测试集 x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2) #按照fasttest的要求生成训练数据和测试数据 dump_file(x_train,y_train,"../data/sougou_train.txt") dump_file(x_test, y_test, "../data/sougou_test.txt") # train_supervised uses the same arguments and defaults as the fastText cli model = train_supervised( input="../data/sougou_train.txt", epoch=25, lr=0.9, wordNgrams=2, verbose=2, minCount=1 ) print_results(*model.test("../data/sougou_test.txt"))