import os

import fasttext
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


def main():
    """Train (or load) a fastText model to predict licenses, using automatic hyperparameter tuning."""
    os.chdir('../../../all_files_generated')
    current_dir = os.getcwd()

    text_files_dir = os.path.join(current_dir, 'text_files')
    model_pickles_dir = os.path.join(current_dir, 'model_pickles')
    model_confusion_matrix_dir = os.path.join(current_dir, 'model_confusion_matrix_files')

    training_validation_file_path = os.path.join(text_files_dir, 'train_validation.txt')
    test_file_path = os.path.join(text_files_dir, 'test.txt')
    model_path = os.path.join(model_pickles_dir, 'fasttext.pickle')
    confusion_matrix_path = os.path.join(model_confusion_matrix_dir, 'fast_text_confusion_matrix.png')

    try:
        license_classifier = fasttext.load_model(model_path)
        print('Model was loaded in successfully!')
    except ValueError:
        # load_model raises ValueError when the model file does not exist yet
        print('fastText model will begin training ...')
        license_classifier = fasttext.train_supervised(input=training_validation_file_path,
                                                       autotuneValidationFile=test_file_path,
                                                       autotuneDuration=60)
        print('fastText model finished training')
        print('Saving model ...')
        license_classifier.save_model(model_path)
        print('Saved!')

    print('Starting predictions ...')

    x_train = []
    y_train = []
    train_predictions = []
    with open(training_validation_file_path, 'r', encoding='utf-8') as train_file:
        for line in train_file:
            line_array = line.split('__label__')
            comment_block_text = line_array[0].strip()
            label = int(line_array[1])
            x_train.append(comment_block_text)
            y_train.append(label)
            # predict() returns (labels, probabilities); strip the 9-character '__label__' prefix
            train_predictions.append(int(license_classifier.predict(comment_block_text)[0][0][9:]))

    x_test = []
    y_test = []
    test_predictions = []
    with open(test_file_path, 'r', encoding='utf-8') as validation_file:
        for line in validation_file:
            line_array = line.split('__label__')
            comment_block_text = line_array[0].strip()
            label = int(line_array[1])
            x_test.append(comment_block_text)
            y_test.append(label)
            test_predictions.append(int(license_classifier.predict(comment_block_text)[0][0][9:]))

    print('Predictions complete!')

    # Training accuracy
    print("The training accuracy is: ")
    print(accuracy_score(y_train, train_predictions))

    # Test accuracy
    print("The test accuracy is: ")
    print(accuracy_score(y_test, test_predictions))

    # Classification report
    print("Classification report")
    print(classification_report(y_test, test_predictions))

    # Confusion matrix: rows are actual labels, columns are predicted labels
    conf_matrix = confusion_matrix(y_test, test_predictions)
    print(conf_matrix)
    plt.figure(figsize=(12.8, 6))
    sns.heatmap(conf_matrix, annot=True,
                xticklabels=['not_license', 'license'],
                yticklabels=['not_license', 'license'],
                cmap="Blues")
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title('Confusion matrix')
    plt.savefig(confusion_matrix_path)
    plt.show()
def calibrate(collection, issn_map, is_stratified, sample_data_file=None):
    if sample_data_file is None:
        sample_data = sample(collection, issn_map, is_stratified)
    else:
        sample_data = f"{PV_MOUNT}{sample_data_file}"
        download_object("sampling", sample_data_file, sample_data)
    data = pd.read_json(sample_data, orient="records", lines=True).to_dict(orient='records')
    # download_object("tmp", sample_data.split('/')[-1], sample_data)
    logger.debug("len data = " + str(len(data)))
    logger.debug("len issn_map = " + str(len(issn_map)))

    for elt in data:
        if '_id' in elt:
            del elt['_id']
        current_label_text = []
        # current_label_text_global = []
        for issn_type in ['issn_electronic', 'issn_print']:
            issn = elt[issn_type]
            if issn in issn_map:
                current_label_text += issn_map[issn]
        current_label_text = list(set(current_label_text))
        elt["labels_text"] = current_label_text

    data_with_label = [e for e in data if len(e['labels_text'])]
    data_train, data_test = train_test_split(data_with_label, test_size=85000, random_state=0)

    fields = ['title', 'abstract', 'keywords', 'mesh_headings', 'journal_title']
    for data_type in ["train", "test"]:
        logger.debug(data_type)
        outfile = {}
        for f in fields:
            outfile[f] = open(f"{PV_MOUNT}{collection}_{data_type}_{f}.txt", "w")
            outfile[f].close()
        for f in fields:
            outfile[f] = open(f"{PV_MOUNT}{collection}_{data_type}_{f}.txt", "a+")
            logger.debug(f)
            if data_type == "train":
                current_data = data_train
            else:
                current_data = data_test
            for ix, elt in enumerate(current_data):
                if ix % 100000 == 0:
                    logger.debug(ix)
                current_words = elt.get(f)
                if current_words is None:
                    continue
                if isinstance(current_words, list):
                    current_words = " ".join(current_words)
                if f == "abstract" and len(current_words.split(" ")) < 20:
                    continue
                elif f == "title" and len(current_words.split(" ")) < 10:
                    continue
                elif len(current_words.split(" ")) < 2:
                    continue
                elif len(current_words) < 5:
                    continue
                current_words = normalize(current_words)
                labels = [
                    "__label__" + label.replace(' ', '_')
                    for label in elt.get('labels_text', [])
                ]
                tags = " ".join(labels)
                newline = current_words + " " + tags + "\n"
                outfile[f].write(newline)
            outfile[f].close()

    for f in ['journal_title', 'title', 'abstract', 'keywords', 'mesh_headings']:
        logger.debug("training " + f)
        model = fasttext.train_supervised(
            f'{PV_MOUNT}{collection}_train_{f}.txt',
            wordNgrams=2,
            minCount=20,
            loss='ova',
            epoch=50)
        model_filename = f"{PV_MOUNT}{collection}_model_{f}_strat{is_stratified}.model"
        model.save_model(model_filename)
        upload_object("models", model_filename)
        test = model.test(f'{PV_MOUNT}{collection}_test_{f}.txt', k=-1, threshold=0.5)
        precision = test[1]
        recall = test[2]
        f1 = 2 * (recall * precision) / (recall + precision)
        logger.debug(f"precision: {precision}, recall: {recall}, f1: {f1}")
    if count < 10000:
        ftrain.write(outline)
        ftrain.flush()
        continue
    elif count < 20000:
        ftest.write(outline)
        ftest.flush()
        continue
    else:
        break

ftrain.close()
ftest.close()
print("---------dataset done--------")

# Train the model ('label' sets the label prefix used in the training file)
classifier = fasttext.train_supervised("news_fasttext_train.txt", label="_label_")
# Save the model
classifier.save_model("Model.bin")
# classifier = fasttext.load_model('Model.bin')  # if a trained model already exists, load it directly instead
print("---------train done----------")

# Evaluate and print the precision
result = classifier.test("news_fasttext_test.txt")
print('precision: ', result[1])

print("---------per-class statistics----------")
# The block below collects results for each class separately
labels_right = []
texts = []
with open("news_fasttext_test.txt", encoding="utf-8") as fr:
    for line in fr:
        line = str(line.encode("utf-8"), 'utf-8').rstrip()
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import fasttext
import time
import os

# Train the model
# path = "fasttext"
# os.chdir(path)

# Supervised training
start = time.perf_counter()  # time.clock() was removed in Python 3.8
model = fasttext.train_supervised(
    input="train.txt",
    label="__label__",
    lr=0.05,
    epoch=25,
    # wordNgrams=2,  # this parameter lowered the accuracy
    bucket=200000,
    dim=50,
    loss="softmax"
)
end = time.perf_counter()
print('Running time: %s Seconds' % (end - start))

model.save_model("model_news_fasttext.bin")

# Load the trained model
model = fasttext.load_model('model_news_fasttext.bin')
print('Training finished!')
import sys
import fasttext as ft
from pprint import pprint

'''
Parameter notes:
  dim:   dimensionality of the word vectors
  lr:    learning rate (closer to 1.0 learns faster but is less stable)
  epoch: number of training passes (default 5; too many causes overfitting)
'''

model = ft.train_supervised('test.txt', dim=200, lr=0.5, epoch=10, thread=16)
model.save_model("model_filename.bin")

# pprint(model.labels)
pprint(model.test_label("test.txt"))
pprint(model.test('test.txt'))
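# A minimal companion sketch (not part of the original snippet): test_label() returns a dict
# keyed by label with per-label 'precision', 'recall' and 'f1score'; this reads it back from
# the model and test.txt file used above.
per_label = model.test_label("test.txt")
for label, scores in per_label.items():
    print(label,
          round(scores['precision'], 3),
          round(scores['recall'], 3),
          round(scores['f1score'], 3))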
for train_idx, val_idx in kfold.split(X):
    train_X, train_y = X[train_idx], y[train_idx]
    val_X, val_y = X[val_idx], y[val_idx]
    print(f'DEBUG: train X {len(train_X)} tweets, train y {len(train_y)} tweets')
    print(f'DEBUG: validation X {len(val_X)} tweets, validation y {len(val_y)} tweets')

    fasttext_format(train_X, train_y, TRAIN_FORMATTED)

    print('Training model on train set...')
    model = ft.train_supervised(
        input=TRAIN_FORMATTED,
        epoch=PARAMS['epoch'],
        wordNgrams=PARAMS['word_ngrams'],  # the binding expects camelCase parameter names
        minCount=PARAMS['min_count'],
        ws=PARAMS['ws'],
        lr=PARAMS['lr'],
        loss=PARAMS['loss'],
        neg=PARAMS['neg'],
        dim=PARAMS['dim'])

    print('Computing predictions on validation set...')
    preds = model.predict(list(val_X))
    val_preds = convert_preds(preds)

    print('Computing accuracy...')
    acc = accuracy(val_preds, val_y)
    accuracies.append(acc)
    print(acc)

print(np.array(accuracies).mean())
)

test.to_csv(
    DIR + '/corpus_ft.test'
    , header=False
    , index=False
    , sep='\t'
)

# Train the Model
# Manual Tuning (ie trial and error)
model = fasttext.train_supervised(
    DIR + '/corpus_ft.train'
    , lr=1.0
    , epoch=25
    , wordNgrams=3
    , bucket=200000
    , dim=50
    , loss='ova'
)

# Test Manual Model
results = model.test(DIR + '/corpus_ft.test', k=4, threshold=0.7)
print(results)

# Train the Model with Auto-Tuning
#
# Auto-Tunes parameters (gets the best parameters for the above parameters like lr, wordNgrams, etc.)
# automodel = fasttext.train_supervised(
#     DIR + '/corpus_ft.train'
#     , autotuneValidationFile=DIR + '/corpus_ft.test'
#     , autotuneDuration=300  # Tune for 5 minutes
import fasttext model = fasttext.train_supervised(input="./oroscopo-data/oroscopo.train", epoch=20, dim=500, wordNgrams=2) model.save_model("model_oroscopo_big.bin") print(model.test("./oroscopo-data/oroscopo.valid"))
def run_on_file(self, input_filename, output_filename, user_id, project_id,
                label_id=None, pipeline=None, bootstrap_iterations=0,
                bootstrap_threshold=0.9, run_on_entire_dataset=False):
    input_filename = os.path.abspath(input_filename)
    output_filename = os.path.abspath(output_filename)
    output_folder = os.path.join(os.path.dirname(output_filename), 'results')
    os.makedirs(output_folder, exist_ok=True)
    print('Running text classification model on input file {}. Results will be saved to {}...'
          .format(input_filename, output_filename))

    print('Reading input file...')
    if input_filename[-8:] == '.parquet':
        df = pd.read_parquet(input_filename)
    else:
        df = pd.read_csv(input_filename, encoding='latin1')

    label_field = 'label_id'
    if 'label_id' in df.columns:
        df['label'] = df['label_id']
    elif 'label' not in df.columns:
        raise ValueError("no columns 'label' or 'label_id' exist in input file")

    df = df[~pd.isnull(df['text'])]
    df.loc[:, label_field] = df[label_field].apply(lambda x: str(x) if not pd.isnull(x) else x)
    df.loc[df[label_field] == ' ', label_field] = None

    if label_id:
        # binary setting: the requested label vs. a same-sized sample of the rest
        df_labeled = df[df[label_field] == label_id]
        df_labeled = pd.concat([
            df_labeled,
            df[df[label_field] != label_id].sample(df_labeled.shape[0])
        ])
        df_labeled.loc[df_labeled[label_field] != label_id, label_field] = 0
        df_labeled = df_labeled[(~pd.isnull(df_labeled[label_field]))
                                & (df_labeled[label_field] != ' ')]
    else:
        df_labeled = df[(~pd.isnull(df[label_field]))]

    print('Pre-processing text and extracting features...')
    self.set_preprocessor(pipeline)
    X = self.pre_process(df_labeled, fit=True)
    if label_field not in df_labeled.columns:
        raise RuntimeError("column '{}' not found".format(label_field))
    else:
        y = df_labeled[label_field].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    print('Training the model...')
    self.fit(X_train, y_train)

    print('Performance on train set:')
    _, evaluation_text = self.evaluate(X_train, y_train)
    result = 'Performance on train set: \n' + evaluation_text

    print('Performance on test set:')
    _, evaluation_text = self.evaluate(X_test, y_test)
    result = result + '\nPerformance on test set: \n' + evaluation_text

    df_gold_labels = df[df['user_id'] == 'gold_label']
    y_gold_labels = df_gold_labels[label_field].values
    if len(y_gold_labels) > 0:
        X_gold_labels = self.pre_process(df_gold_labels, fit=False)
        print('Performance on gold labels set:')
        _, evaluation_text = self.evaluate(X_gold_labels, y_gold_labels)
        result = result + '\nPerformance on gold labels set: \n' + evaluation_text
    else:
        print('Gold labels do not exist - skipping the evaluation of model performance on them.')

    if run_on_entire_dataset:
        print('Running the model on the entire dataset...')
        columns = ['document_id', label_field, 'user_id', 'prob']
        if bootstrap_iterations > 0:
            print('Bootstrapping...')
        y_aug = df[label_field].copy()
        for i in range(bootstrap_iterations + 1):
            # fitting on labeled examples
            has_label = ~pd.isna(y_aug)
            X_labeled = self.pre_process(df.loc[has_label], fit=False)
            self.fit(X_labeled, y_aug[has_label])

            # predict in chunks and (optionally) add bootstrapped labels
            chunk_size = 10000
            n_samples = df.shape[0]
            for chunk_start in tqdm(range(0, n_samples, chunk_size)):
                chunk_end = min(n_samples, chunk_start + chunk_size)
                chunk_df = df.iloc[chunk_start:chunk_end].copy()
                chunk_df.loc[:, label_field] = None
                y_chunk = df.iloc[chunk_start:chunk_end][label_field]
                X_chunk = self.pre_process(chunk_df, fit=False)
                if i < bootstrap_iterations:
                    print('bootstrap iteration ', i, '/', bootstrap_iterations, ' ',
                          [x for x in zip(np.unique(y_aug[has_label], return_counts=True))])
                    # no need to re-fit the model, only predict
                    y_chunk_aug = self.bootstrap(X_chunk,
                                                 y=y_chunk,
                                                 th=bootstrap_threshold,
                                                 fit=False)
                    y_aug.iloc[chunk_start:chunk_end] = y_chunk_aug
                # write to file only in last iteration
                if i == bootstrap_iterations:
                    chunk_prediction_df = self.get_prediction_df(X_chunk, y=y_chunk)
                    chunk_prediction_df['document_id'] = df['document_id']
                    chunk_prediction_df['user_id'] = user_id
                    chunk_prediction_df = chunk_prediction_df.rename({'confidence': 'prob'}, axis=1)
                    chunk_prediction_df[label_field] = chunk_prediction_df['prediction']
                    chunk_prediction_df[columns].to_csv(output_filename, index=False, header=True)
        # output_df = pd.DataFrame(columns=columns)
        # output_df.to_csv(output_filename, index=False, header=True, index_label=False)

    print('Saving model weights to file...')
    class_weights = self.important_features
    class_weights_filename = os.path.join(
        output_folder,
        'ml_logistic_regression_weights_{project_id}.csv'.format(project_id=project_id))
    class_weights.to_csv(class_weights_filename, header=True, index=False)

    print('Saving model to a pickle file...')
    model_save_filename = os.path.join(
        output_folder, 'ml_model_{project_id}.pickle'.format(project_id=project_id))
    self.save(model_save_filename)

    print('Saving model results to a text file...')
    ml_model_results_filename = os.path.join(
        output_folder, 'ml_model_results_{}.txt'.format(project_id))
    with open(ml_model_results_filename, 'wt') as f:
        f.write(result)

    y_test_pred = self.predict(X_test)
    y_test_pred_proba = self.predict_proba(X_test)

    # # Showing examples of large errors
    # df_labeled.loc[:, 'y_pred'] = self.predict(X)
    # df_labeled.loc[:, 'is_error'] = df_labeled['y_pred']!=df_labeled[label_field]
    # df_labeled.loc[:, 'y_pred_proba'] = np.max(self.predict_proba(X), axis=1)
    # df_labeled.to_csv(output_filename, index=False, header=True, index_label=False)

    # Confusion matrix
    print('Generating confusion matrix...')
    from src.utils.analyze_model import plot_confusion_matrix
    fig = plot_confusion_matrix(y_test,
                                y_test_pred,
                                classes=None,
                                normalize=True,
                                title='Normalized confusion matrix - test')
    filename = os.path.join(output_folder, 'confusion_matrix_test_{}.png'.format(project_id))
    fig.savefig(filename)
    plt.clf()
    fig = plot_confusion_matrix(y_train,
                                self.predict(X_train),
                                classes=None,
                                normalize=True,
                                title='Normalized confusion matrix - train')
    filename = os.path.join(output_folder, 'confusion_matrix_train_{}.png'.format(project_id))
    fig.savefig(filename)
    plt.clf()

    # Precision-recall curve
    print('Generating the Precision-Recall graph...')
    try:
        fig = plot_precision_recall_curve(y_test_pred_proba, y_test)
        filename = os.path.join(output_folder, 'precision_recall_curve_{}.png'.format(project_id))
        fig.savefig(filename)
        plt.clf()
    except ValueError as e:
        print(e)

    # ROC curve
    print('Generating ROC curve...')
    try:
        fig = plot_roc_curve(y_test_pred_proba, y_test)
        filename = os.path.join(output_folder, 'roc_curve_{}.png'.format(project_id))
        fig.savefig(filename)
        plt.clf()
    except ValueError as e:
        print(e)

    # Confidence-accuracy graph
    print('Generating the Confidence-Accuracy graph...')
    try:
        fig = plot_confidence_performance(y_test_pred, y_test_pred_proba, y_test)
        filename = os.path.join(output_folder, 'confidence_accuracy_graph_{}.png'.format(project_id))
        fig.savefig(filename)
        plt.clf()
    except ValueError as e:
        print(e)

    # Confidence distribution
    print('Computing distribution of confidence...')
    try:
        ax = pd.Series(np.max(y_test_pred_proba, axis=1)).hist(bins=50)
        plt.xlabel('Confidence')
        plt.ylabel('Counts')
        filename = os.path.join(output_folder, 'confidence_distribution_{}.png'.format(project_id))
        plt.gcf().savefig(filename)
        plt.clf()
    except ValueError as e:
        print(e)

    # Learning curve
    print('Generating the learning curve...')
    from src.utils.analyze_model import plot_learning_curve_cv
    fig = plot_learning_curve_cv(X, y, estimator=self._model)
    filename = os.path.join(output_folder, 'learning_curve_{}.png'.format(project_id))
    fig.savefig(filename)
    plt.clf()

    # Run FastText for text classification
    df_labeled_train = df_labeled.loc[X_train.index, :]
    df_labeled_test = df_labeled.loc[X_test.index, :]
    if RUN_FASTTEXT:
        try:
            print('Running FastText model...')
            import fasttext

            def write_as_fasttext_format(df, filename):
                with open(filename, 'wt', encoding='utf-8') as f:
                    _ = [
                        f.write('{} __label__{}\n'.format(
                            r['text'].lower().replace('\n', ' '),
                            r['label_id'].replace(' ', '_')))
                        for i, r in df.iterrows()
                    ]

            write_as_fasttext_format(df_labeled_train, output_folder + '/fasttext_train.txt')
            write_as_fasttext_format(df_labeled_test, output_folder + '/fasttext_test.txt')
            # train_supervised takes the training file as its only positional argument
            classifier = fasttext.train_supervised(output_folder + '/fasttext_train.txt')
            fasttext_result = classifier.test(output_folder + '/fasttext_test.txt')
            fasttext_pred = classifier.predict([
                r['text'].lower().replace('\n', ' ')
                for i, r in df_labeled_test.iterrows()
            ])
            # predict() on a list returns (labels, probabilities); keep the top label per
            # example and strip the '__label__' prefix before evaluating
            fasttext_pred = [labels[0].replace('__label__', '') for labels in fasttext_pred[0]]
            _, evaluation_text = self.evaluate(
                X=None,
                y=df_labeled_test['label_id'].str.replace(' ', '_').values,
                y_pred=fasttext_pred)
            result += '\nFastText performance on test set: \n' + evaluation_text
        except Exception as e:
            print(e)

    print('Done running the model!')
    return result
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import os
from fasttext import train_supervised


def print_results(N, p, r):
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(1, p))
    print("R@{}\t{:.3f}".format(1, r))


if __name__ == "__main__":
    train_data = os.path.join(os.getenv("DATADIR", ''), '../input_data/icon.txt')
    # valid_data = os.path.join(os.getenv("DATADIR", ''), 'cooking.valid')

    # train_supervised uses the same arguments and defaults as the fastText cli
    model = train_supervised(input=train_data, epoch=25, lr=1.0, wordNgrams=2, verbose=2, minCount=1)
    # print_results(*model.test(valid_data))
    model.save_model("../model/icon_1307.bin")
                    help='Predict the class of a sentence')
parser.add_argument('-v', action='store', dest='validation',
                    help='Validate the model')
parser.add_argument('-k', action='store', dest='k_value',
                    help='Number of labels to return for a prediction')

results = parser.parse_args()

if results.do_training:
    # model = fasttext.train_supervised(input="stkhelp.train", autotuneValidationFile='stkhelp.test', autotuneDuration=3600)
    model = fasttext.train_supervised(input="stkhelp.train",
                                      wordNgrams=3,
                                      autotuneValidationFile='stkhelp.test',
                                      autotuneDuration=3600)
    model.save_model("model_stkhelp.bin")
elif results.sentence is not None:
    # model = fasttext.load_model("model_amazon_q.bin")
    model = fasttext.load_model("model_stkhelp_q.bin")
    text = clean_text(results.sentence)
    label = model.predict(text, k=3)
    str_label = str(label)
    # print(str_label)
    # jsonstr = json.dumps(label)
    col1 = list(label[0])
    col2 = label[1].tolist()
    print(col1)
    print(col2)
    pairs = zip(col1, col2)
def predict_results(model, sentence):
    res = model.predict(preprocessing(sentence))
    return res[0][0]


if __name__ == "__main__":
    current_dir = os.getcwd()
    data_path = os.path.join(current_dir, "data")
    train_data = "../data_preprocessed/train.txt"
    valid_data = "../data_preprocessed/test.txt"

    model = train_supervised(
        input=train_data,
        epoch=150,
        lr=0.05,
        wordNgrams=2,
        verbose=2,
        loss="softmax",
        label="__lb__",
    )
    print_results(*model.test(valid_data))

    summaries, details = test(model, valid_data)
    print(summaries)
    print(details)

    model.save_model("model/ft.li.1701.bin")

    # model = load_model("model/ft.li.1701.bin")
    # with open(valid_data, "r") as f:
    #     lines = f.read().split("\n")
    # lines = [str(model.predict((line))[0]).replace(
    #     "('", "").replace("',)", "") + " " + line for line in lines]
    # with open("test4.txt", "w") as wf:
def grid_search(train_fn, val_fn, learning_rates, minCounts, epochs, ws, wvs, ndims):
    best_lr = None
    best_ndim = None
    best_minCount = None
    best_epochs = None
    best_ws = None
    best_wv = None
    highest_f1 = float("-inf")

    label_counts_val = {}
    with open(val_fn) as fin:
        for line in fin:
            lbls = [l for l in line.strip().split(" ") if l.startswith('__label__')]
            for lbl in lbls:
                label_counts_val[lbl] = label_counts_val.get(lbl, 0) + 1

    label_counts_train = {}
    with open(train_fn) as fin:
        for line in fin:
            lbls = [l for l in line.strip().split(" ") if l.startswith('__label__')]
            for lbl in lbls:
                label_counts_train[lbl] = label_counts_train.get(lbl, 0) + 1

    grid_search_results = []
    for lr in learning_rates:
        for minCount in minCounts:
            for epoch in epochs:
                for w in ws:
                    for i in range(0, len(wvs)):
                        wv = wvs[i]
                        ndim = ndims[i]
                        print("Building fasttext model: {0} lr; {1} dim; {2} min count; {3} epochs. {4} ws. wv: {5}."
                              .format(lr, ndim, minCount, epoch, w, wv))
                        # train model
                        model = fasttext.train_supervised(input=train_fn,
                                                          minCount=minCount,
                                                          wordNgrams=WORDNGRAMS,
                                                          pretrainedVectors=wv,
                                                          lr=lr,
                                                          epoch=epoch,
                                                          dim=ndim,
                                                          ws=w,
                                                          minn=MINN,
                                                          maxn=MAXN,
                                                          thread=MAXTHREADS,
                                                          loss=LOSS,
                                                          verbose=VERBOSITY)
                        # val
                        results_by_lbl = model.test_label(val_fn, threshold=0.5, k=-1)
                        f1_scores, support = zip(*[(res['f1score'], label_counts_val[lbl])
                                                   for lbl, res in results_by_lbl.items()
                                                   if lbl in label_counts_val])
                        macro_f1 = np.average(f1_scores)
                        micro_f1 = np.average(f1_scores, weights=support)
                        f1_avg = np.average([micro_f1, macro_f1])
                        if f1_avg > highest_f1:
                            best_lr = lr
                            best_ndim = ndim
                            best_minCount = minCount
                            best_epochs = epoch
                            best_ws = w
                            best_wv = wv
                            highest_f1 = f1_avg
                        # train (check overfitting)
                        results_by_lbl = model.test_label(train_fn, threshold=0.5, k=-1)
                        f1_scores, support = zip(*[(res['f1score'], label_counts_train[lbl])
                                                   for lbl, res in results_by_lbl.items()
                                                   if lbl in label_counts_train])
                        tr_macro_f1 = np.average(f1_scores)
                        tr_micro_f1 = np.average(f1_scores, weights=support)
                        print("{0:.3f} micro f1. {1:.3f} macro f1. {2:.3f} train micro f1. {3:.3f} train macro f1"
                              .format(micro_f1, macro_f1, tr_micro_f1, tr_macro_f1))
                        grid_search_results.append({
                            'lr': lr,
                            'ndim': ndim,
                            'minCount': minCount,
                            'epoch': epoch,
                            'ws': w,
                            'val_micro_f1': micro_f1,
                            'val_macro_f1': macro_f1,
                            'tra_micro_f1': tr_micro_f1,
                            'tra_macro_f1': tr_macro_f1,
                            'wv': wv
                        })

    print("\n==== Grid Search Results====\n")
    print(pd.DataFrame(grid_search_results)[[
        'lr', 'ndim', 'minCount', 'epoch', 'ws',
        'val_micro_f1', 'tra_micro_f1', 'val_macro_f1', 'tra_macro_f1', 'wv'
    ]])
    print("\nBest: {0} lr; {1} dim; {2} min count; {3} epochs; {4} ws; {5} wv\n"
          .format(best_lr, best_ndim, best_minCount, best_epochs, best_ws, best_wv))
    return best_lr, best_ndim, best_minCount, best_epochs, best_ws, best_wv
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--training_data",
        default="/home/isaacj/fastText/drafttopic/wikitext/enwiki.balanced_article_sample.w_article_text_50413_train_data.txt"
    )
    parser.add_argument(
        "--val_data",
        default="/home/isaacj/fastText/drafttopic/wikitext/enwiki.balanced_article_sample.w_article_text_6301_val_data.txt"
    )
    parser.add_argument(
        "--test_data",
        default="/home/isaacj/fastText/drafttopic/wikitext/enwiki.balanced_article_sample.w_article_text_6303_test_data.txt"
    )
    parser.add_argument("--false_negatives_fn")
    parser.add_argument("--output_model")
    parser.add_argument(
        "--word_vectors",
        nargs="+",
        default=['/home/isaacj/fastText/drafttopic/wvs/enwiki.vectors.20191201.skipgram_50.300k.vec'],
        type=str)
    parser.add_argument("--learning_rates", nargs="+", default=[0.1], type=float)
    parser.add_argument("--minCounts", nargs="+", default=[3], type=int)
    parser.add_argument("--epochs", nargs="+", default=[25], type=int)
    parser.add_argument("--ws", nargs="+", default=[20], type=int)
    parser.add_argument("--ndims", nargs="+", default=[50], type=int)
    args = parser.parse_args()

    # run a grid search only when more than one hyperparameter combination was supplied
    if args.val_data and len(args.learning_rates + args.minCounts + args.epochs +
                             args.ws + args.ndims) > 5:
        lr, ndim, minCount, epochs, ws, wv = grid_search(
            args.training_data, args.val_data, args.learning_rates,
            args.minCounts, args.epochs, args.ws, args.word_vectors, args.ndims)
    else:
        lr = args.learning_rates[0]
        minCount = args.minCounts[0]
        epochs = args.epochs[0]
        ws = args.ws[0]
        wv = args.word_vectors[0]
        ndim = args.ndims[0]

    print("Building fasttext model: {0} lr; {1} min count; {2} epochs; {3} ws; wv: {4}"
          .format(lr, minCount, epochs, ws, wv))
    model = fasttext.train_supervised(input=args.training_data,
                                      minCount=minCount,
                                      wordNgrams=WORDNGRAMS,
                                      lr=lr,
                                      epoch=epochs,
                                      pretrainedVectors=wv,
                                      ws=ws,
                                      dim=ndim,
                                      minn=MINN,
                                      maxn=MAXN,
                                      thread=MAXTHREADS,
                                      loss=LOSS,
                                      verbose=VERBOSITY)

    if args.output_model:
        print("Dumping fasttext model to {0}".format(args.output_model))
        model.save_model(args.output_model)

    if args.test_data:
        # build statistics dataframe for printing
        print("==== test statistics ====")
        lbl_statistics = {}
        toplevel_statistics = {}
        threshold = 0.5
        all_lbls = model.get_labels()
        for lbl in all_lbls:
            lbl_statistics[lbl] = {'n': 0, 'FP': 0, 'TP': 0, 'FN': 0, 'TN': 0, 'true': [], 'pred': []}
            toplevel_statistics[ft_to_toplevel(lbl)] = {'n': 0, 'FP': 0, 'TP': 0, 'FN': 0, 'TN': 0}

        with open(args.test_data, 'r') as fin:
            for line_no, datapoint in enumerate(fin):
                _, topics = model.get_line(datapoint.strip())
                prediction = model.predict(datapoint.strip(), k=-1)
                predicted_labels = []
                for idx in range(len(prediction[0])):
                    prob = prediction[1][idx]
                    lbl = prediction[0][idx]
                    lbl_statistics[lbl]['true'].append(int(lbl in topics))
                    lbl_statistics[lbl]['pred'].append(prob)
                    if prob > threshold:
                        predicted_labels.append(lbl)
                for lbl in all_lbls:
                    if lbl in topics and lbl in predicted_labels:
                        lbl_statistics[lbl]['n'] += 1
                        lbl_statistics[lbl]['TP'] += 1
                    elif lbl in topics:
                        lbl_statistics[lbl]['n'] += 1
                        lbl_statistics[lbl]['FN'] += 1
                    elif lbl in predicted_labels:
                        lbl_statistics[lbl]['FP'] += 1
                    else:
                        lbl_statistics[lbl]['TN'] += 1
                toplevel_topics = [ft_to_toplevel(l) for l in topics]
                toplevel_predictions = [ft_to_toplevel(l) for l in predicted_labels]
                for lbl in toplevel_statistics:
                    if lbl in toplevel_topics and lbl in toplevel_predictions:
                        toplevel_statistics[lbl]['n'] += 1
                        toplevel_statistics[lbl]['TP'] += 1
                    elif lbl in toplevel_topics:
                        toplevel_statistics[lbl]['n'] += 1
                        toplevel_statistics[lbl]['FN'] += 1
                    elif lbl in toplevel_predictions:
                        toplevel_statistics[lbl]['FP'] += 1
                    else:
                        toplevel_statistics[lbl]['TN'] += 1

        for lbl in all_lbls:
            s = lbl_statistics[lbl]
            # note: auc over (fpr, tpr) is the ROC AUC, despite the 'pr-auc' key name
            fpr, tpr, _ = roc_curve(s['true'], s['pred'])
            s['pr-auc'] = auc(fpr, tpr)
            s['avg_pre'] = average_precision_score(s['true'], s['pred'])
            try:
                s['precision'] = s['TP'] / (s['TP'] + s['FP'])
            except ZeroDivisionError:
                s['precision'] = 0
            try:
                s['recall'] = s['TP'] / (s['TP'] + s['FN'])
            except ZeroDivisionError:
                s['recall'] = 0
            try:
                s['f1'] = 2 * (s['precision'] * s['recall']) / (s['precision'] + s['recall'])
            except ZeroDivisionError:
                s['f1'] = 0

        for lbl in toplevel_statistics:
            s = toplevel_statistics[lbl]
            try:
                s['precision'] = s['TP'] / (s['TP'] + s['FP'])
            except ZeroDivisionError:
                s['precision'] = 0
            try:
                s['recall'] = s['TP'] / (s['TP'] + s['FN'])
            except ZeroDivisionError:
                s['recall'] = 0
            try:
                s['f1'] = 2 * (s['precision'] * s['recall']) / (s['precision'] + s['recall'])
            except ZeroDivisionError:
                s['f1'] = 0

        print("\n=== Mid Level Categories ===")
        mlc_statistics = pd.DataFrame(lbl_statistics).T
        mlc_statistics['mid-level-category'] = [
            s.replace('__label__', '').replace('_', ' ') for s in mlc_statistics.index
        ]
        mlc_statistics.set_index('mid-level-category', inplace=True)
        mlc_statistics[''] = '-->'
        mlc_statistics = mlc_statistics[[
            'n', '', 'TP', 'FP', 'TN', 'FN', 'precision', 'recall', 'f1', 'pr-auc', 'avg_pre'
        ]]
        with pd.option_context('display.max_rows', None):
            print(mlc_statistics)
        print("\nPrecision: {0:.3f} micro; {1:.3f} macro".format(
            np.average(mlc_statistics['precision'], weights=mlc_statistics['n']),
            np.mean(mlc_statistics['precision'])))
        print("Recall: {0:.3f} micro; {1:.3f} macro".format(
            np.average(mlc_statistics['recall'], weights=mlc_statistics['n']),
            np.mean(mlc_statistics['recall'])))
        print("F1: {0:.3f} micro; {1:.3f} macro".format(
            np.average(mlc_statistics['f1'], weights=mlc_statistics['n']),
            np.mean(mlc_statistics['f1'])))
        print("PR-AUC: {0:.3f} micro; {1:.3f} macro".format(
            np.average(mlc_statistics['pr-auc'], weights=mlc_statistics['n']),
            np.mean(mlc_statistics['pr-auc'])))
        print("Avg pre.: {0:.3f} micro; {1:.3f} macro".format(
            np.average(mlc_statistics['avg_pre'], weights=mlc_statistics['n']),
            np.mean(mlc_statistics['avg_pre'])))

        print("\n=== Top Level Categories ===")
        tlc_statistics = pd.DataFrame(toplevel_statistics).T
        tlc_statistics.index.name = 'top-level-category'
        tlc_statistics[''] = '-->'
        tlc_statistics = tlc_statistics[[
            'n', '', 'TP', 'FP', 'TN', 'FN', 'precision', 'recall', 'f1'
        ]]
        print(tlc_statistics)
        print("\nPrecision: {0:.3f} micro; {1:.3f} macro".format(
            np.average(tlc_statistics['precision'], weights=tlc_statistics['n']),
            np.mean(tlc_statistics['precision'])))
        print("Recall: {0:.3f} micro; {1:.3f} macro".format(
            np.average(tlc_statistics['recall'], weights=tlc_statistics['n']),
            np.mean(tlc_statistics['recall'])))
        print("F1: {0:.3f} micro; {1:.3f} macro".format(
            np.average(tlc_statistics['f1'], weights=tlc_statistics['n']),
            np.mean(tlc_statistics['f1'])))

    if args.false_negatives_fn:
        num_examples_per_label = 10
        false_negatives = {}
        for lbl in all_lbls:
            false_negatives[lbl] = []
        with open(args.test_data, 'r') as fin_data:
            with open(args.test_data.replace('data.txt', 'meta.txt'), 'r') as fin_metadata:
                for line_no, datapoint in enumerate(fin_data):
                    claims, topics = model.get_line(datapoint.strip())
                    metadata = next(fin_metadata)
                    prediction = model.predict(datapoint.strip(), k=-1)
                    predicted_labels = [
                        l for idx, l in enumerate(prediction[0])
                        if prediction[1][idx] > threshold
                    ]
                    for lbl in topics:
                        if lbl not in predicted_labels:
                            false_negatives[lbl].append('{0}\t{1}'.format(lbl, metadata))
        with open(args.false_negatives_fn, 'w') as fout:
            for lbl in false_negatives:
                num_examples = min(len(false_negatives[lbl]), num_examples_per_label)
                random_examples = np.random.choice(false_negatives[lbl], num_examples, replace=False)
                for ex in random_examples:
                    fout.write(ex)
def build_model():
    start = time.time()
    model = fasttext.train_supervised('train.txt')
    print("{0:-^30}".format("model training"))
    print("elapse time: %.3fs" % (time.time() - start))
    model.save_model("fasttext_model.bin")
        test += line
        count += 1

    dosya = open('train1.txt', 'w', encoding="utf-8")
    dosya.write(train)
    dosya.close()
    dosya = open('test1.txt', 'w', encoding="utf-8")
    dosya.write(test)
    dosya.close()
    print(k)
    print(s)

ayir('ortaknew.csv', 20)

model = fasttext.train_supervised(input='train1.txt', epoch=25, lr=0.1, wordNgrams=2, loss='hs', dim=100)
model.predict("çok iyi", k=3)

from mlxtend.plotting import plot_confusion_matrix

rounded_pred = model.predict(s, k=1)
print(rounded_pred[0][1])
print(confusion_matrix(k, rounded_pred[0]))
print(plot_confusion_matrix(conf_mat=confusion_matrix(k, rounded_pred[0])))

total_acc = 0
for i in range(len(rounded_pred[0])):
    if rounded_pred[0][i][0] == true_labels[i]:
        total_acc += 1
train = pd.read_csv(data_path + "data.txt", header=0, sep='\r\n', engine='python')
ts = train.shape
df = pd.DataFrame(train)
new_train = df.reindex(np.random.permutation(df.index))

# Split into two files with a 90:10 ratio
indice_90_percent = int((ts[0] / 100.0) * 90)
new_train[indice_90_percent:].to_csv(data_path + 'test.txt', index=False)
new_train[:indice_90_percent].to_csv(data_path + 'train.txt', index=False)

# Start training
model = fasttext.train_supervised(input=data_path + "train.txt", epoch=20, lr=1.0,
                                  wordNgrams=2, bucket=200000, dim=50, loss='hs')

# Save the trained model
model.save_model(data_path + "model.bin")

# Compress (quantize) the model
model.quantize(input=data_path + 'model.bin', retrain=False)
model.save_model(data_path + "model.ftz")

# Test a single example
print(model.predict("保暖 内衣", k=3))

# Test set
import fasttext print("Training model ...") model = fasttext.train_supervised(input="cooking.train", epoch=25, lr=1.0) print("Saving model ...") model.save_model("model_cooking.bin") print("Validating model ...") result = model.test("cooking.valid") print(result) print("Predicting model ...") result = model.predict("Which baking dish is best to bake a banana bread ?") print(result)
import fasttext
import os

if __name__ == "__main__":
    hyper_params = {
        "lr": 0.35,         # Learning rate
        "epoch": 100,       # Number of training epochs to train for
        "wordNgrams": 3,    # Number of word n-grams to consider during training
        "dim": 155,         # Size of word vectors
        "ws": 5,            # Size of the context window for CBOW or skip-gram
        "minn": 2,          # Min length of char ngram
        "maxn": 5,          # Max length of char ngram
        "bucket": 2014846,  # Number of buckets
    }

    training_data_path = 'sst_train.txt'

    # Train the FastText model
    model = fasttext.train_supervised(input=training_data_path, **hyper_params)
    print("FastText model trained with the hyperparameters: \n {}".format(hyper_params))
    model.save_model(os.path.join('C:/Users/mehra/OneDrive/Documents/GitHub/73StringsAssignment', "sst.bin"))

    # Quantize model to reduce space usage
    model.quantize(input=training_data_path, qnorm=True, retrain=True, cutoff=110539)
    model.save_model(os.path.join('C:/Users/mehra/OneDrive/Documents/GitHub/73StringsAssignment', "sst_quantized.ftz"))
def build_classify_model():
    model = fasttext.train_supervised(config.classify_corpus_path, epoch=20, wordNgrams=2, minCount=1)
    model.save_model(config.classify_model_path)
from gensim.models import FastText, LdaMulticore
import gensim
import re
import pymorphy2
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import fasttext

# Get a morphological analyzer instance (10-20 MB)
morph = pymorphy2.MorphAnalyzer()

corpus_file = datapath('lee_background.cor')

model = fasttext.train_supervised('train-comedies-horrors.txt')
model.save_model("comedies-horrors-model.bin")

# Example query: "девушка заброшенный дом призрак" ("girl abandoned house ghost")
result = model.predict('девушка заброшенный дом призрак')
print('test')
import fasttext
import os
import json

model = fasttext.train_supervised(
    input="data/flair_data2/train/combined.csv",
    epoch=25,
    lr=0.5,
    wordNgrams=2,
    bucket=200000,
    dim=50,
    loss="ova",
)
model.save_model("fastText_models/fastText_combined.bin")
# model = fasttext.load_model('fastText_models/fastText_combined.bin')

scores = dict()
input_folder_path = "data/flair_data2/dev/"
for filename in os.listdir(input_folder_path):
    if filename.endswith(".csv"):
        # per-label precision/recall/F1 plus the overall (micro-averaged) test result
        score = model.test_label(os.path.join(input_folder_path, filename))
        score["micro-averaging"] = model.test(os.path.join(input_folder_path, filename))
        scores[filename] = score

with open("fastText_clf_outputs/prfs1.txt", "w") as jsonfile:
    json.dump(scores, jsonfile, indent=2)
FOLDER = "fasttext_tool/" def saveInfoToFile(row, output): output.write("__label__{} {}\n".format(row['polarity'], str(row['text']))) return "" def adjustForm(dataSet, fileName): print("Transforming...") with open('{}{}'.format(FOLDER, fileName), 'w+') as output: dataSet.apply(lambda x: saveInfoToFile(x, output), axis=1) if __name__ == "__main__": dataReader = DataReader() evaluator = Evaluator() if not "data.train" in os.listdir(FOLDER): dataSet = dataReader.read_data_set() adjustForm(dataSet, "data.train") if not "data.test" in os.listdir(FOLDER): testSet = dataReader.read_test_set() adjustForm(testSet, "data.test") if not "model.bin" in os.listdir(FOLDER): model = ft.train_supervised(input=FOLDER + "data.train") model.save_model(FOLDER + "model.bin") else: model = ft.load_model(FOLDER + "model.bin") (_, precision, recall) = model.test(FOLDER + "data.test") metrics = {'precision': precision, 'recall': recall, 'fscore': evaluator.calculate_fscore(precision, recall)} metrics_str = evaluator.getString(metrics) with open(FOLDER + "results.txt", 'w') as output: output.write(metrics_str) print(metrics_str)
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# word-segment the questions with jieba
data_all['Question'] = data_all['Question Sentence'].apply(lambda x: " ".join(jieba.cut(x)))

import fasttext
from sklearn.metrics import f1_score

# Category A
data_df = data_all[['Question', 'category_A']].head(5000)
data_df['label_ft'] = '__label__' + data_train['category_A'].head(5000).astype(str)
data_df[['Question', 'label_ft']].to_csv('train.csv', index=None, header=None, sep='\t')

model = fasttext.train_supervised('train.csv', lr=0.05, wordNgrams=2, verbose=2,
                                  minCount=1, epoch=500, loss="hs")
val_pred_A = [model.predict(x)[0][0].split('__')[-1] for x in data_all['Question'][5000:]]
sub['category_A'] = val_pred_A
print(sub['category_A'].value_counts() / 3000)
print(data_train['category_A'].value_counts() / 5000)

# Category B
data_df = data_all[['Question', 'category_B']].head(5000)
data_df['label_ft'] = '__label__' + data_train['category_B'].head(5000).astype(str)
data_df[['Question', 'label_ft']].to_csv('train.csv', index=None, header=None, sep='\t')

# loss function options: {ns, hs, softmax, ova}
import fasttext
import pandas as pd

# data path
path = ".\\segdata\\segData.csv"
# data = pd.read_csv(path, encoding='UTF-8')
# data.iloc[0:int(len(data)*0.8)].to_csv('.\\segdata\\train.txt', header=None, index=None, encoding='utf-8-sig', mode='w')
# data.iloc[int(len(data)*0.8):int(len(data)*0.9)].to_csv('.\\segdata\\test.txt', header=None, index=None, encoding='utf-8-sig', mode='w')
# train_data = ".\\segdata\\train.txt"
# test_data = ".\\segdata\\test.txt"

# train
model = fasttext.train_supervised(path)
# model = fasttext.train_unsupervised(path, model='cbow')
# model = fasttext.train_unsupervised(path, model='skipgram')

# save model
model.save_model(".\\model\\model_news.bin")
    index = np.argmax(pred[1])
    label = int(pred[0][index][-1])
    return label


def get_proba(pred):
    pred_dic = {}
    pred_dic[pred[0][0]] = pred[1][0]
    pred_dic[pred[0][1]] = pred[1][1]
    return pred_dic['__label__1']


print('------------------ start training the model --------------------')
model = fasttext.train_supervised(input="d:/train_semantic.txt", lr=0.1, epoch=100,
                                  wordNgrams=3, dim=300)
print('------------------ training finished --------------------')

test_pred = []
for i in range(len(test_data)):
    r = model.predict(" ".join(test_data[i]), k=2)
    test_pred.append(get_label(r))

# sklearn metrics expect (y_true, y_pred)
acc = accuracy_score(test_label, test_pred)
precision = precision_score(test_label, test_pred)
recall = recall_score(test_label, test_pred)
f1 = f1_score(test_label, test_pred)
print("accuracy: " + str(acc) + "\n")
def train():
    classifier = fasttext.train_supervised(input="data_for_fasttext_train.txt", epoch=20)
    classifier.save_model('fasttext_classifier.bin')
    return classifier
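# A minimal companion sketch (not part of the original snippet), assuming the
# fasttext_classifier.bin file written by train() above; the example sentence is
# illustrative only.
import fasttext

classifier = fasttext.load_model('fasttext_classifier.bin')
labels, probs = classifier.predict("example text to classify", k=2)
print(list(zip(labels, probs)))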
train_df = pd.read_csv('/Users/yowasa/Documents/天池入门NLP - 新闻文本分类/train_set.csv',
                       sep='\t', nrows=15000)
train_df['label_ft'] = '__label__' + train_df['label'].astype(str)
train_df[['text', 'label_ft']].iloc[:-5000].to_csv('train.csv', index=None, header=None, sep='\t')

model = fasttext.train_supervised('train.csv', lr=1, wordNgrams=3, dim=500, verbose=2,
                                  minCount=1, epoch=25, loss="softmax")

val_pred = [model.predict(x)[0][0].split('__')[-1] for x in train_df.iloc[-5000:]['text']]
print(f1_score(train_df['label'].values[-5000:].astype(str), val_pred, average='macro'))
import random
import os

import fasttext as ft

model = ft.train_supervised(input="__in.txt", epoch=500, lr=0.7)
model.save_model("wikihow.model")

results = model.test("__out.txt")
print(results)
            classification_report(test_Y, pred_Y, target_names=mlb.classes_))
        print("accuracy score: ", str(accuracy_score(test_Y, pred_Y)))
        # report_df.to_csv('Data/fast/preds/trec_train12_classification_report.csv', index=True)  # uncomment to generate the report

        # jaccard_similarity_score was removed from scikit-learn; jaccard_score with
        # average='samples' is the closest multilabel replacement
        from sklearn.metrics import jaccard_score
        from sklearn.metrics import hamming_loss
        jac_score = jaccard_score(test_Y, pred_Y, average='samples')
        loss = hamming_loss(test_Y, pred_Y)
        print(jac_score, loss)


if __name__ == '__main__':
    ft = FastText()
    ft.prepare_dataset()
    ft.prepare_train_test_val()
    ft.prepare_testData()

    model = fasttext.train_supervised(
        input='Data/pTrec.train.txt',
        autotuneValidationFile='Data/pTrec.val.txt',
        autotunePredictions=-1,
        autotuneDuration=1200)
    model.save_model('TRECmodel_autotune.ftz')

    ## -- optional ---
    # model = fasttext.load_model('TRECmodel_autotune.ftz')
    # print(model.test('Data/fast/papertrec_tweets_test.txt', k=-1))
    ## ---------------

    ## run the command from the terminal to generate prediction files using the generated model - change the file names according to the dataset
    # ./fastText-0.9.1/fasttext predict TRECmodel_autotune.ftz Data/fast/papertrec_tweets_test.txt -1 0.2 > Data/fast/paper/prediction_results/on_trec_test.txt

    ## -- after running the command on the terminal, run the following two lines of code
    # ft.prepare_prediction_file()
    # ft.generate_classification_report()
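# An illustrative sketch (not part of the original snippet) of the terminal command referenced
# above, done in Python instead; the file names are the ones assumed in the comments, and
# k=-1 with a 0.2 threshold mirrors the CLI arguments `predict ... -1 0.2`.
import fasttext

model = fasttext.load_model('TRECmodel_autotune.ftz')
with open('Data/fast/papertrec_tweets_test.txt', encoding='utf-8') as fin, \
        open('Data/fast/paper/prediction_results/on_trec_test.txt', 'w', encoding='utf-8') as fout:
    for line in fin:
        # drop any gold __label__ tokens before predicting
        text = ' '.join(tok for tok in line.strip().split() if not tok.startswith('__label__'))
        labels, _ = model.predict(text, k=-1, threshold=0.2)
        fout.write(' '.join(labels) + '\n')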