import os
import json
from pathlib import Path
from uuid import uuid4

import pandas as pd
import torch

# Project-local helpers (get_vocab, get_id_label, load_dataset, DataIter,
# predict_parallel) and the InputDirectory/OutputDirectory annotations are
# assumed importable from this repo's utility modules.


def fasttext_score(scored_data_output_dir: OutputDirectory(),
                   fasttext_model_dir: InputDirectory() = '.'):
    print('=====================================================')
    print(f'fasttext_model: {Path(fasttext_model_dir).resolve()}')
    print(f'scored_data_output_dir: {scored_data_output_dir}')
    # Load the vocabulary and label maps saved alongside the model.
    path_word_to_index = os.path.join(fasttext_model_dir, 'word_to_index.json')
    word_to_index = get_vocab(path_word_to_index)
    path_label = os.path.join(fasttext_model_dir, 'label.txt')
    map_id_label, map_label_id = get_id_label(path_label)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('device:', device)
    # shared_params carries the max_len/ngram_size used at training time.
    path = os.path.join(fasttext_model_dir, 'shared_params.json')
    with open(path, 'r', encoding='utf-8') as f:
        shared_params = json.load(f)
    path = os.path.join(fasttext_model_dir, 'BestModel')
    model = torch.load(f=path, map_location=device)

    def run(files):
        """Score one mini-batch of files and persist the results as Parquet."""
        if len(files) == 0:
            return []
        with torch.no_grad():
            test_samples = load_dataset(file_path=files,
                                        max_len=shared_params['max_len'],
                                        ngram_size=shared_params['ngram_size'],
                                        word_to_index=word_to_index,
                                        map_label_id=map_label_id)
            test_iter = DataIter(samples=test_samples, batch_size=1,
                                 shuffle=False, device=device)
            results = predict_parallel(model, test_iter, map_id_label)
            dict_ = {'Filename': files, 'Class': results}
            df = pd.DataFrame(data=dict_)
            output_file = os.path.join(scored_data_output_dir,
                                       f"{uuid4().hex}.parquet")
            df.to_parquet(output_file, index=False)
        return results

    return run
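# Hypothetical smoke test (not part of the original component; paths are
# placeholders): fasttext_score() loads the model once and returns `run`,
# which the parallel-run framework then invokes per mini-batch of files.
if __name__ == '__main__':
    run = fasttext_score(scored_data_output_dir='outputs/scored',
                         fasttext_model_dir='outputs/model')
    print(run(['data/sample_0.txt', 'data/sample_1.txt']))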
import os
import json
from pathlib import Path

import torch

# get_vocab, get_id_label, load_dataset, DataIter and test() are assumed
# importable from this repo's utility modules.


def fasttext_evaluation(model_testing_result: OutputDirectory(),
                        trained_model_dir: InputDirectory() = None,
                        test_data_dir: InputDirectory() = None):
    print('=====================================================')
    print(f'trained_model_dir: {Path(trained_model_dir).resolve()}')
    print(f'test_data_dir: {Path(test_data_dir).resolve()}')
    # Vocabulary and label maps shipped with the test data.
    path_word_to_index = os.path.join(test_data_dir, 'word_to_index.json')
    word_to_index = get_vocab(path_word_to_index)
    path_label = os.path.join(test_data_dir, 'label.txt')
    map_id_label, map_label_id = get_id_label(path_label)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('device:', device)
    # Reuse the max_len/ngram_size the model was trained with.
    path = os.path.join(trained_model_dir, 'shared_params.json')
    with open(path, 'r', encoding='utf-8') as f:
        shared_params = json.load(f)
    path = os.path.join(test_data_dir, 'data.txt')
    test_samples = load_dataset(file_path=path,
                                max_len=shared_params['max_len'],
                                ngram_size=shared_params['ngram_size'],
                                word_to_index=word_to_index,
                                map_label_id=map_label_id)
    test_iter = DataIter(samples=test_samples, shuffle=False, device=device)
    path = os.path.join(trained_model_dir, 'BestModel')
    model = torch.load(f=path, map_location=device)
    # Evaluate and write the accuracy to result.json.
    path = os.path.join(model_testing_result, 'result.json')
    acc_ = test(model, test_iter)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump({"acc": acc_}, f)
    print('\n============================================')
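# Hypothetical follow-up (directory name is a placeholder): a downstream step
# can read back the accuracy that the component above wrote to result.json.
if __name__ == '__main__':
    with open(os.path.join('outputs/result', 'result.json'), encoding='utf-8') as f:
        print(json.load(f))  # prints {"acc": <accuracy returned by test()>}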
import os
import json

import torch

# get_vocab and get_id_label are assumed importable from this repo's utils.


def init():
    global model, word_to_index, map_label_id, map_id_label, device, shared_params
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('device:', device)
    model_dir = os.getenv('AZUREML_MODEL_DIR')
    # Model.register() uploads the parent directory (named after the output
    # port), so that directory has to be appended to model_dir.
    model_dir = os.path.join(model_dir, os.listdir(model_dir)[0])
    path = os.path.join(model_dir, 'shared_params.json')
    with open(path, 'r', encoding='utf-8') as f:
        shared_params = json.load(f)
    path_word_to_index = os.path.join(model_dir, 'word_to_index.json')
    word_to_index = get_vocab(path_word_to_index)
    path_label = os.path.join(model_dir, 'label.txt')
    map_id_label, map_label_id = get_id_label(path_label)
    model_name = 'BestModel'
    model_path = os.path.join(model_dir, model_name)
    model = torch.load(f=model_path, map_location=device)
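# Minimal sketch of the companion run() entry point that AzureML's serving
# runtime pairs with init() (init() runs once at container start, run() once
# per request). The request schema and the encode_texts() helper below are
# assumptions standing in for this repo's preprocessing, not the actual API.
def run(raw_data):
    texts = json.loads(raw_data)['data']  # assumed schema: {"data": ["text", ...]}
    with torch.no_grad():
        # Encode with the globals loaded in init(); encode_texts() is a
        # hypothetical stand-in for the repo's tokenization helper.
        batch = encode_texts(texts, word_to_index,
                             max_len=shared_params['max_len'],
                             ngram_size=shared_params['ngram_size'])
        preds = model(batch).argmax(dim=1).tolist()
    return [map_id_label[i] for i in preds]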
# window_size, hidden_size, batch_size, max_epoch and max_vocab_size are
# assumed to be defined earlier in this script; get_vocab, WindowIterator,
# CBOW, Adam and Word2vecTrainer come from the project's modules.
min_word_freq = 1
eval_interval = 1000
corpus_file = '../datasets/testdata.txt'

print('\033[92m[ Hyper parameters ]\033[0m')
print('- window_size:', window_size)
print('- hidden_size:', hidden_size)
print('- batch_size:', batch_size)
print('- max_epoch:', max_epoch)
print('- max_vocab_size:', max_vocab_size)
print('- min_word_freq:', min_word_freq)
print('- corpus:', corpus_file)
print()

train, word_to_id, id_to_word = get_vocab(corpus_file, max_vocab_size, min_word_freq)
vocab_size = len(word_to_id)
# Share of tokens mapped to the unknown-word id after vocabulary truncation.
unk_rate = train.count(word_to_id['UNK']) / len(train) * 100.0 if 'UNK' in word_to_id else 0.0

print('\n\033[92m[ statistics ]\033[0m')
print('- token_size:', len(train))
print('- vocab_size:', vocab_size)
print('- unk_rate: {:.2f}%'.format(unk_rate))

train_iter = WindowIterator(train, window_size, batch_size, max_epoch)
model = CBOW(vocab_size, hidden_size, window_size, train)
optimizer = Adam()
trainer = Word2vecTrainer(model, optimizer)
print('\n\033[92m[ progress ]\033[0m')
trainer.fit(train_iter, eval_interval=eval_interval)
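# Toy check of the CBOW windowing convention assumed above (the exact array
# layout of this repo's WindowIterator is an assumption): for window_size=1,
# each center word is paired with its immediate left and right neighbors.
demo = [0, 1, 2, 3, 4]  # token ids of a 5-word corpus
pairs = [([demo[i - 1], demo[i + 1]], demo[i]) for i in range(1, len(demo) - 1)]
print(pairs)  # [([0, 2], 1), ([1, 3], 2), ([2, 4], 3)]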
import os
import json
import time
import shutil

import torch

# get_vocab, get_id_label, load_dataset, DataIter, FastText and train() are
# assumed importable from this repo's utility modules.


def fasttext_train(trained_model_dir: OutputDirectory(type='ModelDirectory'),
                   training_data_dir: InputDirectory() = None,
                   validation_data_dir: InputDirectory() = None,
                   epochs=1,
                   batch_size=64,
                   max_len=32,
                   embed_dim=300,
                   hidden_size=256,
                   ngram_size=200000,
                   dropout=0.5,
                   learning_rate=0.001):
    print('============================================')
    print('training_data_dir:', training_data_dir)
    print('validation_data_dir:', validation_data_dir)
    path_word_to_index = os.path.join(training_data_dir, 'word_to_index.json')
    word_to_index = get_vocab(path_word_to_index)
    path_label = os.path.join(training_data_dir, 'label.txt')
    map_id_label, map_label_id = get_id_label(path_label)
    class_num = len(map_id_label)
    vocab_size = len(word_to_index)
    stop_patience = 5
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('device:', device)
    # load training dataset
    path = os.path.join(training_data_dir, 'data.txt')
    train_samples = load_dataset(file_path=path, word_to_index=word_to_index,
                                 map_label_id=map_label_id, max_len=max_len,
                                 ngram_size=ngram_size)
    train_iter = DataIter(samples=train_samples, batch_size=batch_size,
                          shuffle=True, device=device)
    # load validation dataset
    path = os.path.join(validation_data_dir, 'data.txt')
    dev_samples = load_dataset(file_path=path, word_to_index=word_to_index,
                               map_label_id=map_label_id, max_len=max_len,
                               ngram_size=ngram_size)
    dev_iter = DataIter(samples=dev_samples, batch_size=batch_size,
                        shuffle=True, device=device)
    model = FastText(vocab_size=vocab_size, class_num=class_num,
                     dropout=dropout, embed_dim=embed_dim,
                     hidden_size=hidden_size, ngram_size=ngram_size)
    # print the model structure (the bound-method repr includes the module tree)
    print(model.parameters)
    # copy word_to_index.json and label.txt next to the model for later scoring
    shutil.copy(src=path_word_to_index, dst=trained_model_dir)
    shutil.copy(src=path_label, dst=trained_model_dir)
    # persist the dataset parameters needed to reload data at scoring time
    shared_params = {'max_len': max_len, 'ngram_size': ngram_size}
    path = os.path.join(trained_model_dir, 'shared_params.json')
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(shared_params, f)
    start = time.time()
    train(model, trained_model_dir, train_iter=train_iter, dev_iter=dev_iter,
          epochs=epochs, learning_rate=learning_rate,
          stop_patience=stop_patience, device=device)
    end = time.time()
    print('\nduration of training process: %.2f sec' % (end - start))
    print('============================================')
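# Hypothetical local invocation (the directory layout is an assumption: each
# data dir holds data.txt plus the word_to_index.json and label.txt written
# by the preprocessing step).
if __name__ == '__main__':
    fasttext_train(trained_model_dir='outputs/model',
                   training_data_dir='data/train',
                   validation_data_dir='data/dev',
                   epochs=2, batch_size=64, learning_rate=0.001)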