# Example 1
# Shared imports assumed by the snippets below. Each example was extracted from a
# larger module, so helper functions such as get_vocab, get_id_label, load_dataset,
# DataIter, predict_parallel, test, train, FastText, CBOW, WindowIterator, Adam and
# Word2vecTrainer, as well as the InputDirectory/OutputDirectory annotations from the
# Azure ML component SDK, are provided by the projects these snippets were taken from.
import json
import os
import shutil
import time
from pathlib import Path
from uuid import uuid4

import pandas as pd
import torch


def fasttext_score(
        scored_data_output_dir: OutputDirectory(),
        fasttext_model_dir: InputDirectory() = '.'
):
    print('=====================================================')
    print(f'fasttext_model: {Path(fasttext_model_dir).resolve()}')
    print(f'scored_data_output_dir: {scored_data_output_dir}')
    path_word_to_index = os.path.join(fasttext_model_dir, 'word_to_index.json')
    word_to_index = get_vocab(path_word_to_index)
    path_label = os.path.join(fasttext_model_dir, 'label.txt')
    map_id_label, map_label_id = get_id_label(path_label)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('device:', device)
    path = os.path.join(fasttext_model_dir, 'shared_params.json')
    with open(path, 'r', encoding='utf-8') as f:
        shared_params = json.load(f)
    path = os.path.join(fasttext_model_dir, 'BestModel')
    model = torch.load(f=path, map_location=device)

    def run(files):
        if len(files) == 0:
            return []
        with torch.no_grad():
            test_samples = load_dataset(file_path=files, max_len=shared_params['max_len'],
                                        ngram_size=shared_params['ngram_size'], word_to_index=word_to_index,
                                        map_label_id=map_label_id)
            test_iter = DataIter(samples=test_samples, batch_size=1, shuffle=False, device=device)
            results = predict_parallel(model, test_iter, map_id_label)
            dict_ = {'Filename': files, 'Class': results}
            df = pd.DataFrame(data=dict_)
            output_file = os.path.join(scored_data_output_dir, f"{uuid4().hex}.parquet")
            df.to_parquet(output_file, index=False)
        return results

    return run
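
# A minimal usage sketch, not part of the original example: fasttext_score builds and
# returns `run` so that a batch-scoring driver (e.g. a ParallelRunStep-style entry
# script) can call it once per mini-batch of file paths. The directory and file names
# below are hypothetical.
score = fasttext_score(scored_data_output_dir='outputs/scored',
                       fasttext_model_dir='outputs/trained_model')
labels = score(['data/part-00000.txt', 'data/part-00001.txt'])
print(labels)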

# Example 2
def fasttext_evaluation(model_testing_result: OutputDirectory(),
                        trained_model_dir: InputDirectory() = None,
                        test_data_dir: InputDirectory() = None):
    print('=====================================================')
    print(f'trained_model_dir: {Path(trained_model_dir).resolve()}')
    print(f'test_data_dir: {Path(test_data_dir).resolve()}')
    path_word_to_index = os.path.join(test_data_dir, 'word_to_index.json')
    word_to_index = get_vocab(path_word_to_index)
    path_label = os.path.join(test_data_dir, 'label.txt')
    map_id_label, map_label_id = get_id_label(path_label)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('device:', device)
    path = os.path.join(trained_model_dir, 'shared_params.json')
    with open(path, 'r', encoding='utf-8') as f:
        shared_params = json.load(f)
    path = os.path.join(test_data_dir, 'data.txt')
    test_samples = load_dataset(file_path=path,
                                max_len=shared_params['max_len'],
                                ngram_size=shared_params['ngram_size'],
                                word_to_index=word_to_index,
                                map_label_id=map_label_id)
    test_iter = DataIter(samples=test_samples, shuffle=False, device=device)
    path = os.path.join(trained_model_dir, 'BestModel')
    model = torch.load(f=path, map_location=device)
    path = os.path.join(model_testing_result, 'result.json')
    acc_ = test(model, test_iter)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump({"acc": acc_}, f)
    print('\n============================================')
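
# Example 3
# init() is the entry point that an AzureML online-scoring script exposes: it is called
# once when the service starts and loads the trained FastText model plus its vocabulary
# and label mappings into module-level globals.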
def init():
    global model, word_to_index, map_label_id, map_id_label, device, shared_params
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('device:', device)
    model_dir = os.getenv('AZUREML_MODEL_DIR')
    # Model.register() uploads the parent directory, so that directory has to be
    # included in model_dir. The name of this parent directory is the output port name.
    model_dir = os.path.join(model_dir, os.listdir(model_dir)[0])
    path = os.path.join(model_dir, 'shared_params.json')
    with open(path, 'r', encoding='utf-8') as f:
        shared_params = json.load(f)
    path_word_to_index = os.path.join(model_dir, 'word_to_index.json')
    word_to_index = get_vocab(path_word_to_index)
    path_label = os.path.join(model_dir, 'label.txt')
    map_id_label, map_label_id = get_id_label(path_label)
    model_name = 'BestModel'
    model_path = os.path.join(model_dir, model_name)
    model = torch.load(f=model_path, map_location=device)
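
# A minimal sketch of the run() entry point that normally accompanies init() in an
# AzureML scoring script; it is not part of the original example. The request payload
# format and the preprocess() helper are assumptions: the real service would have to
# turn raw text into the token/ngram tensors this FastText model expects (e.g. via
# load_dataset/DataIter as in the snippets above).
def run(raw_data):
    try:
        texts = json.loads(raw_data)['data']  # assumed payload: {"data": ["some text", ...]}
        predictions = []
        with torch.no_grad():
            for text in texts:
                batch = preprocess(text, word_to_index, shared_params)  # hypothetical helper
                logits = model(batch)
                pred_id = int(torch.argmax(logits, dim=-1))
                predictions.append(map_id_label[pred_id])  # id -> label name
        return json.dumps({'predictions': predictions})
    except Exception as e:
        return json.dumps({'error': str(e)})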

# Example 4
# (window_size, hidden_size, batch_size, max_epoch and max_vocab_size are defined
# earlier in the original script; only the remaining hyperparameters appear below.)
min_word_freq = 1
eval_interval = 1000

corpus_file = '../datasets/testdata.txt'

print('\033[92m[ Hyper parameters ]\033[0m')
print('- window_size:', window_size)
print('- hidden_size:', hidden_size)
print('- batch_size:', batch_size)
print('- max_epoch:', max_epoch)
print('- max_vocab_size:', max_vocab_size)
print('- min_word_freq:', min_word_freq)
print('- corpus:', corpus_file)
print()

train, word_to_id, id_to_word = get_vocab(corpus_file, max_vocab_size, min_word_freq)
vocab_size = len(word_to_id)
unk_rate = (train.count(word_to_id['UNK']) / len(train) * 100.0
            if 'UNK' in word_to_id else 0.0)

print('\n\033[92m[ statistics ]\033[0m')
print('- token_size:', len(train))
print('- vocab_size:', vocab_size)
print('- unk_rate: {:.2f}%'.format(unk_rate))

train_iter = WindowIterator(train, window_size, batch_size, max_epoch)
model = CBOW(vocab_size, hidden_size, window_size, train)
optimizer = Adam()
trainer = Word2vecTrainer(model, optimizer)

print('\n\033[92m[ progress ]\033[0m')
trainer.fit(train_iter, eval_interval=eval_interval)
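
# A small follow-up sketch, not in the original script: persist the vocabulary so the
# trained embeddings can be mapped back to words later. Only variables defined above
# are used; the output file name is hypothetical (int keys become strings in JSON).
with open('cbow_vocab.json', 'w', encoding='utf-8') as f:
    json.dump({'word_to_id': word_to_id, 'id_to_word': id_to_word}, f, ensure_ascii=False)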

# Example 5
def fasttext_train(trained_model_dir: OutputDirectory(type='ModelDirectory'),
                   training_data_dir: InputDirectory() = None,
                   validation_data_dir: InputDirectory() = None,
                   epochs=1,
                   batch_size=64,
                   max_len=32,
                   embed_dim=300,
                   hidden_size=256,
                   ngram_size=200000,
                   dropout=0.5,
                   learning_rate=0.001):
    print('============================================')
    print('training_data_dir:', training_data_dir)
    print('validation_data_dir:', validation_data_dir)
    path_word_to_index = os.path.join(training_data_dir, 'word_to_index.json')
    word_to_index = get_vocab(path_word_to_index)
    path_label = os.path.join(training_data_dir, 'label.txt')
    map_id_label, map_label_id = get_id_label(path_label)
    class_num = len(map_id_label)
    vocab_size = len(word_to_index)
    stop_patience = 5
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('device:', device)
    # load training dataset
    path = os.path.join(training_data_dir, 'data.txt')
    train_samples = load_dataset(file_path=path,
                                 word_to_index=word_to_index,
                                 map_label_id=map_label_id,
                                 max_len=max_len,
                                 ngram_size=ngram_size)
    train_iter = DataIter(samples=train_samples,
                          batch_size=batch_size,
                          shuffle=True,
                          device=device)
    # load validation dataset
    path = os.path.join(validation_data_dir, 'data.txt')
    dev_samples = load_dataset(file_path=path,
                               word_to_index=word_to_index,
                               map_label_id=map_label_id,
                               max_len=max_len,
                               ngram_size=ngram_size)
    dev_iter = DataIter(samples=dev_samples,
                        batch_size=batch_size,
                        shuffle=True,
                        device=device)

    model = FastText(vocab_size=vocab_size,
                     class_num=class_num,
                     dropout=dropout,
                     embed_dim=embed_dim,
                     hidden_size=hidden_size,
                     ngram_size=ngram_size)
    # show the model structure (printing the bound .parameters method also prints the module repr)
    print(model.parameters)
    # copy word_to_index.json and label.txt for later scoring.
    shutil.copy(src=path_word_to_index, dst=trained_model_dir)
    shutil.copy(src=path_label, dst=trained_model_dir)
    # shared parameters for loading dataset
    shared_params = {'max_len': max_len, 'ngram_size': ngram_size}
    path = os.path.join(trained_model_dir, 'shared_params.json')
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(shared_params, f)
    start = time.time()
    train(model,
          trained_model_dir,
          train_iter=train_iter,
          dev_iter=dev_iter,
          epochs=epochs,
          learning_rate=learning_rate,
          stop_patience=stop_patience,
          device=device)
    end = time.time()
    print('\nduration of training process: %.2f sec' % (end - start))
    print('============================================')
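
# A minimal local invocation sketch, not from the original example, assuming the
# component decorator (not shown here) keeps fasttext_train callable as a plain Python
# function. The directory layout follows what the function itself reads and writes:
# the training directory must contain data.txt, word_to_index.json and label.txt, the
# validation directory must contain data.txt, and the output directory receives the
# copied vocab/label files, shared_params.json and the BestModel checkpoint. All paths
# are hypothetical.
if __name__ == '__main__':
    os.makedirs('outputs/trained_model', exist_ok=True)
    fasttext_train(trained_model_dir='outputs/trained_model',
                   training_data_dir='data/train',
                   validation_data_dir='data/valid',
                   epochs=2,
                   batch_size=64)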