Example #1
File: train.py Project: yxd886/fuse


# Load cached training records if present; otherwise build and cache them.
try:
    records = load("records")
    info("loaded saved records")
except Exception:
    records = get_all_data()
    info("no saved records")
    save(records, "records")

# Same caching pattern for the test data.
try:
    tests = load("tests")
    info("loaded saved tests")
except Exception:
    tests = get_test_data()
    info("no saved tests")
    save(tests, "tests")

with tf.device("/gpu:0"):
    model = Model()

    # Resume from saved weights when available.
    try:
        model.load_weights('weights')
        info("loaded saved weights")
    except Exception:
        info("no saved weights")

    optimizer = tf.keras.optimizers.Adam(learning_rate=4e-5, clipnorm=6)
    L2_regularization_factor = 1e-4
    sample_size = 6
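
Example #1 leans on helpers defined elsewhere in the project: `load`, `save`, `info`, and the data getters. A minimal sketch of the caching helpers, assuming pickle files and stdlib logging; the names mirror the snippet, but the implementations are guesses, not the project's actual code:

import logging
import pickle

logging.basicConfig(level=logging.INFO)


def info(msg):
    # Thin wrapper so the snippet's bare info(...) calls log at INFO level.
    logging.info(msg)


def save(obj, name):
    # Cache an arbitrary Python object under <name>.pkl.
    with open(f"{name}.pkl", "wb") as f:
        pickle.dump(obj, f)


def load(name):
    # Raises FileNotFoundError on a cache miss, which the try/except above catches.
    with open(f"{name}.pkl", "rb") as f:
        return pickle.load(f)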
Example #2
import sys
from pathlib import Path

import torch
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from transformers import DistilBertTokenizer

# parse_arguments, set_seed, get_train_data, get_test_data, HINTDataset,
# HINTModel, train_fn, val_fn and inference_fn are project-local helpers.


def main():
    args = parse_arguments(sys.argv[1:])
    set_seed(args['random_seed'])
    df = get_train_data()
    test_df = get_test_data()
    NUM_CLASSES = df['label'].nunique()

    train_texts, val_texts, train_labels, val_labels = train_test_split(df['sentence'], df['label_int'], random_state=args['random_seed'], test_size=.2)
    print(train_texts.shape, val_texts.shape, train_labels.shape, val_labels.shape)

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    train_encodings = tokenizer(train_texts.to_list(), truncation=True, padding=True)
    val_encodings = tokenizer(val_texts.to_list(), truncation=True, padding=True)
    test_encodings = tokenizer(test_df['sentence'].to_list(), truncation=True, padding=True)

    train_dataset = HINTDataset(train_encodings, train_labels.values)
    val_dataset = HINTDataset(val_encodings, val_labels.values)
    test_dataset = HINTDataset(test_encodings, test_df['label_int'].values)

    model = HINTModel(num_classes=NUM_CLASSES)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    model.to(device)
    model.ffn.train()  # put the feed-forward head in train mode (enables dropout there)

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

    optim = AdamW(model.parameters(), lr=args['learning_rate'])
    loss_fn = nn.CrossEntropyLoss()

    # Early-stopping bookkeeping: `step` counts epochs without validation
    # improvement; training stops once it reaches args['max_steps'].
    step = 0
    best_acc = 0

    Path(args['model_dir']).mkdir(parents=True, exist_ok=True)

    for epoch in range(args['epochs']):
        train_loss, train_acc, train_f1 = train_fn(model, train_loader, loss_fn, optim, device)
        val_loss, val_acc, val_f1 = val_fn(model, val_loader, loss_fn, device)
        print(f"{epoch+1}: train: [{train_loss:.3f}, {train_acc:.3f}, {train_f1:.3f}], val: [{val_loss:.3f}, {val_acc:.3f}, {val_f1:.3f}]")
        if val_acc > best_acc:
            best_acc = val_acc
            step = 0
            torch.save(model.state_dict(), f"{args['model_dir']}/{args['model_path']}")
        else:
            step += 1
        if step >= args['max_steps']:
            break

    model.load_state_dict(torch.load(f"{args['model_dir']}/{args['model_path']}", map_location=device))
    print("model successfully loaded!")
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

    preds, probs = inference_fn(model, test_loader, device)
    test_df['preds'] = preds
    test_df['probs'] = probs
    # Map unlabeled rows, and predictions below the confidence threshold,
    # to an extra "unknown" class id (NUM_CLASSES + 1).
    test_df['label_int'] = test_df['label_int'].fillna(NUM_CLASSES + 1)
    test_df['updated_preds'] = test_df['preds']
    test_df.loc[test_df['probs'] <= args['min_prob'], 'updated_preds'] = NUM_CLASSES + 1

    Path(args['output_dir']).mkdir(parents=True, exist_ok=True)
    test_df.to_csv(f"{args['output_dir']}/{args['test_file_name']}", index=False)
    
    acc1 = accuracy_score(test_df['label_int'], test_df['preds'])
    acc2 = accuracy_score(test_df['label_int'], test_df['updated_preds'])

    f11 = f1_score(test_df['label_int'], test_df['preds'], average='weighted')
    f12 = f1_score(test_df['label_int'], test_df['updated_preds'], average='weighted')

    print(f"Default: acc: {acc1}, f1_score: {f11}")
    print(f"Updated with Min Prob: acc: {acc2}, f1_score: {f12}")
Example #3
from sklearn.neural_network import MLPClassifier

# `data`, `generate_sentence_embeddings`, `generate_feature_vector`,
# `build_classifier_and_test`, and `VOCABULARY_SIZE` are project-local.


def test_on_forward_LSTM(model, classifier=MLPClassifier()):
    # Generate sentence embeddings with the forward model.
    sent1_train_indices, sent2_train_indices, word_to_index, index_to_word, label_train = data.get_train_data(
        VOCABULARY_SIZE)
    first_train_sentences = generate_sentence_embeddings(
        model, sent1_train_indices)
    second_train_sentences = generate_sentence_embeddings(
        model, sent2_train_indices)
    assert len(first_train_sentences) == len(second_train_sentences)

    # Generate the feature vector by all-pair comparison, then pooling.
    feature_vector_train = generate_feature_vector(first_train_sentences,
                                                   second_train_sentences)
    print("Train data Shape : ", feature_vector_train.shape)

    # Generate test-data embeddings.
    sent1_test_indices, sent2_test_indices, word_to_index, index_to_word, label_test = data.get_test_data(
        VOCABULARY_SIZE)
    first_test_sentences = generate_sentence_embeddings(
        model, sent1_test_indices)
    second_test_sentences = generate_sentence_embeddings(
        model, sent2_test_indices)
    assert len(first_test_sentences) == len(second_test_sentences)

    # Generate the feature vector for the test split: all-pair comparison, then pooling.
    feature_vector_test = generate_feature_vector(first_test_sentences,
                                                  second_test_sentences)
    print("Test data Shape : ", feature_vector_test.shape)

    # Build and evaluate the fully connected classifier.
    build_classifier_and_test(feature_vector_train,
                              label_train,
                              feature_vector_test,
                              label_test,
                              classifier,
                              print_train_result=False)
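
The comments describe `generate_feature_vector` as "all pair comparison, then pooling". One plausible reading, sketched here as an assumption (the project's real pooling scheme may differ): compare every state of the first sentence with every state of the second via dot products, then pool the comparison matrix into a fixed-size feature.

import numpy as np


def generate_feature_vector(first_sentences, second_sentences):
    # Hypothetical: dot-product every pair of states, then pool the
    # resulting similarity matrix down to a fixed-size feature vector.
    features = []
    for s1, s2 in zip(first_sentences, second_sentences):
        s1, s2 = np.atleast_2d(s1), np.atleast_2d(s2)
        sim = s1 @ s2.T                            # all-pair comparison
        features.append([sim.max(),                # strongest single match
                         sim.mean(),               # average similarity
                         sim.max(axis=1).mean(),   # best match per s1 state
                         sim.max(axis=0).mean()])  # best match per s2 state
    return np.asarray(features)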
Example #4
from sklearn.neural_network import MLPClassifier

# The embedding and feature helpers and `data` are project-local, as in Example #3.


def test_on_bidirectional_lstm(forward_model,
                               backward_model,
                               classifier=MLPClassifier()):
    # Generate sentence embeddings with the forward model.
    sent1_train_indices, sent2_train_indices, word_to_index, index_to_word, label_train = data.get_train_data(
        VOCABULARY_SIZE)
    first_train_sentences = generate_sentence_embeddings(
        forward_model, sent1_train_indices)
    second_train_sentences = generate_sentence_embeddings(
        forward_model, sent2_train_indices)

    # Generate sentence embeddings with the backward model.
    first_train_sentences_r = generate_sentence_embeddings(
        backward_model, sent1_train_indices)
    second_train_sentences_r = generate_sentence_embeddings(
        backward_model, sent2_train_indices)

    # Combine forward and backward embeddings of the first training sentences.
    first_train_sentences_combined = combine_forward_and_backward_vectors(
        first_train_sentences, first_train_sentences_r)

    # Combine forward and backward embeddings of the second training sentences.
    second_train_sentences_combined = combine_forward_and_backward_vectors(
        second_train_sentences, second_train_sentences_r)
    assert len(first_train_sentences_combined) == len(
        second_train_sentences_combined)

    # Generate the feature vector by all-pair comparison, then pooling.
    feature_vector_train = generate_feature_vector(
        first_train_sentences_combined, second_train_sentences_combined)
    print("Train data Shape : ", feature_vector_train.shape)

    # Generate test-data embeddings with the forward model.
    sent1_test_indices, sent2_test_indices, word_to_index, index_to_word, label_test = data.get_test_data(
        VOCABULARY_SIZE)
    first_test_sentences = generate_sentence_embeddings(
        forward_model, sent1_test_indices)
    second_test_sentences = generate_sentence_embeddings(
        forward_model, sent2_test_indices)

    # Generate test-data embeddings with the backward model.
    first_test_sentences_r = generate_sentence_embeddings(
        backward_model, sent1_test_indices)
    second_test_sentences_r = generate_sentence_embeddings(
        backward_model, sent2_test_indices)

    # Combine forward and backward test embeddings of the first sentences.
    first_test_sentences_combined = combine_forward_and_backward_vectors(
        first_test_sentences, first_test_sentences_r)

    # Combine forward and backward test embeddings of the second sentences.
    second_test_sentences_combined = combine_forward_and_backward_vectors(
        second_test_sentences, second_test_sentences_r)
    assert len(first_test_sentences_combined) == len(
        second_test_sentences_combined)

    # Generate the feature vector for the test split: all-pair comparison, then pooling.
    feature_vector_test = generate_feature_vector(
        first_test_sentences_combined, second_test_sentences_combined)
    print("Test data Shape : ", feature_vector_test.shape)

    # Build and evaluate the fully connected classifier.
    build_classifier_and_test(feature_vector_train,
                              label_train,
                              feature_vector_test,
                              label_test,
                              classifier,
                              print_train_result=False)
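
`combine_forward_and_backward_vectors` is also not shown. The usual way to merge directional LSTM representations is concatenation along the feature axis, so a likely sketch (an assumption, not the project's confirmed code) is:

import numpy as np


def combine_forward_and_backward_vectors(forward_vectors, backward_vectors):
    # Concatenate each forward embedding with its backward counterpart,
    # doubling the feature dimension, as in a standard BiLSTM.
    return [np.concatenate((fwd, bwd), axis=-1)
            for fwd, bwd in zip(forward_vectors, backward_vectors)]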
Example #5
import numpy as np
from data import input_neurons, hidden_neurons, output_neurons
from data import get_test_data, get_training_data
from network import Network

network = Network([input_neurons, hidden_neurons, output_neurons])
# Uncomment these lines to start the network from previously saved weights
# (which give a strong test result):
# saved_weights = np.load("weights.npy", mmap_mode=None, allow_pickle=True)
# network.weights = saved_weights

training_inputs = get_training_data()
test_inputs = get_test_data()

network.start_training(training_inputs,
                       1000,
                       learning_rate=0.7,
                       test_inputs=test_inputs)

print("\nRESULTADOS:")
for x, single_test in enumerate(test_inputs):
    if x == 0:
        print("\nSEM RUIDOS:")
    if x == 34:
        print("\nRUIDO MÍNIMO:")
    if x == 54:
        print("\nRUIDO MÉDIO:")
    if x == 74:
        print("\nRUIDO AVANÇADO:")
    if x == 94:
        print("\nNÃO FAZEM PARTE:")
    network.identify(single_test, log=True)
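
The counterpart to the commented-out loading at the top would be saving the trained weights after `start_training` returns. A one-line sketch, assuming `network.weights` is the list of per-layer arrays the loading code implies:

# Persist the trained weights so a later run can uncomment the loading
# lines above and start from them.
np.save("weights.npy", np.array(network.weights, dtype=object))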