Example #1
def main():
    # Load model
    model = BaselineModel(joblib.load('kfold_trained.joblib'))
    cached_model = CachedModel('data/cache/baseline.pkl', model)

    body_id = 2380
    headline = "Mystery of 50ft giant crab caught on camera in Kent harbour"
    stance = "discuss"

    print('Stance: {}'.format(stance))
    print('Headline: "{}"'.format(headline))

    test_dataset = DataSet(name="competition_test",
                           path="fnc_1_baseline/fnc-1")

    body = test_dataset.articles[body_id]
    true_label_id = LABELS.index(stance)

    tokens = tokenize(body)

    original_probabilities = model.predict_probabilities([headline], [body])[0]

    contributions = calculate_contributions(cached_model,
                                            original_probabilities, tokens,
                                            headline, body_id, true_label_id)
    by_index = {}

    for i, c in contributions:
        by_index[i] = c

    with_contributions = [(t, by_index.get(i)) for i, t in enumerate(tokens)]
    print(with_contributions)
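The `tokenize` and `calculate_contributions` helpers come from elsewhere in the project and are not shown here. As a purely hypothetical sketch of the idea, per-token contributions could be estimated by occluding one token at a time and measuring the drop in the model's confidence in the true label (the names and behaviour below are assumptions, not the project's actual code):

def calculate_contributions(model, original_probabilities, tokens,
                            headline, body_id, true_label_id):
    # Hypothetical occlusion scoring: remove each token in turn and compare
    # the probability of the true label with and without it. body_id is kept
    # only to mirror the call above; a cached model would presumably use it
    # as a cache key.
    baseline = original_probabilities[true_label_id]
    contributions = []
    for i in range(len(tokens)):
        occluded_body = ' '.join(tokens[:i] + tokens[i + 1:])
        probs = model.predict_probabilities([headline], [occluded_body])[0]
        contributions.append((i, baseline - probs[true_label_id]))
    return contributions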
Example #2
def main():
    # Load dataset
    dataset = DataSet(name="filtered_test", path="data")

    print('Replacing {} words in each example'.format(N_CHANGES))

    # Load model
    model = BaselineModel(joblib.load('kfold_trained.joblib'))
    cached_model = CachedModel('data/cache/baseline.pkl', model)

    changes_counts = []
    transformed_examples = []

    # Transform each example
    stances = [(i, stance) for i, stance in enumerate(dataset.stances)]
    for new_body_id, stance in tqdm(stances):
        try:
            headline = stance['Headline']
            body_id = stance['Body ID']
            original_body = dataset.articles[body_id]
            true_label_id = LABELS.index(stance['Stance'])

            new_body, changes = construct_example(cached_model, original_body,
                                                  body_id, headline,
                                                  true_label_id)
            transformed_examples.append({
                "Body ID": new_body_id,
                "articleBody": new_body,
                "Stance": LABELS[true_label_id],
                "Headline": headline,
                "Original body ID": body_id,
                "originalBody": original_body,
                "changes": changes,
            })
            changes_counts.append(len(changes))
        except Exception as e:
            print("Error for row {}: {}".format(new_body_id, e))

    with_changes = [c for c in changes_counts if c > 0]
    print('Changed {} of {} examples'.format(len(with_changes),
                                              len(changes_counts)))

    cached_model.save()  # Save the cache
    write_csvs(transformed_examples)
Example #3
    pretrained_embeddings = load_pretrained_embeddings(embeddings_path,
                                                       train_dataset.word2idx,
                                                       300, is_crf=crf_model)

    name_ = 'LSTM'
    hp = HyperParameters(name_, train_dataset.word2idx,
                         train_dataset.labels2idx,
                         pretrained_embeddings,
                         batch_size)

    # collate_fn=DatasetParser.pad_collate could be passed here to pad each batch
    train_dataset_ = DataLoader(dataset=train_dataset, batch_size=batch_size)
    dev_dataset_ = DataLoader(dataset=dev_dataset, batch_size=batch_size)
    test_dataset_ = DataLoader(dataset=test_dataset, batch_size=batch_size)

    model = BaselineModel(hp).to(train_dataset.get_device)
    trainer = Trainer(
        model=model,
        loss_function=CrossEntropyLoss(ignore_index=train_dataset.labels2idx['<PAD>']),
        optimizer=Adam(model.parameters()),
        batch_num=hp.batch_size,
        num_classes=hp.num_classes,
        verbose=True
    )

    save_to_ = join(RESOURCES_PATH, f"{model.name}_model.pt")
    trainer.train(train_dataset_, dev_dataset_, epochs=1, save_to=save_to_)

    evaluator = Evaluator(model, test_dataset_, crf_model)
    evaluator.check_performance(train_dataset.idx2label)
Example #4
        lambda row: normalize(row, maximums, minimums), axis=1)
    print(data.columns)
    for month in range(1, 13):
        data_by_month = data[data['month'] == month]
        for day in range(1, 32):
            data_by_day = data_by_month[data_by_month['day'] == day]
            if len(data_by_day) > 0:
                by_day_y.append(
                    statistics.mean(data_by_day['normalized_volume']))
                by_day_x.append(f'{month}-{day}')
    plt.plot(by_day_x, by_day_y)
    print('weak days')
    for day, mean_volume in zip(by_day_x, by_day_y):
        if mean_volume < 0.35:
            print(day)
    print('peak days')
    for day, mean_volume in zip(by_day_x, by_day_y):
        if mean_volume > 0.55:
            print(day)

    plt.show()


dataset = Dataset(file_name)
model = BaselineModel(dataset, {})
for year in range(2002, 2017, 2):
    predictions = model.predict(f'{year}-01-01', f'{year+1}-12-31')
    gold = dataset.get_subset(f'{year}-01-01', f'{year+1}-12-31')['Volume']
    mse, r2 = evaluate(gold, predictions)
    print(f'{year}, {year+1}: MSE: {mse}, R2: {r2}')
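The `evaluate` helper is defined elsewhere in the project. Assuming it simply wraps scikit-learn's regression metrics, a minimal sketch could look like this:

from sklearn.metrics import mean_squared_error, r2_score

def evaluate(gold, predictions):
    # Hypothetical helper: return (MSE, R2) for gold vs. predicted volumes.
    return mean_squared_error(gold, predictions), r2_score(gold, predictions)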
Example #5
File: main.py Project: spamz23/GFD
def main():

    # Parse arguments
    args = get_parser().parse_args()

    # Check for GPU
    is_gpu = len(tf.config.list_physical_devices("GPU")) > 0

    if not is_gpu and not args.allow_cpu:
        raise ValueError(
            "Cannot run the code on CPU. Please enable GPU support or pass the '--allow-cpu' flag."
        )

    # Check if dataset should be recreated
    if not os.path.exists("cleaned_dataset") or args.force_recreate:
        prepare_data()

    # ........................................................................
    # Let's select the train folds and the holdout fold
    # ........................................................................
    # Grab folds folders (fold_1, ..., fold_k)
    folds_folders = glob.glob("patched_dataset/**")

    # Randomly select 1 fold for being the holdout final test
    holdout_fold = np.random.choice(folds_folders, 1)[0]

    # Keep the rest for training and performing k-fold
    # Search for elements of `folds_folders` that are not in `holdout_fold`
    train_folds = np.setdiff1d(folds_folders, holdout_fold)

    print(f"Train folds: {train_folds}")
    print(f"Holdout fold: {holdout_fold}")

    for k, fold in enumerate(train_folds):
        # Print current fold
        print(f"Train Fold {k+1}:")
        print_fold_stats(fold)

    # Now for the holdout fold
    print("Holdout Fold:")
    print_fold_stats(holdout_fold)

    # Generate the bands to keep
    bands_to_keep = np.round(np.linspace(0, 272 - 1, args.bands)).astype(int)
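    # For example, with args.bands == 10 this keeps band indices
    # [0, 30, 60, 90, 120, 151, 181, 211, 241, 271] out of the 272 available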

    # Load test images and labels
    print("Loading holdout fold images")
    images, labels = load_fold(holdout_fold, bands_to_keep)
    test_data = (images, labels)
    # Create a list to hold the training fold datasets
    train_data_list = []
    for k, fold in enumerate(train_folds):
        print(f"Loading images of training fold {k+1}")
        # Load the normalized images to list
        images, labels = load_fold(fold, bands_to_keep)
        # Store the fold's images together with their labels
        train_data_list.append([images, labels])

    # Check if a baseline model should be created
    if args.baseline:
        # ........................................................................
        # Let's now establish a baseline model
        # ........................................................................
        model = BaselineModel().cross_validate(
            train_data_list,
            fully_connected_size=128,
            add_dropout_layers=True,
        )

        # How does the model perform on unseen data?
        result = model.evaluate(test_data[0], test_data[1], batch_size=64)
        result = dict(zip(model.metrics_names, result))
        print(result)

    # ........................................................................
    # Let's now try to use the Autoencoder
    # ........................................................................
    # Get train data (We only care about the features now)
    x_train, _ = build_train_data(train_data_list, range(len(train_data_list)))
    x_test = test_data[0]

    print("Training an Autoencoder")
    # Instantiate autoencoder
    autoencoder = Autoencoder(x_train[0].shape, n_bands=args.bands)

    # Prepare callbacks
    # 1. Reduce learning rate when loss is on plateau
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss",
                                                     factor=0.2,
                                                     patience=15,
                                                     min_lr=9e-4,
                                                     verbose=1)
    # 2. Early stopping
    early_stop = tf.keras.callbacks.EarlyStopping(
        monitor="val_loss",
        min_delta=0,
        patience=20,
        mode="auto",
        verbose=0,
        restore_best_weights=True,
    )

    # Compile the autoencoder
    autoencoder.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
        loss="mse",
    )

    # Train the model
    history = {}
    history["autoencoder"] = autoencoder.fit(
        x_train,
        x_train,
        validation_split=0.2,
        batch_size=16,
        epochs=250,
        verbose=1,
        callbacks=[reduce_lr, early_stop],
    )
    # Plot the Autoencoder loss curve
    plotter = tfdocs.plots.HistoryPlotter(metric="loss", smoothing_std=10)
    plotter.plot(history)
Example #6
def main(args):
    source_dataset = VideoFeaturesDataset(args.features_folder_source,
                                          list_file=args.list_file_source,
                                          num_frames=args.num_frames,
                                          sampling_strategy='TSNTrain')
    num_classes = source_dataset.num_classes

    target_dataset = VideoFeaturesDataset(args.features_folder_target,
                                          list_file=args.list_file_target,
                                          num_frames=args.num_frames,
                                          sampling_strategy='TSNTrain')

    num_samples = min(len(source_dataset), len(target_dataset))
    print(num_samples)
    source_dataset = DataLoader(source_dataset,
                                args.bs,
                                shuffle=False,
                                num_workers=args.num_workers,
                                drop_last=True,
                                sampler=SeqRandomSampler(
                                    source_dataset, num_samples=num_samples))
    target_dataset = DataLoader(target_dataset,
                                args.bs,
                                shuffle=False,
                                num_workers=args.num_workers,
                                drop_last=True,
                                sampler=SeqRandomSampler(
                                    target_dataset, num_samples=num_samples))

    val_target_dataset = VideoFeaturesDataset(args.features_folder_target,
                                              list_file=args.list_file_val,
                                              num_frames=args.num_frames,
                                              sampling_strategy='TSNVal')

    val_target_dataset = DataLoader(val_target_dataset,
                                    args.bs,
                                    shuffle=False,
                                    drop_last=False)

    model = BaselineModel(dial=args.dial,
                          bn_last=args.bn_last,
                          num_classes=num_classes)
    optimizer = torch.optim.Adam(
        model.parameters(), args.lr,
        weight_decay=1e-4)  #, momentum=0.9, nesterov=True)
    schedule = torch.optim.lr_scheduler.MultiStepLR(
        optimizer=optimizer, milestones=[args.num_epochs // 2])

    model = model.cuda()

    for epoch in range(args.num_epochs):
        print('Starting epoch %d / %d ' % (epoch + 1, args.num_epochs))

        loss_dict = run_epoch(model, source_dataset, target_dataset, optimizer,
                              args)
        print(', '.join(key + ': ' + str(value)
                        for key, value in loss_dict.items()))

        source_acc = check_accuracy(model, source_dataset)
        target_acc = check_accuracy(model, target_dataset)
        val_acc = check_accuracy(model, val_target_dataset)

        schedule.step()

        print('Source acc: %f, Target acc: %f, Train Val acc: %f' %
              (source_acc, target_acc, val_acc))

    return val_acc
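`run_epoch` and `check_accuracy` are helpers from the same project and are not shown in this snippet. A minimal sketch of what `check_accuracy` might do, assuming the loaders yield (features, labels) batches (the real implementation may unpack batches differently):

import torch

@torch.no_grad()
def check_accuracy(model, loader):
    # Hypothetical sketch: top-1 accuracy over a DataLoader of (features, labels).
    model.eval()
    correct, total = 0, 0
    for features, labels in loader:
        features, labels = features.cuda(), labels.cuda()
        predictions = model(features).argmax(dim=1)
        correct += (predictions == labels).sum().item()
        total += labels.size(0)
    model.train()
    return correct / total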
Example #7
    pretrained_embeddings = load_pretrained_embeddings(embeddings_path,
                                                       train_dataset.word2idx,
                                                       300, is_crf=crf_model)

    name_ = 'LSTM_CRF' if crf_model else 'LSTM'
    hp = HyperParameters(name_, train_dataset.word2idx,
                         train_dataset.labels2idx,
                         pretrained_embeddings,
                         batch_size)

    # train_dataset_ = DataLoader(dataset=train_dataset, batch_size=batch_size, collate_fn=TSVDatasetParser.pad_per_batch)
    train_dataset_ = DataLoader(dataset=train_dataset, batch_size=batch_size)
    dev_dataset_ = DataLoader(dataset=dev_dataset, batch_size=batch_size)
    test_dataset_ = DataLoader(dataset=test_dataset, batch_size=batch_size)

    if not crf_model:
        model = BaselineModel(hp).to(train_dataset.get_device)
        print(f'\n========== Model Summary ==========\n{torch_summarize(model)}')

        trainer = Trainer(
            model=model,
            loss_function=CrossEntropyLoss(ignore_index=train_dataset.labels2idx['<PAD>']),
            optimizer=Adam(model.parameters()),
            batch_num=hp.batch_size,
            num_classes=hp.num_classes,
            verbose=True
        )
        save_to_ = join(RESOURCES_PATH, f"{model.name}_model.pt")
        trainer.train(train_dataset_, dev_dataset_, epochs=1, save_to=save_to_)
    else:
        model = CRF_Model(hp).to(train_dataset.get_device)
        print(f'========== Model Summary ==========\n{torch_summarize(model)}')
Example #8
def run(options):
    # load dataset
    dataset = load_dataset(
        os.path.join(abs_output_data_folderpath, "processed_dataset.pkl"))

    mldata = MLDataset(dataset, 10)
    #train_inputs, train_outputs, val_inputs, val_outputs = preprocess_data(dataset)

    mlflow.keras.autolog()
    # setup model
    training = True
    if options.action == 'training':
        model = BaselineModel()
        model._functional_setup()
        train_inputs, train_outputs, val_inputs, val_outputs = mldata.get_kth_fold(0)
        model.train(train_inputs, train_outputs, val_inputs, val_outputs)
        mlflow.keras.save_model(model.model, "models")

    if options.action == 'optimize':
        model = BaselineModel()
        optimizer = Optimizer(BaselineModel, mldata)
        optimizer.hyper_parameter_opt()
        best_model = optimizer.best_model

    if options.action == 'predict':
        model = BaselineModel()
        model.load_model("models/")

        # Reuse fold 0's validation split for a quick visual check
        _, _, val_inputs, val_outputs = mldata.get_kth_fold(0)
        preds = model.predict(val_inputs[0:20])
        for i in range(20):
            plt.figure()
            plt.plot(preds[i], label="pred")
            plt.plot(val_outputs[i], label="real")
            plt.savefig(f"test{i}.png")
Example #9
import argparse
import csv

from time import time
import joblib
from sacremoses import MosesTokenizer, MosesDetokenizer

from utils.nlp import find_synonym, tokenize_and_tag
from models import BaselineModel, CachedModel
from nltk.corpus import wordnet as wn
from utils.synonyms import construct_example

DETOKENIZER = MosesDetokenizer()
N_CHANGES = 4

# Load model
model = BaselineModel(joblib.load('kfold_trained.joblib'))
cached_model = CachedModel('data/cache/baseline.pkl', model)
shared_list = list()

# pre-load WordNet
print(find_synonym("adversarial", wn.ADJ))


def write_csvs(transformed_examples, t):
    # t =round(time())
    with open('data/{}_baseline_bodies.csv'.format(t), 'w',
              encoding='utf-8') as csvfile:
        fieldnames = ['Body ID', 'articleBody', 'Original body ID']
        writer = csv.DictWriter(csvfile,
                                fieldnames=fieldnames,
                                extrasaction='ignore')
Example #10
    train_dataset_ = DataLoader(dataset=train_dataset, batch_size=batch_size)
    dev_dataset_ = DataLoader(dataset=dev_dataset, batch_size=batch_size)
    test_dataset_ = DataLoader(dataset=test_dataset, batch_size=batch_size)

    embeddings_path = os.path.join(RESOURCES_PATH, 'wiki.en.vec')
    pretrained_embeddings = load_pretrained_embeddings(
        embeddings_path, train_dataset.word2idx, 300,
        is_crf=CRF_MODEL) if PRETRAINED else None

    idx2label = load_pickle(
        os.path.join(RESOURCES_PATH,
                     'Stacked_BiLSTM_CRF_Fasttext_2315_idx2label.pkl'))
    word2idx = load_pickle(
        os.path.join(RESOURCES_PATH,
                     'Stacked_BiLSTM_CRF_Fasttext_2315_word2idx.pkl'))
    hp = HyperParameters(name_, word2idx, train_dataset.idx2label,
                         pretrained_embeddings, batch_size)

    model = CRF_Model(hp).to(
        train_dataset.get_device) if CRF_MODEL else BaselineModel(hp).to(
            train_dataset.get_device)
    model.load_model(model_path)

    evaluator = Evaluator(model, test_dataset_, CRF_MODEL)
    evaluator.check_performance(idx2label)
    tokens = test_dataset.data_x
    preds_lst = model.predict_sentences(tokens, word2idx, idx2label)
    with open('preds.txt', encoding='utf-8', mode='w+') as f:
        for lst in preds_lst:
            f.write(f"{str(lst)}\n")
Example #11
                save_to=save_to)

        if pretrained_pos_embeddings:
            # model = train_pos2vec(training_set.pos_x, 10, 300, 1e-3, 30)
            pos_embeddings_path = os.path.join(os.getcwd(), 'model',
                                               "pos_embeddings.npy")
            # save_pos_embeddings(model, pos_embeddings_path)
            pos_embeddings_ = load_pos_embeddings(pos_embeddings_path, pos2idx,
                                                  300)

        hp = HyperParameters(name_, word2idx, label2idx, pos2idx,
                             pretrained_embeddings_, batch_size)
        hp._print_info()

        # Create and train model
        model = BaselineModel(hp).to(device)
        model.print_summary()

        log_path = os.path.join(os.getcwd(), 'runs', hp.model_name)
        writer_ = WriterTensorboardX(log_path, logger=logging, enable=True)

        trainer = Trainer(
            model=model,
            writer=writer_,
            loss_function=CrossEntropyLoss(ignore_index=label2idx['<PAD>']),
            optimizer=Adam(model.parameters()),
            epochs=50,
            num_classes=hp.num_classes,
            verbose=True)

        save_to_ = os.path.join(os.getcwd(), 'model', f"{model.name}_model.pt")