Example #1
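A minimal sketch of the imports this example appears to rely on; the project-specific helpers (Data, Regression, get_ranges) are assumptions about local modules and are shown commented out as placeholders.

import os
import csv
import pickle as pkl
from collections import Counter

import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

# Project-local helpers (assumed module paths, placeholders only):
# from data import Data, get_ranges
# from model import Regression
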
def load(FLAGS):
    """
    Load all data and store it in either a list (old) or in a dataset class (new)
    """
    questions = []
    queries = []
    answers = []
    impression_lvls = []
    engagement_lvls = []
    click_probs = []

    np.random.seed(42)

    filename_dataset = f"Data/dataset_filename={FLAGS.filename}_expanded={FLAGS.expanded}_balance={FLAGS.balance}_impression={FLAGS.impression}_reduced_classes={FLAGS.reduced_classes}_embedder={FLAGS.embedder}_negative_samples={FLAGS.negative_samples}.p"

    # Check if loadable file exists
    if not os.path.exists(FLAGS.folder):
        raise OSError(f"Folder {FLAGS.folder} does not exist")
    if not os.path.exists(FLAGS.folder + FLAGS.filename):
        raise OSError(f"File {FLAGS.folder+FLAGS.filename} does not exist")

    N = 500  # optional cap on the number of instances, used only for quick debugging runs

    with open(FLAGS.folder + FLAGS.filename) as tsvfile:
        tsvreader = csv.reader(tsvfile, delimiter="\t")
        # skip the first line (consists of labels)
        next(tsvreader, None)

        for i, line in enumerate(tsvreader):
            # skip the instances that have a low impression level
            if FLAGS.impression and line[7] == "low":
                continue

            # if i == N:
            #     break

            # Add values to the data lists
            queries.append(line[0])
            questions.append(line[1])
            answers.append([line[i] for i in range(2, 7)])
            impression_lvls.append(line[7])
            if FLAGS.reduced_classes:
                engagement_lvls.append(0 if int(line[8]) == 0 else 1)
            else:
                engagement_lvls.append(int(line[8]))
            click_probs.append([float(line[i]) for i in range(9, 14)])

    # Attempt to fix the class imbalance, assuming class 0 is too large
    if FLAGS.balance:
        # Index the locations of zeros and non-zeros
        engagement_lvls = np.array(engagement_lvls)
        zero_indices = np.where(engagement_lvls == 0)[0]
        non_zero_indices = np.where(engagement_lvls != 0)[0]

        # Target size: the count of class 1 when classes are reduced,
        # otherwise the median class size over all engagement levels
        if FLAGS.reduced_classes:
            median_size = int(Counter(engagement_lvls)[1])
        else:
            median_size = int(
                np.median(list(Counter(engagement_lvls).values())))

        # Subsample the zero class down to the target size and keep all non-zero indices
        sampled_indices = np.random.choice(zero_indices,
                                           median_size,
                                           replace=False)
        indices = np.concatenate((sampled_indices, non_zero_indices))

        # Update the data lists based on the sampled indices
        queries = [queries[i] for i in indices]
        questions = [questions[i] for i in indices]
        answers = [answers[i] for i in indices]
        impression_lvls = [impression_lvls[i] for i in indices]
        engagement_lvls = [engagement_lvls[i] for i in indices]
        click_probs = [click_probs[i] for i in indices]

    if FLAGS.expanded and FLAGS.negative_samples:
        # Get values for sampling
        n_questions = len(questions)
        ranges = get_ranges(queries)
        sampled_question_indices = []

        for r in ranges:
            # Negative samples for each query range
            samples = np.random.choice(
                [i for i in range(n_questions) if i not in r],
                FLAGS.sample_size,
                replace=False)
            sampled_question_indices.append(samples)

            # Update the engagement levels to 2 for max engagement and 1 for other
            max_engagement = np.max([engagement_lvls[i] for i in r])
            for i in r:
                if engagement_lvls[i] == max_engagement:
                    engagement_lvls[i] = 2
                else:
                    engagement_lvls[i] = 1

    # set language model
    if FLAGS.embedder == "Bert":
        # Flatten to load into embedder
        answers = [i for sublist in answers for i in sublist]

        embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

        question_embeds = embedder.encode(questions,
                                          convert_to_tensor=False,
                                          show_progress_bar=True,
                                          batch_size=128,
                                          num_workers=4)

        query_embeds = embedder.encode(queries,
                                       convert_to_tensor=False,
                                       show_progress_bar=True,
                                       batch_size=128,
                                       num_workers=4)

        answer_embeds = embedder.encode(answers,
                                        convert_to_tensor=False,
                                        show_progress_bar=True,
                                        batch_size=128,
                                        num_workers=4)

        query_embeds = torch.from_numpy(query_embeds)
        question_embeds = torch.from_numpy(question_embeds)
        answer_embeds = torch.from_numpy(answer_embeds)

        print(query_embeds.shape)
        print(question_embeds.shape)
        print(answer_embeds.shape)

        # Regroup the flattened answers into tuples of 5 per question
        answers = list(zip(*[iter(answers)] * 5))

        if FLAGS.expanded and FLAGS.negative_samples:
            # Convert the embeddings to lists so they can be extended
            answer_embeds = list(
                answer_embeds.reshape(query_embeds.shape[0], -1))
            question_embeds = list(question_embeds)
            query_embeds = list(query_embeds)

            # Extend the data with the negative samples
            for r, samples in zip(ranges, sampled_question_indices):
                queries.extend([queries[r[0]]] * len(samples))
                questions.extend([questions[i] for i in samples])
                answers.extend([answers[i] for i in samples])
                impression_lvls.extend([impression_lvls[i] for i in samples])
                engagement_lvls.extend([0] * len(samples))
                click_probs.extend([click_probs[i] for i in samples])
                query_embeds.extend([query_embeds[r[0]]] * len(samples))
                question_embeds.extend([question_embeds[i] for i in samples])
                answer_embeds.extend([answer_embeds[i] for i in samples])

            # Turn the embeddings back to torch tensors
            query_embeds = torch.stack(query_embeds)
            question_embeds = torch.stack(question_embeds)
            answer_embeds = torch.stack(answer_embeds)
            print(query_embeds.shape)
            print(question_embeds.shape)
            print(answer_embeds.shape)

    elif FLAGS.embedder == "TFIDF":
        # initialize the vectorizer
        if FLAGS.expanded:
            # Pickle files must be opened in binary mode
            with open(f"{FLAGS.folder}TFIDF_vocab.p", "rb") as f:
                vocab = pkl.load(f)
            vectorizer = TfidfVectorizer(vocabulary=vocab)
        else:
            vectorizer = TfidfVectorizer()

        if FLAGS.expanded and FLAGS.negative_samples:
            # Extend the data with the negative samples
            for r, samples in zip(ranges, sampled_question_indices):
                queries.extend([queries[r[0]]] * len(samples))
                questions.extend([questions[i] for i in samples])
                answers.extend([answers[i] for i in samples])
                impression_lvls.extend([impression_lvls[i] for i in samples])
                engagement_lvls.extend([0] * len(samples))
                click_probs.extend([click_probs[i] for i in samples])

        # create the corpus: a list of strings, one string per data instance
        corpus = [
            " ".join([queries[i], questions[i], " ".join(answers[i])])
            for i in range(len(queries))
        ]

        # this yields a sparse matrix (one row per data instance)
        X = vectorizer.fit_transform(corpus)
        if not FLAGS.expanded:
            with open(f"{FLAGS.folder}TFIDF_vocab.p", "wb") as f:
                pkl.dump(vectorizer.vocabulary_, f)

        # use code snippet from https://ray075hl.github.io/ray075hl.github.io/sparse_matrix_pytorch/ to convert to torch tensor
        X = X.tocoo().astype(np.float32)
        indices = torch.from_numpy(np.vstack((X.row, X.col))).long()
        values = torch.from_numpy(X.data)
        shape = torch.Size(X.shape)
        X = torch.sparse_coo_tensor(indices, values, shape)
        print(f"shape of X: {X.shape}")

    else:
        print(f"Embedder {FLAGS.embedder} does not exist")
        return

    # Either build the expanded dataset with all attributes (including the
    # predicted engagement levels), or the plain regression dataset with only
    # queries, questions and answers
    if FLAGS.expanded:
        # TODO
        # if statement if TFIDF or BERT
        # load neural net and perform forward pass on the data, yielding the predicted engagement levels
        if FLAGS.embedder == "Bert":
            answer_embeds = answer_embeds.reshape(query_embeds.shape[0], -1)

            input_matrix = torch.cat(
                (query_embeds, question_embeds, answer_embeds), dim=1)

            nn = Regression(n_inputs=input_matrix.shape[1],
                            n_hidden=[300, 32],
                            dropout_percentages=[0.0, 0.0],
                            n_classes=1,
                            batchnorm=True)
            nn.load_state_dict(torch.load("Models/Best_regression_model.pt"))
            nn.eval()
            with torch.no_grad():
                preds = nn(input_matrix).squeeze()
        elif FLAGS.embedder == "TFIDF":
            nn = Regression(n_inputs=X.shape[1],
                            n_hidden=[300, 32],
                            dropout_percentages=[0.0, 0.0],
                            n_classes=1,
                            batchnorm=True)
            # TODO: use the correct TF-IDF model (this path still points to a BERT model)
            nn.load_state_dict(
                torch.load(
                    "Models/Regression_Bert_SGD_0.0001_1e-05_300, 32_0.0, 0.0_True_40.pt"
                ))
            nn.eval()
            with torch.no_grad():
                preds = nn(X).squeeze()

        # Save in Data object
        dataset = Data(queries, questions, answers, impression_lvls,
                       engagement_lvls, click_probs, preds)

        # save the dataset
        with open(filename_dataset, "wb") as f:
            pkl.dump(dataset, f, protocol=4)
    # return the dataset for regression
    else:
        dataset = []
        if FLAGS.embedder == "Bert":
            for i, (query, question) in tqdm(
                    enumerate(zip(query_embeds, question_embeds))):

                # reshape answers
                answers = answer_embeds[i * 5:i * 5 + 5]
                answers = answers.reshape(-1)

                engagement_lvl = torch.Tensor([int(engagement_lvls[i])
                                               ]).float()

                inp = torch.cat((query, question, answers), 0)

                # Add the datapoint to the dataset
                dataset.append((inp, engagement_lvl))

        elif FLAGS.embedder == "TFIDF":
            for i, inp in enumerate(X):
                dataset.append(
                    (inp, torch.Tensor([int(engagement_lvls[i])]).float()))

        # save the dataset
        with open(filename_dataset, "wb") as f:
            pkl.dump(dataset, f)
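
A minimal usage sketch, assuming FLAGS is an argparse-style namespace carrying the fields read above; the flag values are illustrative only.

from argparse import Namespace

FLAGS = Namespace(
    folder="Data/",
    filename="MIMICS-Click.tsv",
    expanded=False,
    balance=True,
    impression=True,
    reduced_classes=False,
    embedder="Bert",
    negative_samples=False,
    sample_size=5,
)
load(FLAGS)  # writes the pickled dataset into the Data/ folder
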
Example #2
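The imports below are what this training loop appears to need; Regression, eval_on_test, plotting and significance_testing are project-local helpers, and their module paths here are assumptions shown as placeholders.

import os
import pickle as pkl

import numpy as np
import torch
from torch.utils.data import DataLoader, random_split

# Project-local helpers (assumed module paths, placeholders only):
# from model import Regression
# from evaluation import eval_on_test, plotting, significance_testing
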
def train():
    """
    Performs training and evaluation of the Regression model.
    """
    # Set the random seeds for reproducibility
    np.random.seed(10)
    torch.manual_seed(10)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Get number of units in each hidden layer
    if FLAGS.dnn_hidden_units:
        dnn_hidden_units = FLAGS.dnn_hidden_units.split(",")
        dnn_hidden_units = [
            int(dnn_hidden_unit_) for dnn_hidden_unit_ in dnn_hidden_units
        ]
    else:
        dnn_hidden_units = []

    # convert dropout percentages
    dropout_probs = [float(prob) for prob in FLAGS.dropout_probs.split(',')]

    # check if length of dropout is equal to nr of hidden layers
    if len(dropout_probs) != len(dnn_hidden_units):
        dropout_len = len(dropout_probs)
        hidden_len = len(dnn_hidden_units)
        if dropout_len < hidden_len:
            for _ in range(hidden_len - dropout_len):
                dropout_probs.append(0)
        else:
            dropout_probs = dropout_probs[:hidden_len]
    # use GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device :", device)

    # extract all data and divide into train, valid and test dataloaders
    dataset_filename = f"dataset_filename=MIMICS-Click.tsv_expanded=False_balance=True_impression={FLAGS.impression}_reduced_classes={FLAGS.reduced_classes}_embedder={FLAGS.embedder}.p"
    with open(os.path.join(FLAGS.data_dir, dataset_filename), "rb") as f:
        dataset = pkl.load(f)

    len_all = len(dataset)

    train_len, valid_len = int(0.7 * len_all), int(0.15 * len_all)
    test_len = len_all - train_len - valid_len
    splits = [train_len, valid_len, test_len]
    train_data, valid_data, test_data = random_split(dataset, splits)

    train_dl = DataLoader(train_data,
                          batch_size=FLAGS.batch_size,
                          shuffle=True,
                          drop_last=True)
    valid_dl = DataLoader(valid_data,
                          batch_size=FLAGS.batch_size,
                          shuffle=True,
                          drop_last=True)
    test_dl = DataLoader(test_data,
                         batch_size=FLAGS.batch_size,
                         shuffle=True,
                         drop_last=True)

    with open(f"{FLAGS.data_dir}/test_dl.pt", "wb") as f:
        pkl.dump(test_dl, f)

    # initialize MLP and loss function
    input_size = next(iter(train_dl))[0].shape[1]  # 5376 for BERT embeddings
    nn = Regression(input_size, dnn_hidden_units, dropout_probs, 1,
                    FLAGS.neg_slope, FLAGS.batchnorm).to(device)
    loss_function = torch.nn.MSELoss()

    if FLAGS.verbose:
        print(f"neural net:\n {[param.data for param in nn.parameters()]}")

    # initialize optimizer
    if FLAGS.optimizer == "SGD":
        optimizer = torch.optim.SGD(nn.parameters(),
                                    lr=FLAGS.learning_rate,
                                    weight_decay=FLAGS.weightdecay,
                                    momentum=FLAGS.momentum)
    elif FLAGS.optimizer == "Adam":
        optimizer = torch.optim.Adam(nn.parameters(),
                                     lr=FLAGS.learning_rate,
                                     amsgrad=FLAGS.amsgrad,
                                     weight_decay=FLAGS.weightdecay)
    elif FLAGS.optimizer == "AdamW":
        optimizer = torch.optim.AdamW(nn.parameters(),
                                      lr=FLAGS.learning_rate,
                                      amsgrad=FLAGS.amsgrad,
                                      weight_decay=FLAGS.weightdecay)
    elif FLAGS.optimizer == "RMSprop":
        optimizer = torch.optim.RMSprop(nn.parameters(),
                                        lr=FLAGS.learning_rate,
                                        weight_decay=FLAGS.weightdecay,
                                        momentum=FLAGS.momentum)

    # initialization for plotting and metrics
    training_losses = []
    valid_losses = []

    initial_train_loss = eval_on_test(nn, loss_function, train_dl, device)
    training_losses.append(initial_train_loss)
    initial_valid_loss = eval_on_test(nn, loss_function, valid_dl, device)
    valid_losses.append(initial_valid_loss)

    # construct name for saving models and figures
    variables_string = f"regression_{FLAGS.embedder}_{FLAGS.impression}_{FLAGS.reduced_classes}_{FLAGS.optimizer}_{FLAGS.learning_rate}_{FLAGS.weightdecay}_{FLAGS.momentum}_{FLAGS.dnn_hidden_units}_{FLAGS.dropout_probs}_{FLAGS.batchnorm}_{FLAGS.nr_epochs}"

    overall_batch = 0
    optimal_batch = 0
    min_valid_loss = float("inf")

    # training loop
    for epoch in range(FLAGS.nr_epochs):

        print(f"\nEpoch: {epoch}")

        for batch, (x, y) in enumerate(train_dl):
            nn.train()

            # put the batch on the device
            x = x.to(device)
            y = y.to(device)

            optimizer.zero_grad()

            # forward pass
            pred = nn(x).to(device)

            # compute loss and backpropagate
            loss = loss_function(pred, y)
            loss.backward()

            # update the weights
            optimizer.step()

            # save training loss
            training_losses.append(loss.item())

            # print(f"batch loss ({batch}): {loss.item()}")

            # get loss on validation set and evaluate
            if overall_batch % FLAGS.eval_freq == 0 and overall_batch != 0:
                valid_loss = eval_on_test(nn, loss_function, valid_dl, device)
                valid_losses.append(valid_loss)
                print(
                    f"Training loss: {loss.item()} / Valid loss: {valid_loss}")
                if valid_loss < min_valid_loss:
                    print(
                        f"Model is saved in epoch {epoch}, overall batch: {overall_batch}"
                    )
                    torch.save(nn.state_dict(),
                               f"Models/Regression_{variables_string}.pt")
                    min_valid_loss = valid_loss
                    optimal_batch = overall_batch

            overall_batch += 1

    # Load the optimal model (lowest validation loss) and evaluate it on the test set
    optimal_nn = Regression(input_size, dnn_hidden_units, dropout_probs, 1,
                            FLAGS.neg_slope, FLAGS.batchnorm).to(device)
    optimal_nn.load_state_dict(
        torch.load(f"Models/Regression_{variables_string}.pt"))

    test_loss, test_pred, test_true = eval_on_test(optimal_nn,
                                                   loss_function,
                                                   test_dl,
                                                   device,
                                                   verbose=FLAGS.verbose,
                                                   return_preds=True)

    # save the test predictions of the regressor
    with open(
            f"Predictions/regression_test_preds{FLAGS.embedder}_{FLAGS.reduced_classes}_{FLAGS.impression}.pt",
            "wb") as f:
        pkl.dump(test_pred, f)

    print(
        f"Loss on test set of optimal model (batch {optimal_batch}): {test_loss}"
    )

    significance_testing(test_pred, test_true, loss_function, FLAGS)

    if FLAGS.plotting:
        plotting(training_losses, valid_losses, test_loss, variables_string,
                 optimal_batch, FLAGS)
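
Both examples instantiate a Regression model with (n_inputs, n_hidden, dropout_percentages, n_classes, neg_slope, batchnorm). The original class is not part of these examples; the sketch below is only an assumed, minimal reconstruction that is consistent with how it is called above, and the real implementation may differ.

import torch.nn as torch_nn  # aliased to avoid clashing with the `nn` model variable above

class Regression(torch_nn.Module):
    """Assumed MLP regressor: Linear -> (BatchNorm) -> LeakyReLU -> Dropout per hidden layer."""

    def __init__(self, n_inputs, n_hidden, dropout_percentages, n_classes,
                 neg_slope=0.0, batchnorm=False):
        super().__init__()
        layers = []
        in_features = n_inputs
        for units, p_drop in zip(n_hidden, dropout_percentages):
            layers.append(torch_nn.Linear(in_features, units))
            if batchnorm:
                layers.append(torch_nn.BatchNorm1d(units))
            layers.append(torch_nn.LeakyReLU(neg_slope))
            layers.append(torch_nn.Dropout(p_drop))
            in_features = units
        layers.append(torch_nn.Linear(in_features, n_classes))
        self.model = torch_nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)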