def val(epoch, validation_dataframe, embed_dict, dual_encoder, optimizer,
        loss_func, device):

    shuffle_dataframe(validation_dataframe)

    validation_correct_count = 0

    sum_loss_validation = 0.0

    dual_encoder.eval()
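    # eval() disables dropout/batch-norm updates; since no backward pass follows,
    # the loop below could also run under torch.no_grad() to skip gradient tracking.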

    for index, row in validation_dataframe.iterrows():
        if row["Label"] == 1:
            row = {
                "Context": row["Context"],
                "Utterance": row["Correct"],
                "Label": 1
            }
        else:
            row = {
                "Context": row["Context"],
                "Utterance": random.choice(list(row)[3:]),
                "Label": 0
            }

        context = get_embeddings(embed_dict, row['Context'])
        response = get_embeddings(embed_dict, row['Utterance'])
        label = float(row['Label'])

        context = autograd.Variable(torch.FloatTensor(context).view(
            len(context), 1, -1),
                                    requires_grad=False).to(device)
        response = autograd.Variable(torch.FloatTensor(response).view(
            len(response), 1, -1),
                                     requires_grad=False).to(device)

        label = autograd.Variable(
            torch.FloatTensor(torch.from_numpy(np.array(label).reshape(
                1, 1)))).to(device)
        score = dual_encoder(context, response)
        loss = loss_func(score, label)

        sum_loss_validation += loss.data

        validation_correct_count = increase_count(validation_correct_count,
                                                  torch.sigmoid(score), label)

    validation_accuracy = validation_correct_count / len(validation_dataframe)
    val_loss = sum_loss_validation / len(validation_dataframe)

    return validation_accuracy, val_loss
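A minimal driver sketch for the validation loop above and the train loop defined later on this page; dual_encoder, optimizer, the dataframes, embed_dict, and num_epochs are assumed to exist already, so this is an illustration rather than the repository's actual entry point:

import torch

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
loss_func = torch.nn.BCEWithLogitsLoss()  # raw scores go to the loss; sigmoid is applied only for counting

for epoch in range(num_epochs):
    train_acc, train_loss = train(epoch, training_dataframe, embed_dict,
                                  dual_encoder, optimizer, loss_func, device)
    val_acc, val_loss = val(epoch, validation_dataframe, embed_dict,
                            dual_encoder, optimizer, loss_func, device)
    print(f'epoch {epoch}: train loss {train_loss:.4f} acc {train_acc:.3f} | '
          f'val loss {val_loss:.4f} acc {val_acc:.3f}')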
Example 2
    def setup_runtime(self, ref_samples):
        self.model.eval()
        self.metric_fc.eval()
        # create reference
        self.create_ref_dataset(ref_samples)
        self.ref_embs = get_embeddings(self.model, self.dl['ref'], self.device)
        self.logger.info('Calculated reference embeddings.')
Example 3
    def predict(self, test_samples, test_labels=None, return_raw=False):
        self.create_test_dataset(test_samples, ['good'] * len(test_samples) if test_labels is None else test_labels)
        test_embs = get_embeddings(self.model, self.dl['test'], self.device, return_y=False)
        sample_distances = n_by_m_distances(test_embs, self.ref_embs)
        if return_raw:
            return sample_distances.min(axis=-1), sample_distances
        return sample_distances.min(axis=-1)
Example 4
def build_graph(config):

    word2idx, idx2word = get_vocabs(config['vocab_file'])
    embeddings = get_embeddings(word2idx, config['s2v_file'])

    weights = config.get('weights', [1 for _ in config['metrics']])
    assert len(config['metrics']) == len(weights)
    metrics = {m: {'weight': w} for m, w in zip(config['metrics'], weights)}

    if 'lm' in metrics:
        metrics['lm'].update(
            dict(forward=config['lm_save_dir'],
                 reverse=config.get('lm_rev_save_dir', None),
                 num_words=len(word2idx)))

    if 'cos' in metrics:
        idf_file = config.get('idf_file', None)
        if idf_file is not None:
            metrics['cos'].update(
                dict(idf=get_idf_vector(idf_file, word2idx),
                     embeddings=embeddings))
        else:
            metrics['cos'].update(dict(embeddings=embeddings))

    sess_config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
    sess = tf.Session(config=sess_config)

    model_inputs, model_outputs = get_model(metrics, mode=config['mode'])

    if 'lm' in metrics:
        init_lm_checkpoints(metrics['lm'])
    sess.run(tf.global_variables_initializer())
    return sess, model_inputs, model_outputs, embeddings, word2idx, idx2word
Example 5
def train(epoch, training_dataframe, embed_dict, dual_encoder, optimizer,
          loss_func, device):
    shuffle_dataframe(training_dataframe)
    sum_loss_training = 0.0
    training_correct_count = 0
    dual_encoder.train()

    for index, row in training_dataframe.iterrows():

        if row["Label"] == 1:
            row = {
                "Context": row["Context"],
                "Utterance": row["Correct"],
                "Label": 1
            }
        else:
            row = {
                "Context": row["Context"],
                "Utterance": random.choice(list(row)[3:]),
                "Label": 0
            }
        context = get_embeddings(embed_dict, row['Context'])
        response = get_embeddings(embed_dict, row['Utterance'])
        label = row['Label']
        label = np.array(label).astype(np.float32)

        context = autograd.Variable(torch.FloatTensor(context).view(
            len(context), 1, -1),
                                    requires_grad=False).to(device)
        response = autograd.Variable(torch.FloatTensor(response).view(
            len(response), 1, -1),
                                     requires_grad=False).to(device)
        label = autograd.Variable(torch.FloatTensor(
            torch.from_numpy(np.array(label).reshape(1, 1))),
                                  requires_grad=False).to(device)
        score = dual_encoder(context, response)
        loss = loss_func(score, label)
        sum_loss_training += loss.data
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        training_correct_count = increase_count(training_correct_count,
                                                torch.sigmoid(score), label)

    training_accuracy = training_correct_count / len(training_dataframe)
    train_loss = sum_loss_training / len(training_dataframe)
    return training_accuracy, train_loss
Example 6
def update_database(in_path, image, name):

    with open(in_path, "rb") as pkl_in:
        database = pickle.load(pkl_in)

    embeddings_set, id_to_name = database

    if name:
        embeddings, _ = get_embeddings(image)
    else:
        embeddings, name = get_embeddings(image)

    if embeddings is not None:
        embeddings_set = torch.cat(
            (embeddings_set, embeddings.reshape(1, 1, -1)), dim=0)
        id_to_name[len(id_to_name)] = name

    database = [embeddings_set, id_to_name]

    with open(in_path, "wb") as pkl_out:
        pickle.dump(database, pkl_out)
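A usage sketch for the two pickle-database helpers (create_database appears in Example 8 below); the file names are illustrative:

create_database("faces/", "db.pkl")               # build the database from a folder of images
update_database("db.pkl", "new_face.jpg", None)   # name is inferred by get_embeddings
update_database("db.pkl", "alice.jpg", "Alice")   # an explicit name skips inference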
Example 7
def checkImages():
    try:
        databaseName = request.args.get('databaseName')
        imageId = request.args.get('imageId')
        # select the relevant information from the user request

        # acquire the database images based on the request
        # (loading of img1 and img2, or a precomputed vector2, is elided here)
        face1 = get_face_image(img1)
        face2 = get_face_image(img2)
        # face extraction can be skipped if vectors are supplied directly (saves ~1 s)
        vector1 = get_embeddings(face1)
        vector2 = get_embeddings(face2)
        # get the distance (the distance measure can be changed)
        distOfImages = calc_dist(vector1, vector2)
        # get the match score of the image
        score = get_match_score(distOfImages)
        # return the score as an API response to the client
        return jsonify({"Score": score})
    except:
        return jsonify({'trace': traceback.format_exc()})
Example 8
def create_database(in_path, out_path):

    images_list = os.listdir(in_path)
    embeddings_set = torch.rand(len(images_list), 1, 512)
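    # NOTE: the set is pre-filled with random vectors; images whose embeddings
    # come back as None are skipped below and keep this random placeholder row.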
    id_to_name = {}
    for i, image in enumerate(images_list):
        embeddings, name = get_embeddings(os.path.join(in_path, image))
        if embeddings is not None:
            embeddings_set[i] = embeddings
            id_to_name[i] = name
    database = [embeddings_set, id_to_name]

    with open(out_path, "wb") as pkl_out:
        pickle.dump(database, pkl_out)
Example 9
    def findBest(self, utterance, options):
        """
        finds the best utterance out of all those given in options
        :param utterance: a single string
        :param options: a sequence of strings
        :return: returns one of the strings of options
        """
        self.dual_encoder.eval()
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        scores = []
        context_embed = get_embeddings(self.embed_dict, utterance)
        context = autograd.Variable(torch.FloatTensor(context_embed).view(
            len(context_embed), 1, -1),
                                    requires_grad=False).to(device)

        for answer in options:
            response = get_embeddings(self.embed_dict, answer)
            response = autograd.Variable(torch.FloatTensor(response).view(
                len(response), 1, -1),
                                         requires_grad=False).to(device)
            score = self.dual_encoder(context, response)
            scores.append(score.item())  # take the scalar so np.argmax works for GPU tensors too
        pred = np.argmax(scores)  # pick the answer with the highest score
        return options[pred]
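Assuming an object that carries embed_dict and dual_encoder, ranking candidate answers then reduces to a single call; the variable names here are illustrative:

options = ["click 'forgot password' on the login page",
           "the weather is nice today"]
best = responder.findBest("how do I reset my password?", options)
print(best)  # the candidate whose dual-encoder score is highest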
Example 10
def prepare(args, config):
    word2idx, idx2word = get_vocabs(args.vocab_file)
    try:
        embeddings = get_embeddings(word2idx, args.w2v_file)
    except FileNotFoundError:
        logging.info(
            'embedding file not found. Train embeddings from scratch instead')
        embeddings = None
    with tf.variable_scope('LanguageModel'):
        model_inputs, model_outputs = get_model(config, embeddings,
                                                len(word2idx))

    sess_config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
    sess = tf.Session(config=sess_config)
    sess.run(tf.global_variables_initializer())

    return word2idx, model_inputs, model_outputs, sess
Example 11
    def visualize_after_eval(self, values, test_files, test_labels, test_y_trues):
        auc, pauc, norm_threshs, norm_factor, scores, raw_scores = values

        self.logger.debug(f'# of test files: {len(test_files)}')
        self.logger.debug('distribution' + str(get_class_distribution(test_labels)))
        self.logger.info(f'AUC = {auc}')

        # get worst test info
        test_anomaly_idx = np.where(test_y_trues)[0]
        scores_anomaly = scores[test_anomaly_idx]
        worst_test_idxs = test_anomaly_idx[scores_anomaly.argsort()[:self.n_mosts]]
        worst_test_info = self.get_test_xx_most_info(worst_test_idxs,
                                                    raw_scores, self.ds['ref'], self.ds['test'])

        # visualize embeddings
        classes = sorted(list(set(test_labels)))
        classes = ['good'] + [l for l in classes if l != 'good']
        test_embs = get_embeddings(self.model, self.dl['test'], self.device, return_y=False)
        visualize_embeddings(title='Class embeddings distribution', embeddings=test_embs,
                            ys=[classes.index(label) for label in test_labels],
                            classes=classes)
        plt.show()

        # Best/Worst cases per class
        for cls in classes:
            test_mask = [label == cls for label in test_labels]
            test_idx = np.where(test_mask)[0]
            scores_cls = scores[test_mask]

            class_worst_test_idxs = test_idx[scores_cls.argsort()[:self.n_mosts]]
            worst_test_info = self.get_test_xx_most_info(class_worst_test_idxs,
                                                        raw_scores, self.ds['ref'], self.ds['test'])
            class_best_test_idxs  = test_idx[scores_cls.argsort()[::-1][:self.n_mosts]]
            best_test_info  = self.get_test_xx_most_info(class_best_test_idxs,
                                                        raw_scores, self.ds['ref'], self.ds['test'])
            if cls == 'good':
                worst_test_info, best_test_info = best_test_info, worst_test_info

            self.show_test_matching_images('Best: ' + cls, best_test_info)
            plt.show()

            self.show_test_matching_images('Worst: ' + cls, worst_test_info)
            plt.show()
Example 12
    def prepare_decoder(self, targets):
        """Prepares targets for transformer decoder."""
        shape = utils.shape_list(targets)
        # sequence should be [batch, seq_length]
        assert len(shape) == 2, 'Sequence tensors should be 2-dimensional'
        assert (len(self.hparams.query_shape) == 1
                ), 'query shape should be 1-dimensional'

        # Mask random positions
        if self.hparams.target_dropout:
            targets = tf.where(
                tf.random.uniform(shape) < self.hparams.target_dropout,
                tf.zeros_like(targets),
                targets,
            )
        # Shift positions
        targets = tf.expand_dims(targets, axis=-1)
        targets = utils.right_shift_blockwise_nd(targets,
                                                 self.hparams.query_shape)
        targets = tf.squeeze(targets, axis=-1)
        # Add token embeddings
        targets = utils.get_embeddings(
            targets=targets,
            hidden_size=self.hparams.embedding_dims,
            vocab_size=self.vocab_size,
        )
        if self.hparams.dropout:
            targets = tf.nn.dropout(targets, 1 - self.hparams.dropout)
        targets = tf.layers.dense(targets,
                                  self.hidden_size,
                                  activation=None,
                                  name='emb_dense')
        if self.hparams.add_timing_signal:
            targets += utils.get_timing_signal_1d(
                self.hparams.max_target_length, self.hidden_size)
        return targets
Example 13
def train(FLAGS):
    """
    Train our embeddings.
    """

    # Get data loaders
    print ("==> Reading and processing the data ... ", end="")
    train_loader, test_loader, num_unique_words, \
        num_unique_documents, word_to_idx = process_data(
            data_dir=FLAGS.data_dir,
            vocab_size=FLAGS.vocab_size,
            window_size=FLAGS.window_size,
            split_ratio=FLAGS.split_ratio,
            batch_size=FLAGS.batch_size,
            )
    print ("[COMPLETE]")

    # Load pretrained GloVe embeddings for our vocab
    embedding_dir = os.path.join(basedir, "../../../../embeddings/glove")
    embedding_dim = 100
    embeddings = get_embeddings(
        embedding_dir=embedding_dir,
        embedding_dim=embedding_dim,
        words=word_to_idx.keys(),
        )

    # Initialize model, criterion, loss
    print ("==> Initializing model components ... ", end="")
    model = MLP(
        D_in_words=num_unique_words,
        D_in_documents=num_unique_documents,
        embedding_dim=FLAGS.embedding_dim,
        num_hidden_units=FLAGS.num_hidden_units,
        window_size=FLAGS.window_size,
        embeddings=embeddings,
        )
    # Objective
    criterion = torch.nn.CrossEntropyLoss()
    # Optimizer
    # Only get the parameters with gradients (we freeze our GloVe embeddings)
    parameters = filter(lambda param: param.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=FLAGS.lr)
    print ("[COMPLETE]")

    # Train the model
    print ("==> Training the model ... [IN PROGRESS]")
    model = training_procedure(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        train_loader=train_loader,
        test_loader=test_loader,
        num_epochs=FLAGS.num_epochs,
        learning_rate=FLAGS.lr,
        decay_rate=FLAGS.decay_rate,
        max_grad_norm=FLAGS.max_grad_norm,
        log_every=FLAGS.log_every,
        )
    print ("\n[COMPLETE]")

    # Save the model
    print ("==> Saving the model ... [IN PROGRESS]")
    torch.save(model, os.path.join(basedir, FLAGS.data_dir, "model.pt"))
    print ("\n[COMPLETE]")
Example 14
def generate_emb():

    global all_data, best_embeddings, best_model, all_hyperparameters, curr_hyperparameters, best_hyperparameters, all_losses, device

    all_data = preprocess.read_graphs(str(config.DATASET_DIR /
                                          "edge_list.txt"))

    # Iterate through hyperparameter type (shuffled)
    shuffled_param_type = random.sample(list(all_hyperparameters.keys()),
                                        len(all_hyperparameters))
    for param_type in shuffled_param_type:

        # Iterate through hyperparameter values of the specified type (shuffled)
        shuffled_param_val = random.sample(
            all_hyperparameters[param_type],
            len(all_hyperparameters[param_type]))
        for param_val in shuffled_param_val:

            # Initiate current hyperparameter
            curr_hyperparameters[param_type] = param_val
            print(curr_hyperparameters)
            log_f.write(str(curr_hyperparameters) + "\n")

            # Set up
            model = mdl.TrainNet(all_data.x.shape[1],
                                 curr_hyperparameters['hidden'],
                                 curr_hyperparameters['output'],
                                 config.CONV.lower().split("_")[0],
                                 curr_hyperparameters['dropout']).to(device)
            optimizer = torch.optim.Adam(
                model.parameters(),
                lr=curr_hyperparameters['lr'],
                weight_decay=curr_hyperparameters['wd'])

            # Train model
            model.train()
            curr_losses = []
            for epoch in range(config.EPOCHS):
                loss = train(epoch, model, optimizer)
                curr_losses.append(loss)
            all_losses[";".join(
                [str(v) for v in curr_hyperparameters.values()])] = curr_losses

            # Set up for next hyperparameter
            curr_hyperparameters[param_type] = best_hyperparameters[param_type]

    print("Best Hyperparameters: ", best_hyperparameters)
    print("Optimization finished!")
    log_f.write("Best Hyperparameters: %s \n" % best_hyperparameters)

    # Save best embeddings
    device = torch.device('cpu')
    best_model = best_model.to(device)
    best_embeddings = utils.get_embeddings(best_model, all_data, device)

    # Test
    test(best_model)

    # Save best embeddings
    torch.save(best_embeddings,
               config.DATASET_DIR / (config.CONV.lower() + "_embeddings.pth"))
Example 15
    'RBR': 39,
    'POS': 40,
    'PDT': 41,
    'UH': 42,
    'WP': 43,
    'JJ|NN': 44,
    'AFX': 45
}

dl = Data_Load('train', pipeline, dep_type2id, pos_tag2id)
matrix, word2id, id2word, X_dummy, Y, files_len_list = dl.get_X_Y(int(doc_num))

#matrix=torch.tensor(matrix,dtype=torch.long)
#X_dummy=torch.tensor(X_dummy,dtype=torch.float)
print("load embedding...")
embeddings_matrix = utils.get_embeddings(
    "gensim_glove_vectors" + embed_dim + "d.txt", word2id, int(embed_dim))
print("finally finish loading!!!")

START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = embeddings_matrix.shape[1] * matrix.shape[1] + X_dummy.shape[1]
HIDDEN_DIM = 400

labels = list(set(Y))
not_removed_label = list(set(Y))
print(labels)
tag_to_ix = dict(zip(labels, range(len(set(Y)))))
id_to_tag = dict(zip(range(len(set(Y))), labels))
tag_to_ix[START_TAG] = len(tag_to_ix)
tag_to_ix[STOP_TAG] = len(tag_to_ix)
    def get_X_Y(self, max_num=10000):
        files = os.listdir(self.tag_path)
        X = []
        X_catg = []
        X_pos = []
        X_pre = []
        Y = []
        dep_type2id = {}
        pos_tag2id = {}
        i = 0
        #files=['protocol_615.txt']
        for f in files:
            if i == max_num:
                break
            i += 1
            print(f)
            no_affix_f = f[:f.index(".txt")]
            sent_pos_list = get_sent_pos_list(self.tag_path + f)
            line_list = get_line_list(self.original_path + f)
            ngram_lemmas_syn_list = get_ngram_lemmas_syn(
                sent_pos_list, self.nlp)

            label_list = get_labels(self.original_path + no_affix_f + ".ann")
            doc_list = get_line_list(self.original_path + f)

            len_list = [0] * len(doc_list)

            for j in range(1, len(doc_list)):
                len_list[j] = len_list[j - 1] + len(doc_list[j - 1])
            len_list.extend([1000000])
            len_label_list = [[] for i in range(len(doc_list))]
            one_len = []
            flag = 1

            for entityidx, info, words in label_list:
                #print(info)
                info = info.split(";")[0]
                clas, start, end = info.split(" ")
                slot, num = first(len_list, lambda x: int(start) < x)
                len_label_list[slot - 1].append((words, clas))
                """if int(start)<len_list[flag]:
                    one_len.append((words,clas))
                else:
                    flag+=1
                    len_label_list.append(one_len)
                    one_len=[(words,clas)]
                if len(len_label_list)<len(doc_list):
                    len_label_list.append([])    """

            for idx, sent_list in enumerate(ngram_lemmas_syn_list):
                #['<START>', '<PAD>', '<PAD>', '<PAD>'], ['SpinSmart', 'NNP', 'SpinSmart'], ['Plasmid', 'plasmid_DNA', '<PAD>', '<PAD>'], ['<PAD>', '<PAD>'], ['Plasmid', 'compound']), (['SpinSmart', '<PAD>', '<PAD>', '<PAD>'], ['Plasmid', 'NNP', 'Plasmid'], ['Purification', 'refining', 'refinement', 'purgation'], ['compound', 'SpinSmart'], ['Purification', 'compound']
                for word_feat in sent_list:
                    if word_feat[3][0] not in dep_type2id.keys():
                        dep_type2id[word_feat[3][0]] = len(dep_type2id)
                    if word_feat[4][1] not in dep_type2id.keys():
                        dep_type2id[word_feat[4][1]] = len(dep_type2id)
                    if word_feat[1][1] not in pos_tag2id.keys():
                        pos_tag2id[word_feat[1][1]] = len(pos_tag2id)

                    if word_feat[1][0] == 'a':
                        Y.append('o')
                    else:
                        Y.append(
                            first(len_label_list[idx],
                                  lambda x: x[0].find(word_feat[1][0]) >= 0)[1]
                            [1])

                    x_feat = []
                    x_feat.extend(word_feat[0])
                    x_feat.extend([word_feat[1][0], word_feat[1][2]])
                    x_feat.extend(word_feat[2])
                    x_feat.extend([word_feat[3][1]])
                    x_feat.extend([word_feat[4][0]])

                    x_feat.extend([word_feat[1][1]])
                    x_feat.extend([word_feat[3][0]])
                    x_feat.extend([word_feat[4][1]])

                    X_pre.append(x_feat)

        for x_feat in X_pre:
            cat_g = []
            deprel = [0] * len(dep_type2id)
            deprel[dep_type2id[x_feat[-2]]] = 1
            govrel = [0] * len(dep_type2id)
            govrel[dep_type2id[x_feat[-1]]] = 1
            cat_g.extend(deprel)
            cat_g.extend(govrel)

            pos_feat = [0] * len(pos_tag2id)
            pos_feat[pos_tag2id[x_feat[-3]]] = 1

            x_feat = x_feat[:-3]

            X.append(x_feat)
            X_catg.append(cat_g)
            X_pos.append(pos_feat)

        X_catg = np.array(X_catg)

        X_pos = np.array(X_pos)

        vocab = get_word_vocab.Vocab(X)
        word2id = vocab.get_word2id()
        matrix = vocab.get_matrix()

        print("load embedding...")
        embeddings_matrix = get_embeddings(self.glove_path, word2id,
                                           self.emb_dim)
        print("finally finish loading!!!")
        embedding = nn.Embedding(embeddings_matrix.shape[0],
                                 embeddings_matrix.shape[1])
        embedding.weight = nn.Parameter(embeddings_matrix)
        embedding.weight.requires_grad = False
        embedding_X = np.array(embedding(torch.LongTensor(matrix)))
        embedding_X = embedding_X.reshape(
            embedding_X.shape[0], embedding_X.shape[1] * embedding_X.shape[2])
        #print(embedding_X.shape)

        final_X = np.concatenate((embedding_X, X_catg, X_pos), axis=1)
        print(final_X.shape)

        Y = np.array(Y)
        print(Y.shape)
        return final_X, Y
def main():
    print("Starting x-ray fine-tuning script at %s" % (str(datetime.datetime.now())))

    assert args.split >= 0 and args.split <= 9, "Split number can only be in [0,9]."
    split_idx = args.split

    ## Ensure output directory exists
    os.makedirs(args.output_dir, exist_ok=True)

    #-------------------
    # Load data, setup device
    #-------------------
    all_embeddings = utils.get_embeddings("covidx", args.mask, args.model)
    all_task_labels = utils.get_task_labels("covidx")
    all_domain_labels = utils.get_domain_labels("covidx")


    #-------------------
    # Generate splits with same random numbers
    #-------------------
    np.random.seed(args.repetition)
    torch.manual_seed(args.repetition)

    train_splits = []
    val_splits = []
    test_splits = []

    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=args.repetition)
    for i, (train_index, test_index) in enumerate(kf.split(all_embeddings, all_domain_labels)):
        
        train_index, val_index = train_test_split(train_index, test_size=0.1, stratify=all_domain_labels[train_index], random_state=args.repetition)

        train_splits.append(train_index)
        val_splits.append(val_index)
        test_splits.append(test_index)


    for i in range(len(train_splits)):
        assert len(np.unique(all_task_labels[train_splits[i]])) == 3
        assert len(np.unique(all_task_labels[val_splits[i]])) == 3
        assert len(np.unique(all_task_labels[test_splits[i]])) == 3

        assert len(np.unique(all_domain_labels[train_splits[i]])) == 5
        assert len(np.unique(all_domain_labels[val_splits[i]])) == 5
        assert len(np.unique(all_domain_labels[test_splits[i]])) == 5

    #-------------------
    # Setup datasets/dataloaders
    #-------------------
    scaler = StandardScaler()
    scaler = scaler.fit(all_embeddings[train_splits[split_idx]])
    all_embeddings = scaler.transform(all_embeddings)

    train_dataset = training_utils.EmbeddingMultiTaskDataset(
        all_embeddings[train_splits[split_idx]],
        [all_task_labels[train_splits[split_idx]], all_domain_labels[train_splits[split_idx]]],
    )
    val_dataset = training_utils.EmbeddingMultiTaskDataset(
        all_embeddings[val_splits[split_idx]],
        [all_task_labels[val_splits[split_idx]], all_domain_labels[val_splits[split_idx]]],
    )
    test_dataset = training_utils.EmbeddingMultiTaskDataset(
        all_embeddings[test_splits[split_idx]],
        [all_task_labels[test_splits[split_idx]], all_domain_labels[test_splits[split_idx]]]
    )


    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        shuffle=True,
        batch_size=args.batch_size,
        num_workers=args.num_dataloader_workers,
        pin_memory=True,
    )
    train_unshuffled_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        shuffle=False,
        batch_size=args.batch_size,
        num_workers=args.num_dataloader_workers,
        pin_memory=True,
    )
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=args.batch_size,
        num_workers=1,
        pin_memory=True,
    )
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=args.batch_size,
        num_workers=1,
        pin_memory=True,
    )


    #-------------------
    # Training
    #-------------------
    mlp = training_utils.MultiTaskMLP(utils.get_model_embedding_sizes(args.model), args.hidden_layer_size, [3,5])
    mlp = mlp.to(device)
    optimizer = optim.AdamW(mlp.parameters(), lr=1e-3, amsgrad=True)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, "min")

    criterions = [
        nn.CrossEntropyLoss(), # task loss
    ]
    if args.use_feature_disentanglement:
        criterions.append(nn.CrossEntropyLoss()) # domain loss

    # Losses
    training_task_losses = []
    training_domain_losses = []
    validation_task_losses = []
    validation_domain_losses = []

    # AUCS
    validation_task_aucs = []
    validation_domain_aucs = []
    validation_task_accs = []
    validation_domain_accs = []

    # Other
    model_checkpoints = []

    num_times_lr_dropped = 0
    for epoch in range(args.num_epochs):
        lr = training_utils.get_lr(optimizer)
        
        task_lr = args.lr_start / (1+((args.lr_start-1)*(epoch/(args.num_epochs-1))**args.lr_exponent))
        
        training_losses = training_utils.fit(
            mlp,
            device,
            train_dataloader,
            optimizer,
            criterions,
            epoch,
            task_lr_multiplier=task_lr
        )
        
        validation_losses = training_utils.evaluate(
            mlp,
            device,
            val_dataloader,
            criterions,
            epoch
        )
        
        ## Record training/validation metrics
        training_task_losses.append(training_losses[0])
        validation_task_losses.append(validation_losses[0])
        if len(training_losses) > 1:
            training_domain_losses.append(training_losses[1])
            validation_domain_losses.append(validation_losses[1])

        ## Embed the entire training and validation sets
        train_set_embedding = training_utils.embed(mlp, device, train_unshuffled_dataloader)
        val_set_embedding = training_utils.embed(mlp, device, val_dataloader)
        
        ## Fit near optimal LR model on task labels
        task_lr_model = LogisticRegression(C=0.001, max_iter=20, random_state=args.repetition)
        task_lr_model.fit(train_set_embedding, all_task_labels[train_splits[split_idx]])
        y_pred_proba = task_lr_model.predict_proba(val_set_embedding)
        validation_task_auc = roc_auc_score(all_task_labels[val_splits[split_idx]], y_pred_proba, average="macro", multi_class="ovr")
        validation_task_acc = np.mean(utils.get_per_class_accuracies(all_task_labels[val_splits[split_idx]], y_pred_proba.argmax(axis=1)))
        print("Learned val task AUC:", validation_task_auc)
        validation_task_aucs.append(validation_task_auc)
        validation_task_accs.append(validation_task_acc)
        
        ## Fit near optimal LR model on domain labels 
        domain_lr_model = LogisticRegression(C=0.001, max_iter=20, random_state=args.repetition)
        domain_lr_model.fit(train_set_embedding, all_domain_labels[train_splits[split_idx]])
        y_pred_proba = domain_lr_model.predict_proba(val_set_embedding)
        validation_domain_auc = roc_auc_score(all_domain_labels[val_splits[split_idx]], y_pred_proba, average="macro", multi_class="ovr")
        validation_domain_acc = np.mean(utils.get_per_class_accuracies(all_domain_labels[val_splits[split_idx]], y_pred_proba.argmax(axis=1)))
        print("Learned val dataset AUC:", validation_domain_auc)
        validation_domain_aucs.append(validation_domain_auc)
        validation_domain_accs.append(validation_domain_acc)
        
        ## Copy near optimal LR model to model
        mlp.heads[1].weight.data = torch.from_numpy(domain_lr_model.coef_.astype(np.float32)).to(device)
        mlp.heads[1].bias.data = torch.from_numpy(domain_lr_model.intercept_.astype(np.float32)).to(device)
        
        model_checkpoints.append(copy.deepcopy(mlp.state_dict()))

        ## Early stopping
        scheduler.step(validation_losses[0])
        if training_utils.get_lr(optimizer) < lr:
            num_times_lr_dropped += 1
            print("")
            print("Learning rate dropped")
            print("")
        
        if num_times_lr_dropped == 3:
            break


    #-------------------
    # Testing
    #-------------------

    # Select best model
    if args.use_feature_disentanglement:
        best_model_checkpoint = model_checkpoints[np.argmin(validation_domain_aucs)]
    else:
        best_model_checkpoint = model_checkpoints[np.argmax(validation_task_aucs)]
    mlp.load_state_dict(best_model_checkpoint)


    # Evaluate on test tests
    y_pred_proba = training_utils.score(mlp, device, test_dataloader, 0)
    test_task_auc = roc_auc_score(all_task_labels[test_splits[split_idx]], y_pred_proba, average="macro", multi_class="ovr")
    test_task_acc = np.mean(utils.get_per_class_accuracies(all_task_labels[test_splits[split_idx]], y_pred_proba.argmax(axis=1)))
    print("Test task AUC:", test_task_auc)
    print("Test task ACC:", test_task_acc)
    print("")

    y_pred_proba = training_utils.score(mlp, device, test_dataloader, 1)
    test_domain_auc = roc_auc_score(all_domain_labels[test_splits[split_idx]], y_pred_proba, average="macro", multi_class="ovr")
    test_domain_acc = np.mean(utils.get_per_class_accuracies(all_domain_labels[test_splits[split_idx]], y_pred_proba.argmax(axis=1)))
    print("Test domain AUC:", test_domain_auc)
    print("Test domain ACC:", test_domain_acc)
    print("")


    # Save embeddings if we want to make UMAPs
    if args.save_embeddings:
        train_embedding = training_utils.embed(mlp, device, train_unshuffled_dataloader)
        val_embedding = training_utils.embed(mlp, device, val_dataloader)
        test_embedding = training_utils.embed(mlp, device, test_dataloader)

        all_embeddings = np.concatenate([
            train_embedding,
            val_embedding,
            test_embedding
        ], axis=0)


    #-------------------
    # Save everything
    #-------------------
    save_obj = {
        'args': args,
        'training_task_losses': training_task_losses,
        'training_domain_losses': training_domain_losses,
        'validation_task_losses': validation_task_losses,
        'validation_domain_losses': validation_domain_losses,
        'validation_task_aucs': validation_task_aucs,
        'validation_task_accs': validation_task_accs,
        'validation_domain_aucs':validation_domain_aucs,
        'validation_domain_accs': validation_domain_accs,
        "test_task_auc": test_task_auc,
        "test_task_acc": test_task_acc,
        "test_domain_auc": test_domain_auc,
        "test_domain_acc": test_domain_acc,
        "checkpoints": model_checkpoints
    }
    save_obj_fn = "covidx_%s_%s_%s_split-%d_%d_lr-%0.1f_hls-%d.pkl" % (args.mask, args.model, "disentangle" if args.use_feature_disentanglement else "no-disentangle",  args.split, args.repetition, args.lr_start, args.hidden_layer_size)
    with open(os.path.join(args.output_dir, save_obj_fn), 'wb') as f:
        pickle.dump(save_obj, f)

    if args.save_embeddings:
        save_embedding_fn = "covidx_%s_%s_%s_split-%d_%d_lr-%0.1f_hls-%d.npy" % (args.mask, args.model, "disentangle" if args.use_feature_disentanglement else "no-disentangle",  args.split, args.repetition, args.lr_start, args.hidden_layer_size)
        np.save(os.path.join(args.output_dir, save_embedding_fn), all_embeddings)
Example 18
def encode_mean_embeddings(string, embedding_map):
    embeddings = utils.get_embeddings(string, embedding_map)
    encoded = np.mean(embeddings, axis=0)
    return Variable(torch.FloatTensor(encoded))
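encode_mean_embeddings collapses a token sequence into one fixed-size vector by averaging; assuming embedding_map maps tokens to equal-length vectors, usage looks like:

sentence_vec = encode_mean_embeddings("hello world", embedding_map)
print(sentence_vec.shape)  # torch.Size([embedding_dim])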
Example 19

            y_test, y_pred_proba.argmax(axis=1), number_classes=number_of_classes
        ))
        aucs.append(utils.get_binary_aucs(y_test, y_pred_proba, number_classes=number_of_classes))
        
    accs = np.array(accs)
    aucs = np.array(aucs)
    average_aucs = np.mean(aucs, axis=1)
    average_accs = np.mean(accs, axis=1)
    results = {
        "average auc": (np.mean(average_aucs), np.std(average_aucs)),
        "average acc": (np.mean(average_accs), np.std(average_accs)),
    }
    return results


models = ["xrv", "histogram", "densenet", "covidnet"]
masks = ["masked", "unmasked"]

print("Linear model performance discriminating between sub-datasets in the COVIDx dataset from pre-trained embeddings")
print("")
print("Masking method,Feature extractor model,Average AUC,Average ACC")
for mask in masks:
    for model in models:
        embeddings = utils.get_embeddings("covidx", mask, model)
        labels = utils.get_domain_labels("covidx")
        results = do_experiment(embeddings, labels, number_of_classes=5)
        print("%s,%s,%0.2f +/- %0.2f,%0.2f +/- %0.2f" % (
            mask, model,
            results["average auc"][0], results["average auc"][1],
            results["average acc"][0], results["average acc"][1],
        ))
Example 20
def main():
    print("Starting x-ray inference script at %s" %
          (str(datetime.datetime.now())))

    # Parse out model arguments from the model filename -- we guarantee that our saved models will look like this
    assert os.path.exists(args.model_fn), "Model file does not exist"
    model_parts = parse(
        "covidx_{mask}_{model}_{disentangle}_split-{split}_{repetition}_lr-50.0_hls-64.pkl",
        os.path.basename(args.model_fn))
    masked = model_parts["mask"] == "masked"
    base_model = model_parts["model"]
    assert base_model in ["xrv", "densenet"]
    disentangled = model_parts["disentangle"] == "disentangle"

    metadata_df = pd.read_csv(args.input_fn)
    if masked:
        original_fns = metadata_df["masked_image_path"].values
    else:
        original_fns = metadata_df["unmasked_image_path"].values
    num_samples = original_fns.shape[0]

    ## Ensure all input files exist
    for fn in original_fns:
        assert os.path.exists(fn), "Input doesn't exist: %s" % (fn)
        file_extension = fn.split(".")[-1]
        assert file_extension.lower() in ALLOWED_FILENAMES, \
            "Input does not have a correct file extension: %s" % (file_extension)

    ## Ensure output directory exists
    output_dir = os.path.dirname(args.output_fn)
    if os.path.exists(output_dir):
        if os.path.exists(args.output_fn):
            if not args.overwrite:
                print("WARNING: The output file exists, exiting...")
                return
    else:
        os.makedirs(output_dir, exist_ok=True)

    ## Embed images with whatever the base model is
    tic = float(time.time())
    images = utils.get_images(
        original_fns
    )  # these will be the masked versions if we are using a masked model
    print("Finished loading images in %0.4f seconds" % (time.time() - tic))

    tic = float(time.time())
    if masked:
        images = utils.transform_to_equalized(images)

    if base_model == "xrv":
        images = utils.transform_to_xrv(images)
        xrv_model = utils.get_xrv_model(device)

        embeddings = utils.run_densenet_model(xrv_model,
                                              device,
                                              images,
                                              global_max_pool=False,
                                              embedding_size=1024,
                                              batch_size=64)
    elif base_model == "densenet":
        images = utils.transform_to_standardized(images)
        densenet_model = utils.get_densenet121(device)

        embeddings = utils.run_densenet_model(densenet_model,
                                              device,
                                              images,
                                              global_max_pool=False,
                                              embedding_size=1024,
                                              batch_size=64)
    else:
        raise ValueError("Not implemented yet")

    ## Adjusting for normalization
    repetition = int(model_parts["repetition"])
    split_idx = int(model_parts["split"])
    all_embeddings = utils.get_embeddings("covidx", model_parts["mask"],
                                          base_model)
    all_task_labels = utils.get_task_labels("covidx")
    all_domain_labels = utils.get_domain_labels("covidx")
    np.random.seed(repetition)
    torch.manual_seed(repetition)
    train_splits = []
    val_splits = []
    test_splits = []
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=repetition)
    for i, (train_index, test_index) in enumerate(
            kf.split(all_embeddings, all_domain_labels)):
        train_index, val_index = train_test_split(
            train_index,
            test_size=0.1,
            stratify=all_domain_labels[train_index],
            random_state=repetition)
        train_splits.append(train_index)
        val_splits.append(val_index)
        test_splits.append(test_index)
    scaler = StandardScaler()
    scaler = scaler.fit(all_embeddings[train_splits[split_idx]])
    embeddings = scaler.transform(embeddings)

    test_dataset = training_utils.EmbeddingMultiTaskDataset(
        embeddings, [np.zeros(embeddings.shape[0])])
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=64,
        num_workers=1,
        pin_memory=True,
    )
    print("Finished embedding images in %0.4f seconds" % (time.time() - tic))

    ## Run embeddings through the saved model
    tic = float(time.time())
    with open(args.model_fn, "rb") as f:
        saved_model_params = pickle.load(f)

    mlp = training_utils.MultiTaskMLP(
        utils.get_model_embedding_sizes(base_model),
        saved_model_params["args"].hidden_layer_size, [3, 5])
    mlp = mlp.to(device)

    if disentangled:
        best_model_checkpoint = saved_model_params["checkpoints"][np.argmin(
            saved_model_params["validation_domain_aucs"])]
    else:
        best_model_checkpoint = saved_model_params["checkpoints"][np.argmax(
            saved_model_params["validation_task_aucs"])]
    mlp.load_state_dict(best_model_checkpoint)

    test_set_pred_proba = training_utils.score(mlp, device, test_dataloader, 0)
    test_set_embedding = training_utils.embed(mlp, device, test_dataloader)
    print("Finished loading/running saved model in %0.4f seconds" %
          (time.time() - tic))

    # save output
    output_fn = args.output_fn
    if not output_fn.endswith(".npy"):
        output_fn += ".npy"

    np.save(output_fn, test_set_pred_proba)

    if args.save_embeddings:
        output_fn = output_fn.replace(".npy", "_embeddings.npy")
        np.save(output_fn, test_set_embedding)
Example 21
            num_const += 1
        else:
            # var = tf.Variable(init(shape=[dim]))
            # embed_list.append(var)
            embed_list.append(embed_list[2])
            num_vars += 1
    print(num_const, num_vars)
    return tf.stack(embed_list, axis=0)


if __name__ == '__main__':
    assert sys.argv[1] in models.keys()
    print('using model', sys.argv[1])

    print('loading data')
    start = time()
    trainset, dev, test, vocab = utils.get_datasets(batch_size=BATCH_SIZE,
                                                    num_words=VOCAB_SIZE,
                                                    seq_len=SEQ_LEN)
    print('took', time() - start, 'seconds')
    start = time()
    print('getting embeddings')
    embeddings = utils.get_embeddings(vocab, './glove.6B/glove.6B.300d.txt')
    print('took', time() - start, 'seconds')
    print('initializing embeddings')
    start = time()
    embeddings = init_embeddings(embeddings, vocab, 300)
    print('took', time() - start, 'seconds')
    print('begin training')
    train(vocab, embeddings, trainset, dev, test)
Example 22
    def build_model(self, word_index, use_skipgram=True):
        """Return a compiled Keras model for sentence classification.

        Parameters
        ----------
        word_index : List of tokens in input data
        use_skipgram : Boolean, whether to use fasttext skipgram word vectors.
            If false, cbow model word vectors will be used instead.

        Returns
        -------
        model : A compiled Keras model for predicting six types of toxicity
            in a sentence.
        attention_layer_model : A Keras model for extracting the attention
            layer output.

        """
        # conv_filters_1 = 64
        # conv_filters_2 = 128
        gru_units = [96, 96, 96]
        dense_units = [64]

        dropout_prob = 0.3

        model_input = Input(shape=(self.num_timesteps, ), name='model_input')
        embedding_matrix = get_embeddings(word_index=word_index,
                                          embedding_dim=self.embedding_dim,
                                          use_ft_embeddings=self.use_ft,
                                          use_skipgram=use_skipgram)
        x = Embedding(
            len(word_index) + 1,  # +1 for 0 padding token
            self.embedding_dim,
            weights=[embedding_matrix],
            input_length=self.num_timesteps,
            trainable=False)(model_input)
        '''
        x = Conv1D(filters=conv_filters_1,
                   kernel_size=3,
                   padding='same',
                   activation='elu')(x)
        x = BatchNormalization()(x)
        x = SpatialDropout1D(0.3)(x)

        x = Conv1D(filters=conv_filters_2,
                   kernel_size=3,
                   padding='same',
                   activation='elu')(x)
        x = BatchNormalization()(x)
        x = SpatialDropout1D(0.3)(x)
        '''

        for n in range(len(gru_units)):
            x = SpatialDropout1D(dropout_prob)(x)
            x = Bidirectional(
                CuDNNGRU(units=gru_units[n], return_sequences=True))(x)
            x = BatchNormalization()(x)
            x = TimeDistributed(Activation('tanh'))(x)

        x = SpatialDropout1D(dropout_prob)(x)
        attention = self._attention_3d_block(inputs=x)
        dense_input = GlobalMaxPool1D()(attention)

        if self.use_aux_input:
            aux_input = Input(shape=(3, ), name='aux_input')
            dense_input = concatenate([dense_input, aux_input])

        for n in range(len(dense_units)):
            dense = Dropout(dropout_prob)(dense_input)
            dense = Dense(dense_units[n], activation=None)(dense)
            dense = BatchNormalization()(dense)
            dense = Activation('elu')(dense)

        dense = Dropout(dropout_prob)(dense)
        probs = Dense(6, activation='sigmoid')(dense)

        if self.use_aux_input:
            self.model = Model(inputs=[model_input, aux_input], outputs=probs)
        else:
            self.model = Model(inputs=model_input, outputs=probs)

        self.model.compile(loss='binary_crossentropy',
                           optimizer='rmsprop',
                           metrics=['accuracy'])
        print(self.last_attention_layer_name)
        self.attention_layer_model = Model(
            inputs=self.model.input,
            outputs=self.model.get_layer(
                self.last_attention_layer_name).output)
Example 23
def architecture(
    params, 
    mode, 
    context,
    context_len,
    utterance,
    utterance_len,
    ):
    """Return the output operation following the network architecture.
    Returns:
         Logits output Op for the network.
    """

    # Initialize embeddings randomly or with pre-trained vectors if available
    embeddings_W = get_embeddings(params)

    # Embed the context and the utterance
    context_embedded = tf.nn.embedding_lookup(
        embeddings_W, context, name="embed_context")
  
    if mode != PREDICT:
        utterance_embedded = tf.nn.embedding_lookup(
            embeddings_W, utterance, name="embed_utterance")
    with tf.variable_scope("rnn") as vs:
        # We use an LSTM Cell
        #cell = tf.nn.rnn_cell.LSTMCell(
        #             params.rnn_dim,
        #             forget_bias=2.0,
        #             use_peepholes=True)

        rnn_dims = params.rnn_dim.split(',')
        cell = [ tf.nn.rnn_cell.LSTMCell(
                     int(rnn_dim),
                     forget_bias=2.0,
                     use_peepholes=True) for rnn_dim in rnn_dims]
        cell = tf.nn.rnn_cell.MultiRNNCell(cell) 
        cell = tf.nn.rnn_cell.DropoutWrapper(
            cell,
            input_keep_prob=FLAGS.input_keep_prob,
            output_keep_prob=FLAGS.output_keep_prob,
            state_keep_prob=FLAGS.state_keep_prob)
        # Run the utterance and context through the RNN
        if mode != PREDICT:
            tmp_concat = tf.concat([context_embedded, utterance_embedded],0)
            tmp_concat_len = tf.concat([context_len, utterance_len],0)
        else:
            tmp_concat = context_embedded
            tmp_concat_len = context_len
        rnn_outputs, rnn_states = tf.nn.dynamic_rnn(
            cell,
            tmp_concat,
            tmp_concat_len,
            dtype=tf.float32)
        if isinstance(rnn_states,list) or isinstance(rnn_states,tuple):
            rnn_states = rnn_states[0]
        #context_embedded:  Tensor("embed_context:0", shape=(64, 160, 300), dtype=float32)
        #utterance_embedded:  Tensor("embed_utterance:0", shape=(64, 160, 300), dtype=float32)
        #tf.concat([context_embedded, utterance_embedded]:  Tensor("concat:0", shape=(128, 160, 300), dtype=float32)
        #tf.concat([context_len, utterance_len]:  Tensor("concat_1:0", shape=(128,), dtype=int64)
        if mode != PREDICT:
            encoding_context, encoding_utterance = tf.split(rnn_states.h,2,0)
        else:
            encoding_context = rnn_states.h
  
    with tf.variable_scope("prediction") as vs:
        M = tf.get_variable("M",
          shape=[FLAGS.last_rnn_dim, FLAGS.last_rnn_dim],
          initializer=tf.truncated_normal_initializer())
    
        # "Predict" a  response: c * M
        generated_response = tf.matmul(encoding_context, M)
        print('matmul_weight: ',params.matmul_weight)
        if mode == PREDICT and not params.matmul_weight:
            return generated_response
        generated_response = tf.expand_dims(generated_response, 2)
        if mode == PREDICT and params.matmul_weight:
            return generated_response
        encoding_utterance = tf.expand_dims(encoding_utterance, 2)
    
        # Dot product between generated response and actual response
        # (c * M) * r
        #logits = tf.batch_matmul(generated_response, encoding_utterance, True)
        logits = tf.matmul(generated_response, encoding_utterance, True)
        logits = tf.squeeze(logits, [2])
        return logits
Example 24
    def get_embeddings(title, body):
        return utils.get_embeddings(title), utils.get_embeddings(body)
Example 25
def run(config, model_name):
    config = load_yaml(config)
    if model_name not in config['model']:
        raise NotImplementedError("{} is not implemented. ".format(model_name))
    preprocessing_params = config['preprocessing']
    training_params = config['training']
    model_params = config['model'][model_name]
    train_df = pd.read_csv(preprocessing_params['train_path'], sep='\t')
    test_df = pd.read_csv(preprocessing_params['test_path'], sep='\t')
    t_list = preprocessing_params['target_list']
    model_params['targets'] = len(t_list)

    train_df['tokens'] = train_df['Tweet'].map(lambda x: tokenize(x))
    test_df['tokens'] = test_df['Tweet'].map(lambda x: tokenize(x))
    train_df['lengths'] = train_df['tokens'].map(lambda x: len(x))
    test_df['lengths'] = test_df['tokens'].map(lambda x: len(x))

    word_freq_dict = create_freq_vocabulary(
        list(train_df['tokens']) + list(test_df['tokens']))

    tokens = get_top_freq_words(word_freq_dict, 1)

    train_df = train_df.sort_values(by="lengths")
    test_df = test_df.sort_values(by="lengths")
    embeddings = get_embeddings(path=preprocessing_params['embeddings_path'])
    w2i = create_final_dictionary(tokens,
                                  embeddings,
                                  unk_token=preprocessing_params['unk_token'],
                                  pad_token=preprocessing_params['pad_token'])
    emb_matrix = get_embeddings_matrix(w2i, embeddings,
                                       preprocessing_params['embedding_size'])

    model_params['embeddings'] = emb_matrix

    train_batches = create_batches(train_df,
                                   training_params['batch_size'],
                                   w2i=w2i,
                                   pad_token=preprocessing_params['pad_token'],
                                   unk_token=preprocessing_params['unk_token'],
                                   target_list=t_list)
    test_batches = create_batches(test_df,
                                  training_params['batch_size'],
                                  w2i=w2i,
                                  pad_token=preprocessing_params['pad_token'],
                                  unk_token=preprocessing_params['unk_token'],
                                  target_list=t_list)

    model = ModelFactory.get_model(model_name, model_params)
    optimizer = Adam(model.trainable_weights, training_params['lr'])
    criterion = BCEWithLogitsLoss()
    train(model,
          train_batches,
          test_batches,
          optimizer,
          criterion,
          epochs=training_params['epochs'],
          init_patience=training_params['patience'],
          cuda=False,
          target_list=t_list)
    model = load_model(model)
    full_classification_report(model, test_batches, t_list)
Example 27
def run_validation(model,
                   settings,
                   image_size=128,
                   visual=None,
                   visual_location=None):
    """Run validation sequence on model.

  Args:
    model: Model on which to perform the validation.
    settings: Settings object providing the image/label files, validation
      sequences, detector flag, and re-id parameters.
    image_size: Size of the bounding boxes after resize.
    visual: Visualize the frames ('re-id' or 'detect'), or None to disable.
    visual_location: Directory in which visualized frames are saved.
  """
    mot_metric = MOTMetric(auto_id=True)

    # Get the label file.
    with open(settings.labels_file, 'rb') as file:
        labels_dict = pickle.load(file)

    # Open the validation sequence.
    with h5py.File(settings.images_file, 'r') as sequence:
        # Loop over every validation sequence
        for seq in settings.sequences_val:
            # Create embedding database.
            embeds_database = EmbeddingsDatabase(settings.memory_length,
                                                 settings.memory_update)

            # Loop over every frame in the current sequence
            for i, frame in enumerate(sequence['seq' + str(seq)]):
                # Get the ground truth labels for the current frame
                gt_labels = labels_dict['seq' + str(seq)]['frame' + str(i)]

                obj_ids, obj_bbs = [], []
                for label in gt_labels.values():
                    obj_ids.append(label['track_id'])
                    obj_bbs.append([
                        label['left'], label['top'], label['right'],
                        label['bottom']
                    ])

                # Get the embeddings and bounding boxes by running the model
                if settings.detector:
                    embeddings, boxes, labels, probs = model(frame)
                    hyp_bbs = np.array(boxes, dtype=int)
                else:
                    embeddings = get_embeddings(model, frame, gt_labels)
                    hyp_bbs = obj_bbs.copy()

                # Perform the re-identification
                hyp_ids = embeds_database.match_embeddings(
                    embeddings, settings.max_distance)

                # Update the MOT metric.
                mot_metric.update(obj_ids, hyp_ids, np.array(obj_bbs.copy()),
                                  np.array(
                                      hyp_bbs.copy()))  # << CHANGE THIS BACK!

                if visual == 're-id':
                    # Visualize the frame with bounding boxes and ids.
                    show_frame_with_ids(frame,
                                        hyp_bbs.copy(),
                                        hyp_ids,
                                        frame_num=i,
                                        seq_name='seq{}'.format(str(seq)),
                                        visual_location=visual_location)
                elif visual == 'detect':
                    show_frame_with_labels(frame, boxes, labels, probs)

            # Create gif.
            if visual == 're-id':
                # scene_labels = sorted(np.array(os.listdir(scene_label_dir)))
                loc = '{}/seq{}'.format(visual_location, str(seq))
                images = []
                filenames = sorted(np.array(os.listdir(loc)))

                for i in range(len(filenames)):
                    filenames[i] = re.findall(r'\d+', filenames[i])[0]

                filenames = np.array(filenames, dtype=int)
                filenames = sorted(filenames)

                for i in range(len(filenames)):
                    filenames[i] = loc + '/frame' + str(filenames[i]) + '.jpg'

                for filename in filenames:
                    images.append(imageio.imread(filename))
                imageio.mimsave(loc + 'movie.gif', images, duration=0.10)

            if settings.print_embed_avg:
                print('Average embedding cost sequence {}: {:.3f}'.format(
                    str(seq), embeds_database.get_average_cost()))

        # Return the MOT metric object
        return mot_metric
Example 28
    print("Nb test queries: {}".format(len(q_test)))

    # Load gold hypernyms (train and dev only)
    print("Loading gold hypernyms...")
    path_h_train = "{}/training/gold/{}.training.gold.txt".format(
        args.dir_datasets, dataset_name_exp)
    path_h_dev = "{}/trial/gold/{}.trial.gold.txt".format(
        args.dir_datasets, dataset_name_exp)
    h_train = utils.load_hypernyms(path_h_train, normalize=True)
    h_dev = utils.load_hypernyms(path_h_dev, normalize=True)
    print("Nb training pairs: {}".format(sum(len(x) for x in h_train)))
    print("Nb dev pairs: {}".format(sum(len(x) for x in h_dev)))

    # Load word embeddings
    print("Loading pre-trained word embeddings...")
    embed_vocab_list, word2vec = utils.get_embeddings(args.path_embeddings,
                                                      np.float32)
    embed_vocab_set = set(embed_vocab_list)
    print("Nb embeddings: {}".format(len(embed_vocab_list)))

    # Check for candidates that don't have a pre-trained embedding
    print("Checking for candidates that don't have a pre-trained embedding...")
    oov_candidates = set(c for c in candidates if c not in embed_vocab_set)
    print("Nb candidates without a pre-trained embedding: {}".format(
        len(oov_candidates)))
    if len(oov_candidates):
        print("WARNING: {} candidates will be assigned a random embedding.".
              format(len(oov_candidates)))

    # Check for queries that don't have a pre-trained embedding
    print(
        "Checking for training queries that don't have a pre-trained embedding..."