Example #1
def produce_image_from_model(model_fn, logdir, **unused):

    dataset = input_data.read_data_sets("data/MNIST/",
                                        one_hot=False,
                                        reshape=False)
    inputs = np.concatenate(
        [x.images for x in [dataset.train, dataset.validation, dataset.test]])
    labels = np.concatenate(
        [x.labels for x in [dataset.train, dataset.validation, dataset.test]])
    images = vector_to_matrix_mnist(inputs)
    sprite_array = 1 - create_sprite_image(images)
    imsave(f'{logdir}/sprites.png', sprite_array)

    batch_size = None
    input_t = tf.placeholder(dtype=tf.float32,
                             name='input_t',
                             shape=(batch_size, *inputs.shape[1:]))
    z_t = model_fn(input_t, 2)
    tf.train.get_or_create_global_step()

    sess = tf.train.MonitoredTrainingSession(checkpoint_dir=logdir,
                                             save_checkpoint_secs=None,
                                             save_summaries_steps=None)

    print('generating points....')
    batch_size = 1000
    points = np.concatenate([
        sess.run(z_t, feed_dict={input_t: inputs[i:i + batch_size]})
        for i in tqdm(range(0, len(inputs), batch_size))
    ])
    # add another zeroed dimension to get 3 dimensions for tensorflow projector
    points = np.stack(
        (points[:, 0], points[:, 1], np.zeros(shape=len(points))), axis=1)
    save_embeddings(outputs=points, labels=labels, logdir=logdir)
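The save_embeddings helper called above is not shown in this example; it hands the 3-D points, their labels, and the log directory to TensorBoard's embedding projector. A minimal TF1-style sketch consistent with that call (the metadata/checkpoint file names, the sprite settings, and the use of the tf.contrib projector plugin are assumptions):

import os
import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

def save_embeddings(outputs, labels, logdir):
    # One metadata row per embedding point, used for labels/colours in the projector.
    with open(os.path.join(logdir, 'metadata.tsv'), 'w') as f:
        for label in labels:
            f.write('{}\n'.format(label))
    # Checkpoint the embedding matrix and point the projector config at it.
    embedding_var = tf.Variable(outputs.astype(np.float32), name='embedding')
    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    embedding.tensor_name = embedding_var.name
    embedding.metadata_path = 'metadata.tsv'
    embedding.sprite.image_path = 'sprites.png'          # written earlier in the example
    embedding.sprite.single_image_dim.extend([28, 28])   # MNIST tile size
    with tf.Session() as sess:
        sess.run(embedding_var.initializer)
        projector.visualize_embeddings(tf.summary.FileWriter(logdir), config)
        tf.train.Saver([embedding_var]).save(sess, os.path.join(logdir, 'embedding.ckpt'))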
Example #2
def evaluate(model, test_loader, test_embeddings, save=True, model_name=None):
    mean_pred_embeddings = predict_mean_embeddings(model, test_loader)

    if save:
        if model_name is None:
            raise ValueError('A filename should be provided.')
        save_embeddings(mean_pred_embeddings, model_name)

    predicted_results = {}

    euclidean_distances = []
    cos_sims = []

    nb_of_pred = 0
    for label in mean_pred_embeddings:
        if label in test_embeddings:
            y_pred = mean_pred_embeddings[label].reshape(1, -1)
            y_true = test_embeddings[label].reshape(1, -1)
            euclidean_distances.append(eucl_dist(y_true, y_pred))
            cos_sims.append(cos_sim(y_true, y_pred))
            nb_of_pred += 1

    logging.info('\nResults on the test set:')
    logging.info('Mean euclidean dist: {}'.format(
        np.mean(euclidean_distances)))
    logging.info('Std of euclidean dist: {}'.format(
        np.std(euclidean_distances)))
    logging.info('Mean cosine sim: {}'.format(np.mean(cos_sims)))
    logging.info('Std of cosine sim: {}'.format(np.std(cos_sims)))
    logging.info('Number of labels evaluated: {}'.format(nb_of_pred))
    return mean_pred_embeddings
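Here save_embeddings receives a dict mapping each label to its mean predicted vector plus a file name (Examples #5 and #10 call it the same way). A minimal sketch, assuming the mapping is simply pickled to disk; the .pkl extension and format are assumptions, and the original project may use a text format instead:

import pickle

def save_embeddings(embeddings, filename):
    # 'embeddings' maps a label (e.g. a word) to its numpy vector.
    with open('{}.pkl'.format(filename), 'wb') as f:
        pickle.dump(embeddings, f)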
Example #3
def save(self):
    ent_embeds = self.ent_embeds.eval(session=self.session)
    nv_ent_embeds = self.name_embeds.eval(session=self.session)
    rv_ent_embeds = self.rv_ent_embeds.eval(session=self.session)
    av_ent_embeds = self.av_ent_embeds.eval(session=self.session)
    rel_embeds = self.rel_embeds.eval(session=self.session)
    att_embeds = self.attr_embeds.eval(session=self.session)
    save_embeddings(self.out_folder, self.kgs, ent_embeds, nv_ent_embeds,
                    rv_ent_embeds, av_ent_embeds, rel_embeds, att_embeds)
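This save method (from an OpenEA-style entity alignment model) hands the session-evaluated matrices to save_embeddings together with the output folder and the KG metadata. A minimal sketch, assuming the helper just writes each matrix to its own .npy file; the file names below, and whatever (if anything) is saved from kgs, are assumptions:

import os
import numpy as np

def save_embeddings(out_folder, kgs, ent_embeds, nv_ent_embeds, rv_ent_embeds,
                    av_ent_embeds, rel_embeds, att_embeds):
    # Persist each view's embedding matrix separately so they can be reloaded
    # for alignment evaluation; kgs (the KG metadata) is ignored in this sketch.
    os.makedirs(out_folder, exist_ok=True)
    matrices = {'ent_embeds': ent_embeds, 'nv_ent_embeds': nv_ent_embeds,
                'rv_ent_embeds': rv_ent_embeds, 'av_ent_embeds': av_ent_embeds,
                'rel_embeds': rel_embeds, 'att_embeds': att_embeds}
    for name, mat in matrices.items():
        np.save(os.path.join(out_folder, name + '.npy'), mat)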
Example #4
def train_skipgram(corpus_dir, extn, learning_rate, embedding_size, num_negsample, epochs, batch_size, output_dir, valid_size):
    '''

    :param corpus_dir: folder containing WL kernel relabeled files. All the files in this folder will be relabeled
    according to the WL relabeling strategy and the format of each line in these files shall be: <target> <context 1> <context 2>....
    :param extn: extension of the WL relabeled file
    :param learning_rate: learning rate for the skipgram model (will involve a linear decay)
    :param embedding_size: number of dimensions to be used for learning subgraph representations
    :param num_negsample: number of negative samples to be used by the skipgram model
    :param epochs: number of iterations the dataset is traversed by the skipgram model
    :param batch_size: size of each batch for the skipgram model
    :param output_dir: the folder where the embedding file will be stored
    :param valid_size: number of subgraphs to be chosen at random to validate the goodness of the subgraph representation
    learning process in every epoch
    :return: name of the file that contains the subgraph embeddings (in word2vec format proposed by Mikolov et al (2013))
    '''

    op_fname = '_'.join([os.path.basename(corpus_dir), 'dims', str(embedding_size), 'epochs', str(epochs), 'embeddings.txt'])
    op_fname = os.path.join(output_dir, op_fname)
    if os.path.isfile(op_fname):
        logging.info('The embedding file: {} is already present, hence NOT training skipgram model '
                     'for subgraph vectors'.format(op_fname))
        return op_fname

    logging.info("Initializing SKIPGRAM...")
    corpus = Corpus(corpus_dir, extn=extn, max_files=0)  # just load 'max_files' files from this folder
    corpus.scan_and_load_corpus()
    valid_examples = np.concatenate((np.random.choice(corpus.high_freq_word_ids, valid_size, replace=False),
                                     np.random.choice(corpus.low_freq_word_ids, valid_size, replace=False)))

    model_skipgram = skipgram(
        doc_size=corpus._vocabsize,  # for doc2vec skipgram model, the doc size should be same as word size
        vocabulary_size=corpus._vocabsize,  # size of i/p and o/p layers
        learning_rate=learning_rate,  # will decay over time?
        embedding_size=embedding_size,  # hidden layer neurons
        num_negsample=num_negsample,
        num_steps=epochs,  # no. of times the training set will be iterated through
        corpus=corpus,  # data set of (target,context) tuples
        valid_dataset=valid_examples,  # validation set (a small subset) of (target, context) tuples?
    )

    final_embeddings, final_weights = model_skipgram.train(
        corpus=corpus,
        batch_size=batch_size,
        valid_dataset=valid_examples,
    )


    logging.info('Write the matrix to a word2vec format file')
    save_embeddings(corpus, final_embeddings, embedding_size, op_fname)
    logging.info('Completed writing the final embeddings, pls check file: {} for the same'.format(op_fname))
    return op_fname
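The docstring above says the embeddings are written in the word2vec text format of Mikolov et al. (2013). A minimal sketch of such a writer, consistent with the call save_embeddings(corpus, final_embeddings, embedding_size, op_fname); the corpus._id_to_word lookup is an assumed attribute (only corpus._vocabsize appears in the example itself):

def save_embeddings(corpus, final_embeddings, embedding_size, op_fname):
    # word2vec text format: a "<vocab_size> <dim>" header, then one line per
    # subgraph label followed by its vector components.
    with open(op_fname, 'w') as f:
        f.write('{} {}\n'.format(corpus._vocabsize, embedding_size))
        for wid in range(corpus._vocabsize):
            word = corpus._id_to_word[wid]  # assumed id -> label mapping
            vec = ' '.join('{:.6f}'.format(v) for v in final_embeddings[wid])
            f.write('{} {}\n'.format(word, vec))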
Example #5
def predict_OOV(model, char_to_idx, OOV_path, filename):
    OOVs = load_vocab(OOV_path)

    vectorizer = Vectorizer(char_to_idx)
    examples = [(vectorizer.vectorize_sequence(word), word) for word in OOVs]
    loader = DataLoader(examples,
                        collate_fn=collate_x,
                        use_gpu=False,
                        batch_size=1)

    model.model.eval()
    predicted_embeddings = {}
    for x, y in loader:
        x = tensors_to_variables(x)
        embeddings = torch_to_numpy(model.model(x))
        for label, embedding in zip(y, embeddings):
            predicted_embeddings[label] = embedding

    save_embeddings(predicted_embeddings, filename)
Example #6
def train():
    model = get_loaded_model(force_gpu=True)
    embeddings = get_embeddings(model=model, data=get_data())
    save_embeddings(embeddings, embeddings_file)
Example #7
            x, target, label = batch['embedding'], batch['embedding'], batch['label']
            (decoded, predict) = net(x)
            
            # loss
            val_dec_loss[0].append(sparse_autoencoder_error(decoded[0], target[0]))
            val_dec_loss[1].append(sparse_autoencoder_error(decoded[1], target[1]))
            val_dec_loss[2].append(sparse_autoencoder_error(decoded[2], target[2]))
            val_cla_loss.append(BCE_loss(predict, label))
            val_loss.append(sum([val_dec_loss[i][it] for i in range(3)]) * opt.alpha + val_cla_loss[it])
            # metrics
            val_mse[0].append(MSE(decoded[0], target[0]))
            val_mse[1].append(MSE(decoded[1], target[1]))
            val_mse[2].append(MSE(decoded[2], target[2]))
            val_acc.append((predict > 0.5) == label.byte())
            it += 1

        net.train()
    print("   validation: \n       val_loss: %4f, decode_0: %4f, decode_1: %4f, decode_2: %4f, classify_loss: %4f"
         % (Mean(val_loss), Mean(val_dec_loss[0]),Mean(val_dec_loss[1]), Mean(val_dec_loss[2]), Mean(val_cla_loss)))
    print("       val_mean_square_error: decode_0: %4f, decode_1: %4f, decode_2: %4f"
         % (Mean(val_mse[0]), Mean(val_mse[1]), Mean(val_mse[2])))
    print("       val_classify_accuracy: %.4f" % Mean(val_acc))
    

# save model & embeddings
save_model(net, opt)
save_embeddings(net, dataset, opt)

end_time = time.time()
print("time: ", end_time - begin_time)
Example #8
def test(experiment_name,
         task,
         gpu_num=0,
         pretrained='',
         margin=0.4,
         losstype='deepcca'):
    cosined = False
    embed_dim = 1024
    gpu_num = int(gpu_num)
    margin = float(margin)

    # Setup the results and device.
    results_dir = setup_dirs(experiment_name)
    if not os.path.exists(results_dir + 'test_results/'):
        os.makedirs(results_dir + 'test_results/')
    test_results_dir = results_dir + 'test_results/'

    device = setup_device(gpu_num)

    #### Hyperparameters #####
    #Initialize wandb
    #import wandb
    #wandb.init(project=experiment_name)
    #config = wandb.config

    with open(results_dir + 'hyperparams_test.txt', 'w') as f:
        f.write('Command used to run: python ')
        f.write(' '.join(sys.argv))
        f.write('\n')
        f.write('device in use: ' + str(device))
        f.write('\n')
        f.write('--experiment_name ' + str(experiment_name))
        f.write('\n')

    # Setup data loaders and models based on task.
    if task == 'cifar10':
        train_loader, test_loader = cifar10_loaders()
        model_A = CIFAREmbeddingNet()
        model_B = CIFAREmbeddingNet()
    elif task == 'mnist':
        train_loader, test_loader = mnist_loaders()
        model_A = MNISTEmbeddingNet()
        model_B = MNISTEmbeddingNet()
    elif task == 'uw':
        uw_data = 'bert'
        train_loader, test_loader = uw_loaders(uw_data)
        if uw_data == 'bert':
            model_A = RowNet(3072, embed_dim=1024)  # Language.
            model_B = RowNet(4096, embed_dim=1024)  # Vision.

    # Finish model setup.
    model_A.load_state_dict(
        torch.load(results_dir + 'train_results/model_A_state.pt'))
    model_B.load_state_dict(
        torch.load(results_dir + 'train_results/model_B_state.pt'))
    model_A.to(device)
    model_B.to(device)
    # Put models into evaluation mode.
    model_A.eval()
    model_B.eval()
    """For UW data."""
    ## we use train data to calculate the threshhold for distance.
    a_train = []
    b_train = []
    # loading saved embeddings to be faster
    a_train = load_embeddings(test_results_dir + 'lang_embeds_train.npy')
    b_train = load_embeddings(test_results_dir + 'img_embeds_train.npy')

    # Iterate through the train data.
    if a_train is None or b_train is None:
        a_train = []
        b_train = []
        print(
            "Computing embeddings for train data to calculate threshold for distance"
        )
        for data in train_loader:
            anchor_data = data[0].to(device)
            positive_data = data[1].to(device)
            label = data[2]
            a_train.append(
                model_A(anchor_data.to(device)).cpu().detach().numpy())
            b_train.append(
                model_B(positive_data.to(device)).cpu().detach().numpy())
        print("Finished Computing embeddings for train data")
    #saving embeddings if not already saved
    save_embeddings(test_results_dir + 'lang_embeds_train.npy', a_train)
    save_embeddings(test_results_dir + 'img_embeds_train.npy', b_train)

    a_train = np.concatenate(a_train, axis=0)
    b_train = np.concatenate(b_train, axis=0)

    # Test data
    # For accumulating predictions to check embedding visually using test set.
    # a is embeddings from domain A, b is embeddings from domain B, ys is their labels
    a = []
    b = []
    ys = []
    instance_data = []

    # loading saved embeddings to be faster
    a = load_embeddings(test_results_dir + 'lang_embeds.npy')
    b = load_embeddings(test_results_dir + 'img_embeds.npy')
    compute_test_embeddings = a is None or b is None
    if compute_test_embeddings:
        a = []
        b = []

    # Iterate through the test data.
    print("computing embeddings for test data")
    for data in test_loader:
        language_data, vision_data, object_name, instance_name = data
        language_data = language_data.to(device)
        vision_data = vision_data.to(device)
        instance_data.extend(instance_name)
        if compute_test_embeddings:
            a.append(
                model_A(language_data).cpu().detach().numpy())  # Language.
            b.append(model_B(vision_data).cpu().detach().numpy())  # Vision.
        ys.extend(object_name)
    print("finished computing embeddings for test data")
    # Convert string labels to ints.
    labelencoder = LabelEncoder()
    labelencoder.fit(ys)
    ys = labelencoder.transform(ys)

    #saving embeddings if not already saved
    save_embeddings(test_results_dir + 'lang_embeds.npy', a)
    save_embeddings(test_results_dir + 'img_embeds.npy', b)

    # Concatenate predictions.
    a = np.concatenate(a, axis=0)
    b = np.concatenate(b, axis=0)
    ab = np.concatenate((a, b), axis=0)

    ground_truth, predicted, distance = object_identification_task_classifier(
        a, b, ys, a_train, b_train, lamb_std=1, cosine=cosined)

    #### Retrieval task by giving an image and finding the closest word descriptions ####
    ground_truth_word, predicted_word, distance_word = object_identification_task_classifier(
        b, a, ys, b_train, a_train, lamb_std=1, cosine=cosined)
    with open('retrieval_non_pro.csv', mode='w') as retrieval_non_pro:
        csv_file_writer = csv.writer(retrieval_non_pro,
                                     delimiter=',',
                                     quotechar='"',
                                     quoting=csv.QUOTE_MINIMAL)
        csv_file_writer.writerow(
            ['image', 'language', 'predicted', 'ground truth'])
        for i in range(50):
            csv_file_writer.writerow([
                instance_data[0], instance_data[i], predicted_word[0][i],
                ground_truth_word[0][i]
            ])

    precisions = []
    recalls = []
    f1s = []
    precisions_pos = []
    recalls_pos = []
    f1s_pos = []
    #print(classification_report(oit_res[i], 1/np.arange(1,len(oit_res[i])+1) > 0.01))
    for i in range(len(ground_truth)):
        p, r, f, s = precision_recall_fscore_support(ground_truth[i],
                                                     predicted[i],
                                                     warn_for=(),
                                                     average='micro')
        precisions.append(p)
        recalls.append(r)
        f1s.append(f)
        p, r, f, s = precision_recall_fscore_support(ground_truth[i],
                                                     predicted[i],
                                                     warn_for=(),
                                                     average='binary')
        precisions_pos.append(p)
        recalls_pos.append(r)
        f1s_pos.append(f)

    print('\n ')
    print(experiment_name + '_' + str(embed_dim))
    print('MRR,    KNN,    Corr,   Mean F1,    Mean F1 (pos only)')
    print('%.3g & %.3g & %.3g & %.3g & %.3g' %
          (mean_reciprocal_rank(
              a, b, ys, cosine=cosined), knn(a, b, ys, k=5, cosine=cosined),
           corr_between(a, b, cosine=cosined), np.mean(f1s), np.mean(f1s_pos)))

    plt.figure(figsize=(14, 7))
    for i in range(len(ground_truth)):
        fpr, tpr, thres = roc_curve(ground_truth[i],
                                    [1 - e for e in distance[i]],
                                    drop_intermediate=True)
        plt.plot(fpr, tpr, alpha=0.08, color='r')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.savefig(test_results_dir + '_' + str(embed_dim) + '_ROC.svg')

    # Pick a pair, plot distance in A vs distance in B. Should be correlated.
    a_dists = []
    b_dists = []
    for _ in range(3000):
        i1 = random.randrange(len(a))
        i2 = random.randrange(len(a))
        a_dists.append(euclidean(a[i1], a[i2]))
        b_dists.append(euclidean(b[i1], b[i2]))
    #     a_dists.append(cosine(a[i1], a[i2]))
    #     b_dists.append(cosine(b[i1], b[i2]))

    # Plot.
    plt.figure(figsize=(14, 14))
    #plt.title('Check Distance Correlation Between Domains')
    plt.xlim([0, 3])
    plt.ylim([0, 3])
    # plt.xlim([0,max(a_dists)])
    # plt.ylim([0,max(b_dists)])
    # plt.xlabel('Distance in Domain A')
    # plt.ylabel('Distance in Domain B')
    plt.xlabel('Distance in Language Domain')
    plt.ylabel('Distance in Vision Domain')
    #plt.plot(a_dists_norm[0],b_dists_norm[0],'.')
    #plt.plot(np.arange(0,2)/20,np.arange(0,2)/20,'k-',lw=3)
    plt.plot(a_dists, b_dists, 'o', alpha=0.5)
    plt.plot(np.arange(0, 600), np.arange(0, 600), 'k--', lw=3, alpha=0.5)
    #plt.text(-0.001, -0.01, 'Corr: %.3f'%(pearsonr(a_dists,b_dists)[0]),  fontsize=20)
    plt.savefig(test_results_dir + '_' + str(embed_dim) + '_CORR.svg')

    # Inspect embedding distances.
    clas = 5  # Base class.
    i_clas = [i for i in range(len(ys)) if ys[i].item() == clas]
    i_clas_2 = np.random.choice(i_clas, len(i_clas), replace=False)

    clas_ref = 4  # Comparison class.
    i_clas_ref = [i for i in range(len(ys)) if ys[i].item() == clas_ref]

    ac = np.array([a[i] for i in i_clas])
    bc = np.array([b[i] for i in i_clas])

    ac2 = np.array([a[i] for i in i_clas_2])
    bc2 = np.array([b[i] for i in i_clas_2])

    ac_ref = np.array([a[i] for i in i_clas_ref])
    aa_diff_ref = norm(ac[:min(len(ac), len(ac_ref))] -
                       ac_ref[:min(len(ac), len(ac_ref))],
                       ord=2,
                       axis=1)

    ab_diff = norm(ac - bc2, ord=2, axis=1)
    aa_diff = norm(ac - ac2, ord=2, axis=1)
    bb_diff = norm(bc - bc2, ord=2, axis=1)

    # aa_diff_ref = [cosine(ac[:min(len(ac),len(ac_ref))][i],ac_ref[:min(len(ac),len(ac_ref))][i]) for i in range(len(ac[:min(len(ac),len(ac_ref))]))]

    # ab_diff = [cosine(ac[i],bc2[i]) for i in range(len(ac))]
    # aa_diff = [cosine(ac[i],ac2[i]) for i in range(len(ac))]
    # bb_diff = [cosine(bc[i],bc2[i]) for i in range(len(ac))]

    bins = np.linspace(0, 0.1, 100)

    plt.figure(figsize=(14, 7))
    plt.hist(ab_diff, bins, alpha=0.5, label='between embeddings')
    plt.hist(aa_diff, bins, alpha=0.5, label='within embedding A')
    plt.hist(bb_diff, bins, alpha=0.5, label='within embedding B')

    plt.hist(aa_diff_ref,
             bins,
             alpha=0.5,
             label='embedding A, from class ' + str(clas_ref))

    plt.title('Embedding Distances - Class: ' + str(clas))
    plt.xlabel('L2 Distance')
    plt.ylabel('Count')
    plt.legend()

    #labelencoder.classes_
    classes_to_keep = [36, 6, 9, 46, 15, 47, 50, 22, 26, 28]
    print(labelencoder.inverse_transform(classes_to_keep))

    ab_norm = [
        e for i, e in enumerate(ab) if ys[i % len(ys)] in classes_to_keep
    ]
    ys_norm = [e for e in ys if e in classes_to_keep]

    color_index = {list(set(ys_norm))[i]: i
                   for i in range(len(set(ys_norm)))}
    markers = ["o", "v", "^", "s", "*", "+", "x", "D", "h", "4"]
    marker_index = {
        list(set(ys_norm))[i]: markers[i]
        for i in range(len(set(ys_norm)))
    }

    embedding = umap.UMAP(n_components=2).fit_transform(
        ab_norm)  # metric='cosine'
    # Plot UMAP embedding of embeddings for all classes.
    f, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))

    mid = len(ys_norm)

    ax1.set_title('Language UMAP')
    for e in list(set(ys_norm)):
        x1 = [
            embedding[:mid, 0][i] for i in range(len(ys_norm))
            if ys_norm[i] == e
        ]
        x2 = [
            embedding[:mid, 1][i] for i in range(len(ys_norm))
            if ys_norm[i] == e
        ]
        ax1.scatter(
            x1,
            x2,
            marker=marker_index[int(e)],
            alpha=0.5,
            c=[sns.color_palette("colorblind", 10)[color_index[int(e)]]],
            label=labelencoder.inverse_transform([int(e)])[0])
    ax1.set_xlim([min(embedding[:, 0]) - 4, max(embedding[:, 0]) + 4])
    ax1.set_ylim([min(embedding[:, 1]) - 4, max(embedding[:, 1]) + 4])
    ax1.grid(True)
    ax1.legend(loc='upper center',
               bbox_to_anchor=(1.1, -0.08),
               fancybox=True,
               shadow=True,
               ncol=5)

    ax2.set_title('Vision UMAP')
    for e in list(set(ys_norm)):
        x1 = [
            embedding[mid::, 0][i] for i in range(len(ys_norm))
            if ys_norm[i] == e
        ]
        x2 = [
            embedding[mid::, 1][i] for i in range(len(ys_norm))
            if ys_norm[i] == e
        ]
        ax2.scatter(
            x1,
            x2,
            marker=marker_index[int(e)],
            alpha=0.5,
            c=[sns.color_palette("colorblind", 10)[color_index[int(e)]]])
    ax2.set_xlim([min(embedding[:, 0]) - 4, max(embedding[:, 0]) + 4])
    ax2.set_ylim([min(embedding[:, 1]) - 4, max(embedding[:, 1]) + 4])
    ax2.grid(True)

    plt.savefig(test_results_dir + '_' + str(embed_dim) + '_UMAP_wl.svg',
                bbox_inches='tight')
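In this test routine, load_embeddings is expected to return None when no cached file exists (hence the `if a_train is None` check), and save_embeddings takes the path first and a list of per-batch arrays. A minimal numpy-based sketch consistent with that usage; the exact on-disk layout used by the original project is an assumption:

import os
import numpy as np

def save_embeddings(path, embeddings):
    # Stack the per-batch arrays into one (num_examples, embed_dim) matrix on disk.
    np.save(path, np.concatenate(embeddings, axis=0))

def load_embeddings(path):
    # Return None when nothing is cached so the caller recomputes; otherwise
    # return a one-element list so the caller's np.concatenate(..., axis=0) still works.
    if not os.path.isfile(path):
        return None
    return [np.load(path)]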
Example #10
def save_char_embeddings(model, char_to_idx, filename='mimick_char_embeddings'):
    char_embeddings = {}
    for char, idx in char_to_idx.items():
        char_embeddings[char] = torch_to_numpy(model.model.mimick_lstm.embeddings.weight.data[idx])
    save_embeddings(char_embeddings, filename)
Example #11
                    os.path.join(store_path, photo_name))

    products_df_final = products_df_final.sort_values(by="photo")
    products_df_final.to_csv(os.path.join("..", "utils", "products.csv"),
                             index=False)


if __name__ == "__main__":
    seed = 2018
    csvs_path = os.path.join("..", "notebooks")
    dataset_path = os.path.join("..", "photos_resized")
    store_path = os.path.join("..", "static", "images", "store")

    remove_gitkeep(store_path)  #removes gitkeep files
    remove_gitkeep(os.path.join("..", "static", "images", "recommend"))
    remove_gitkeep(os.path.join("..", "static", "images", "user"))

    start_store(
        seed, csvs_path, dataset_path, store_path
    )  #creating the store dataframe and placing the images on the store directory

    resizing = (224, 224)
    shape_output = 1024
    model = MobileNet(input_shape=(224, 224, 3),
                      weights="imagenet",
                      include_top=False,
                      pooling="avg")
    save_embeddings(
        store_path, "embeddings.npy", model, preprocess_input, shape_output,
        resizing
    )  #creating the embeddings file for the store images
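The final call passes the store directory, an output file name, the MobileNet feature extractor, its preprocess_input function, the embedding width, and the resize target. A minimal sketch of such a helper, assuming it embeds every image found in store_path (in sorted filename order, matching the sort on the "photo" column) and writes a single .npy matrix; the output location is an assumption:

import os
import numpy as np
from tensorflow.keras.preprocessing import image

def save_embeddings(store_path, out_name, model, preprocess_fn, shape_output, resizing):
    # Embed each store image with the CNN and save an (n_images, shape_output) matrix.
    filenames = sorted(os.listdir(store_path))
    embeddings = np.zeros((len(filenames), shape_output), dtype=np.float32)
    for i, fname in enumerate(filenames):
        img = image.load_img(os.path.join(store_path, fname), target_size=resizing)
        x = np.expand_dims(image.img_to_array(img), axis=0)
        embeddings[i] = model.predict(preprocess_fn(x))[0]
    np.save(os.path.join(store_path, out_name), embeddings)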
Example #12
def build(train_data, save_word_embeddings=False, save_model=False):
    """Returns the sentiment of the parsed sentence.

    Args:
        train_data (df) : The data the model is to be built from.
        save_word_embeddings (bool) : If true, will save the embedding data.
        save_model (bool) : If true, will save the model.

    Returns
        model : The sentiment analyser model, fit to the training data.
    """
    global corpus_vocabulary

    X_train = train_data['content']
    y_train = train_data['label']

    vocab_size = 10000  # TODO automatic

    train_sequences = corpus_vocabulary.texts_to_sequences(X_train.values)
    padded_train = keras.preprocessing.sequence.pad_sequences(train_sequences,
                                                              padding='post',
                                                              maxlen=140)
    model = keras.Sequential()
    model.add(keras.layers.Embedding(vocab_size, 40))
    model.add(keras.layers.GlobalAveragePooling1D())
    model.add(keras.layers.Dense(4, activation=tf.nn.relu))
    model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

    model.summary()

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['acc'])

    split = int(len(X_train) / 4)  # hold out a quarter of the comments for validation

    x_val = padded_train[:split]
    partial_x_train = padded_train[split:]

    y_val = y_train[:split]
    partial_y_train = y_train[split:]

    model.fit(partial_x_train,
              partial_y_train,
              epochs=150,
              batch_size=512,
              validation_data=(x_val, y_val),
              verbose=1)

    if save_word_embeddings:
        word_index = corpus_vocabulary.word_index
        save_embeddings(model, word_index)

    if save_model:
        import datetime as dt
        now = dt.datetime.now().__str__()
        model.save(os.getcwd() + '/saved_model_data/models/model_' + now +
                   '.h5')
        print("Model saved.")

    return model
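save_embeddings(model, word_index) presumably exports the learned word vectors from the model's Embedding layer. A minimal sketch, assuming a tab-separated output file; the output path and format are assumptions:

import io

def save_embeddings(model, word_index, out_path='saved_model_data/embeddings.tsv'):
    # The Embedding layer is the first layer; its weight matrix has one
    # 40-dimensional row per token id.
    weights = model.layers[0].get_weights()[0]
    with io.open(out_path, 'w', encoding='utf-8') as f:
        for word, idx in word_index.items():
            if idx < weights.shape[0]:  # ids beyond vocab_size were not embedded
                f.write(word + '\t' + '\t'.join(str(v) for v in weights[idx]) + '\n')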
Example #13
def execute(sentence, save_word_embeddings=False, plot_loss_acc=False):
    """Returns the sentiment of the parsed sentence.

    Args:
        sentence (str) : The sentence to be analysed.
        save_word_embeddings (bool) : If true, will save the embedding data.
        plot_loss_acc (bool) : If true, will plot the loss and accuracy during the training of the data.

    Returns
        score (float) : The sentiment score of the sentence. 1 - cyber abusive, 0 - not cyber abusive.
    """

    parsed_test = pd.DataFrame({"content": pd.Series(sentence)})

    current_directory = os.getcwd()
    train_data = pd.read_csv(current_directory + "/data/DataTurks/dump.csv")
    train_data = train_data.sample(frac=1).reset_index(drop=True)

    X_train = train_data['content'][:18000]
    X_test = parsed_test['content']

    y_train = train_data['label'][:18000]

    tokenizer = Tokenizer(num_words=10000)
    tokenizer.fit_on_texts(train_data['content'])

    train_sequences = tokenizer.texts_to_sequences(X_train.values)
    test_sequences = tokenizer.texts_to_sequences(X_test.values)

    vocab_size = 10000

    padded_train = keras.preprocessing.sequence.pad_sequences(train_sequences,
                                                              padding='post',
                                                              maxlen=140)
    padded_test = keras.preprocessing.sequence.pad_sequences(test_sequences,
                                                             padding='post',
                                                             maxlen=140)
    model = keras.Sequential()
    model.add(keras.layers.Embedding(vocab_size, 40))
    model.add(keras.layers.GlobalAveragePooling1D())
    model.add(keras.layers.Dense(4, activation=tf.nn.relu))
    model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

    model.summary()

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['acc'])

    split = int(len(X_train) / 4)  # hold out a quarter of the comments for validation

    x_val = padded_train[:split]
    partial_x_train = padded_train[split:]

    y_val = y_train[:split]
    partial_y_train = y_train[split:]

    history = model.fit(partial_x_train,
                        partial_y_train,
                        epochs=120,
                        batch_size=512,
                        validation_data=(x_val, y_val),
                        verbose=1)

    if save_word_embeddings:
        word_index = tokenizer.word_index
        save_embeddings(model, word_index)

    if plot_loss_acc:
        history_dict = history.history
        history_dict.keys()

        epochs = range(1, len(history_dict['acc']) + 1)

        plot_accuracy(epochs, history_dict['acc'], history_dict['val_acc'])
        plt.clf()
        plot_loss(epochs, history_dict['loss'], history_dict['val_loss'])

    sentiment_score = model.predict(padded_test)

    return str(sentiment_score[0][0])