Example #1
def load_model(embedding_size, vocab_size, num_steps, dataset):
    reverse_filename = get_reverse_filename(embedding_size, num_steps,
                                            vocab_size, dataset)
    embeddings_filename = get_embeddin_filename(embedding_size, num_steps,
                                                vocab_size, dataset)
    reverse_dictionary = load_pickle_file(reverse_filename)
    final_embeddings = load_pickle_file(embeddings_filename)
    dict_filename = get_dict_filename(embedding_size, num_steps, vocab_size,
                                      dataset)
    dictionary = load_pickle_file(dict_filename)
    return reverse_dictionary, final_embeddings, dictionary
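
Every example on this page depends on the project helper load_pickle_file, whose definition is not shown. A minimal sketch of what such a helper might look like, assuming it is a thin wrapper around Python's pickle module (the repository's actual implementation may differ):

import pickle

def load_pickle_file(filepath):
    # Assumed behaviour: deserialize and return whatever object was pickled to filepath.
    with open(filepath, "rb") as f:
        return pickle.load(f)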
Example #2
def hist_plotter():

    colors = ['#F95400', '#0C56A2', '#F9DC00', '#00A670', '#C60074']
    seqgan_dict = load_pickle_file(
        "/Users/markus/workspace/master/Master/seqgan_score_dict.p")
    emb_dict = load_pickle_file(
        "/Users/markus/workspace/master/Master/emb_score_dict.p")

    color_seqgan = colors.pop(0)
    color_emb = colors.pop(0)

    # each dict entry maps a sentence to (beta score, occurrence count);
    # the score is repeated once per occurrence so the histogram is count-weighted
    seqgan_intervals = []
    seqgan_intervals_uniq = []
    for (s, (b, n)) in seqgan_dict.iteritems():
        seqgan_intervals_uniq.append(b)
        for _ in range(n):
            seqgan_intervals.append(b)

    emb_intervals = []
    emb_intervals_uniq = []
    for (s, (b, n)) in emb_dict.iteritems():
        emb_intervals_uniq.append(b)
        for _ in range(n):
            emb_intervals.append(b)

    num_bins = 10

    fig, ax = plt.subplots()
    plt.rc('font', family='Arial')

    # the histogram of the data
    data = np.vstack([seqgan_intervals, emb_intervals]).T
    ax.hist(data,
            num_bins,
            color=[color_seqgan, color_emb],
            label=["Baseline", "Our model"])
    # ax.hist(data_uniq, num_bins)
    # ax.hist(emb_intervals, num_bins, normed=1)

    # add a 'best fit' line

    ax.set_xlabel(u'β')
    ax.set_ylabel('Count')

    # Tweak spacing to prevent clipping of ylabel
    fig.tight_layout()
    plt.legend()
    plt.show()
Example #3
def get_word_embeddings(conf):
    if conf.WORD_EMBEDDING_METHOD == 'glove':
        embeddings_index = {}
        f = open('data/embeddings/glove.6B.300d.txt')
        count = 0
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
            count += 1
            if count % 100 == 0:
                print_progress(count,
                               400000,
                               prefix="Getting glove word embeddings")
        f.close()
        return embeddings_index

    elif conf.WORD_EMBEDDING_METHOD == 'word2vec':
        embedding_dict_name = "word2vec/saved_models/word2vec_%sd%svoc100001steps_dict_%s.pkl" % (
            conf.EMBEDDING_DIMENSION, conf.NB_WORDS,
            conf.DATASET if conf.DATASET is not None else "flickr")
        return load_pickle_file(embedding_dict_name)

    print("WORD_EMBEDDING_METHOD not found")
    return None
Example #4
def preprocess_sentences(config, sentences):
    sos_token = "<sos>"
    eos_token = "<eos>"
    pad_token = "<pad>"
    if config[Conf.WORD_EMBEDDING] == WordEmbedding.GLOVE:
        print "Loading Glove dictionary..."
        word_embedding_dict = get_word_embeddings()
        sos_token = "<"
        eos_token = ">"
        pad_token = "="
    else:
        filename = get_dict_filename(config[Conf.EMBEDDING_SIZE],
                                     config[Conf.WORD2VEC_NUM_STEPS],
                                     config[Conf.VOCAB_SIZE],
                                     config[Conf.W2V_SET])
        print "Loading Word2Vec dictionary (%s)..." % filename
        # word_embedding_dict = load_pickle_file("word2vec/saved_models/word2vec_%sd%svoc%ssteps_dict.pkl" % (config[Conf.EMBEDDING_SIZE], config[Conf.VOCAB_SIZE], config[Conf.WORD2VEC_NUM_STEPS]))
        word_embedding_dict = load_pickle_file(filename)

    word_list_sentences = []
    for sentence in sentences:
        word_list = [sos_token]
        for word in sentence.split(" "):
            word_list.append(word.lower())
        word_list.append(eos_token)
        while len(word_list) < config[Conf.MAX_SEQ_LENGTH]:
            word_list.append(pad_token)
        word_list_sentences.append(word_list)
    # word_list_sentences = [[word.lower() for word in sentence.split(" ")] for sentence in sentences]
    return np.asarray(word_list_sentences), word_embedding_dict
Example #5
def get_word_embedding_matrix(word_to_id, embedding_dim):
    embeddings_dict = load_pickle_file(
        'word2vec/saved_models/word2vec_50d1000voc100001steps_dict_flowers.pkl'
    )
    embedding_matrix = numpy.zeros((len(word_to_id) + 1, embedding_dim))
    for word, i in word_to_id.items():
        # words without a pretrained vector keep an all-zero row
        if word in embeddings_dict:
            embedding_matrix[i] = embeddings_dict[word]
    return embedding_matrix
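
Example #5 builds a matrix of pretrained word2vec vectors indexed by word id; such a matrix is typically used to initialise an embedding layer. A hedged usage sketch, assuming Keras as the framework and an existing word_to_id mapping (neither is confirmed by the code on this page):

from keras.layers import Embedding

# Hypothetical usage: a frozen embedding layer initialised with the word2vec weights.
embedding_matrix = get_word_embedding_matrix(word_to_id, 50)
embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            trainable=False)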
Example #6
def generate_input_noise(config):
    if config[Conf.PREINIT] == PreInit.ENCODER_DECODER:
        if config[Conf.WORD_EMBEDDING] == WordEmbedding.ONE_HOT:
            noise_size = config[Conf.VOCAB_SIZE]
        else:
            noise_size = config[Conf.EMBEDDING_SIZE]
    else:
        noise_size = config[Conf.NOISE_SIZE]

    if config[Conf.NOISE_MODE] == NoiseMode.REPEAT:
        noise_matrix = np.zeros(
            (config[Conf.BATCH_SIZE], config[Conf.MAX_SEQ_LENGTH], noise_size))
        for batch_index in range(config[Conf.BATCH_SIZE]):
            word_noise = np.random.normal(size=noise_size)
            for word_index in range(config[Conf.MAX_SEQ_LENGTH]):
                noise_matrix[batch_index][word_index] = word_noise

        return noise_matrix

    elif config[Conf.NOISE_MODE] == NoiseMode.REPEAT_SINGLE:
        noise_matrix = np.zeros((config[Conf.BATCH_SIZE], noise_size))
        for batch_index in range(config[Conf.BATCH_SIZE]):
            noise_matrix[batch_index] = np.random.normal(size=noise_size)

        return noise_matrix

    elif config[Conf.NOISE_MODE] == NoiseMode.NEW:
        return np.random.rand(config[Conf.BATCH_SIZE],
                              config[Conf.MAX_SEQ_LENGTH], noise_size)

    elif config[Conf.NOISE_MODE] == NoiseMode.FIRST_ONLY:
        noise_matrix = np.zeros(
            (config[Conf.BATCH_SIZE], config[Conf.MAX_SEQ_LENGTH], noise_size))
        for batch_index in range(config[Conf.BATCH_SIZE]):
            word_noise = np.random.normal(size=noise_size)
            for word_index in range(config[Conf.MAX_SEQ_LENGTH]):
                noise_matrix[batch_index][word_index] = word_noise
        for batch_index in range(config[Conf.BATCH_SIZE]):
            if random.random() < 0.5:
                word_noise = np.zeros(noise_size)
            else:
                word_noise = np.ones(noise_size)
            noise_matrix[batch_index][0] = word_noise
        return noise_matrix

    elif config[Conf.NOISE_MODE] == NoiseMode.ONES:
        return np.ones(
            (config[Conf.BATCH_SIZE], config[Conf.MAX_SEQ_LENGTH], noise_size))

    elif config[Conf.NOISE_MODE] == NoiseMode.ENCODING:
        embedded_data = load_pickle_file(
            "sequence_to_sequence/logs/S2S_2EMB_2017-04-04_VS2+1000_BS128_HD30_DHL1_ED50_SEQ5_WEMword2vec/encoded_data.pkl"
        )
        random_distribution_of_embedded_data = []
        for i in range(config[Conf.BATCH_SIZE]):
            # random_distribution_of_embedded_data.append(embedded_data[np.random.randint(0, len(embedded_data))])
            random_distribution_of_embedded_data.append(embedded_data[i])
        return np.asarray(random_distribution_of_embedded_data)
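
For context, a hypothetical call to generate_input_noise; the Conf and NoiseMode keys are taken from the code above, but the concrete values are illustrative only:

config = {
    Conf.PREINIT: None,  # anything other than PreInit.ENCODER_DECODER
    Conf.NOISE_SIZE: 50,
    Conf.NOISE_MODE: NoiseMode.REPEAT,
    Conf.BATCH_SIZE: 128,
    Conf.MAX_SEQ_LENGTH: 5,
}
noise = generate_input_noise(config)  # shape (128, 5, 50); the same noise vector repeated at every word position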
Example #7
def compare_distributions():
	perplexity = 15
	data = "encoded"

	tsne = TSNE(perplexity=perplexity, n_components=2, init='pca', n_iter=5000)

	suffix = "_lambda"
	# suffix = ""
	if data == "encoded":
		embs = load_pickle_file(
			"sequence_to_sequence/logs/NORM_S2S_2EMB_2017-04-07_VS2+1000_BS128_HD40_DHL1_ED50_SEQ5_WEMword2vec/encoded_data_lambda.pkl")
		embs = embs[:1000, 0, :]
	elif data == "word2vec":
		embs = load_pickle_file("word2vec/saved_models/word2vec_50d1000voc100001steps_embs.pkl")
		embs = embs[:1000, :]

	gaussian_noise = numpy.random.normal(size=embs.shape)
	# embs = normalize(embs, norm="l2")

	combined = numpy.append(gaussian_noise, embs, axis=0)
	embs_pca = tsne.fit_transform(combined)

	plot_collections([embs_pca[:1000], embs_pca[1000:]], ["gaussian-yellow", data + "-blue"], perplexity, suffix)
Example #8
def gen_class_embs():
	# create_common_words_pickle()
	print "Generating classes"
	common_words = load_pickle_file("common_words.p")
	print "Loading captions..."
	filename_caption_text_tuples = fetch_all_caption_text_tuples()[:5000]
	print "Loading word embeddings..."
	word_embedding_dict = dict(fetch_all_word_vectors())
	filename_text_vector_tuples = []
	tot = len(filename_caption_text_tuples)
	counter = 1
	print_progress(counter, tot, prefix="Converting classes to embs")
	for filename, caption in filename_caption_text_tuples:
		classes = get_classes(caption, common_words)
		filename_text_vector_tuples.extend([(filename, c, word_embedding_dict[c]) for c in classes if c in word_embedding_dict])
		counter += 1
		print_progress(counter, tot, prefix="Converting classes to embs")

	save_class_vector_list(filename_text_vector_tuples)
Example #9
def wmd_retrieval(pred_strings, dataset_string_list_sentences):
    filename = get_dict_filename(config[Conf.EMBEDDING_SIZE],
                                 config[Conf.WORD2VEC_NUM_STEPS],
                                 config[Conf.VOCAB_SIZE], config[Conf.W2V_SET])
    word_embedding_dict = load_pickle_file(filename)

    best_sentence_lists = []

    for pred_string in pred_strings:

        score_tuples = []
        for dataset_string_list_sentence in dataset_string_list_sentences:
            dataset_string = " ".join(dataset_string_list_sentence)
            score = get_wmd_distance(pred_string, dataset_string,
                                     word_embedding_dict)
            score_tuples.append((dataset_string, score))
        score_tuples = sorted(score_tuples, key=lambda x: x[1], reverse=False)
        result = [x[0] for x in score_tuples[:5]]

        best_sentence_lists.append(result)

    return best_sentence_lists
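
get_wmd_distance is another project helper that is not shown on this page. Purely for illustration, a hypothetical stand-in based on gensim's Word Mover's Distance (an assumption, not the repository's code) could look like this:

import numpy as np
from gensim.models import KeyedVectors

def get_wmd_distance(pred_string, dataset_string, word_embedding_dict):
    # Hypothetical stand-in: load the embedding dict into a KeyedVectors model
    # and compute Word Mover's Distance between the two token sequences.
    # (Rebuilding the model on every call is wasteful; a real implementation would
    # build it once. WMD also needs gensim's optional pyemd/POT dependency.)
    words = list(word_embedding_dict.keys())
    vectors = np.asarray([word_embedding_dict[w] for w in words])
    kv = KeyedVectors(vector_size=vectors.shape[1])
    kv.add_vectors(words, vectors)
    return kv.wmdistance(pred_string.split(), dataset_string.split())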
Example #10
def background_wmd_retrieval(pred_strings, dataset_string_list_sentences):
    filename = get_dict_filename(config[Conf.EMBEDDING_SIZE],
                                 config[Conf.WORD2VEC_NUM_STEPS],
                                 config[Conf.VOCAB_SIZE], config[Conf.W2V_SET])
    word_embedding_dict = load_pickle_file(filename)

    counter = Value('i', 0)
    sentence_count = len(pred_strings)
    cpu_count = multiprocessing.cpu_count()
    print "CPUs:", cpu_count
    if cpu_count > 8 and cpu_count < 15:
        cpu_count = 10
    elif cpu_count > 40:
        cpu_count = 40
    print "Starting pool with %s processes" % cpu_count
    pool = Pool(cpu_count, initializer=init, initargs=(counter, ))
    tuple_array = [(pred_string, dataset_string_list_sentences,
                    word_embedding_dict, sentence_count)
                   for pred_string in pred_strings]
    best_sentence_lists = pool.map(background_wmd, tuple_array, chunksize=1)
    pool.close()
    pool.join()

    return best_sentence_lists
Example #11
def plotter():
    colors = ['#F95400', '#0C56A2', '#F9DC00', '#00A670', '#C60074']
    seqgan_dict = load_pickle_file(
        "/Users/markus/workspace/master/Master/seqgan_score_dict.p")
    emb_dict = load_pickle_file(
        "/Users/markus/workspace/master/Master/emb_score_dict.p")
    color_seqgan = colors.pop(0)
    colors.pop(0)  # discard the second palette colour
    color_emb = colors.pop(0)
    plt.rc('font', family='Arial')
    buckets = 10
    plt.rcParams.update({'font.size': 20})
    seqgan_count = [0 for _ in range(buckets)]
    seqgan_count_uniq = [0 for _ in range(buckets)]

    emb_count = [0 for _ in range(buckets)]
    emb_count_uniq = [0 for _ in range(buckets)]

    # bucket each sentence's beta score c into one of 10 bins; u is its occurrence count
    for (n, (c, u)) in seqgan_dict.iteritems():
        seqgan_count[int(c * 10 - 1)] += u
        seqgan_count_uniq[int(c * 10 - 1)] += 1

    for (n, (c, u)) in emb_dict.iteritems():
        emb_count[int(c * 10 - 1)] += u
        emb_count_uniq[int(c * 10 - 1)] += 1

    ind = np.arange(buckets)  # the x locations for the groups
    # width = 0.35  # the width of the bars
    width = 0.49  # the width of the bars
    alpha_uniq = 0.4

    fig, ax = plt.subplots()
    # axes.set_xlim([0.5, 1.0])
    # axes.set_ylim([ymin,ymax])

    seqgan_bars = ax.bar(ind, seqgan_count, width, color=color_seqgan)
    seqgan_bars_uniq = ax.bar(ind,
                              seqgan_count_uniq,
                              width,
                              color='black',
                              alpha=alpha_uniq)
    emb_bars = ax.bar(ind + width, emb_count, width, color=color_emb)
    emb_bars_uniq = ax.bar(ind + width,
                           emb_count_uniq,
                           width,
                           color='black',
                           alpha=alpha_uniq)

    # add some text for labels, title and axes ticks
    ax.set_ylabel('Count')
    ax.set_xlabel(u'β')
    ax.set_xticks(ind + width / 2)
    x_tick_labels = []
    for i in range(buckets):
        x_tick_labels.append("%.1f-%.1f" %
                             (float(i) / buckets, float(i + 1) / buckets))
        # x_tick_labels.append("%.1f" % (float(i + 1) / buckets))
    ax.set_xticklabels(x_tick_labels)
    # ax.set_xlabel(x_tick_labels)
    plt.tick_params(
        axis='x',
        which='both',
        bottom='off',
    )
    ax.legend((seqgan_bars[0], emb_bars[0]),
              ('SeqGAN', 'Word Embedding Model'),
              fontsize=20)

    autolabel(seqgan_bars_uniq, ax, seqgan_count)
    autolabel(emb_bars_uniq, ax, emb_count)
    plt.show()
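
autolabel is a project helper that is not defined on this page; judging from how it is called, it writes the total counts above the semi-transparent "unique" bars. A hypothetical sketch (an assumption, not the repository's implementation):

def autolabel(bars, ax, counts):
    # Hypothetical helper: annotate each bar with the corresponding total count.
    for bar, count in zip(bars, counts):
        ax.text(bar.get_x() + bar.get_width() / 2.0, bar.get_height(),
                str(count), ha='center', va='bottom')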