Example #1
def main(extractor_type, test_image_index, download, raw_data_path,
         processed_data_path, n_clusters, kmeans_data_path, process_data):

    raw_data_path = os.path.abspath(raw_data_path)
    processed_data_path = os.path.abspath(processed_data_path)

    # Optionally download CIFAR-10 and cache the raw arrays as pickles.
    if download:
        (x_train, y_train), (x_test, y_test) = cifar10.load_data()

        utils.save_pickle(raw_data_path + '/x_train.pkl', x_train)
        utils.save_pickle(raw_data_path + '/y_train.pkl', y_train)
        utils.save_pickle(raw_data_path + '/x_test.pkl', x_test)
        utils.save_pickle(raw_data_path + '/y_test.pkl', y_test)

    kmeans = None
    # Optionally reuse a previously fitted k-means model.
    if kmeans_data_path:
        kmeans_data_path = os.path.abspath(kmeans_data_path)
        kmeans = utils.load_pickle(kmeans_data_path)

    extractor = get_extractor(extractor_type, raw_data_path,
                              processed_data_path, n_clusters, kmeans,
                              process_data)
    # Fetch the chosen test image, retrieve its nearest neighbours and display them.
    test_data = utils.load_pickle(raw_data_path + '/x_test.pkl')
    im = test_data[test_image_index]
    _, nns = extractor.get_knns(im)
    display_nn(im, nns)
Example #2
    def __init__(self,
                 extractor,
                 extractor_type,
                 raw_data_path,
                 processed_data_path=None,
                 n_clusters=10,
                 kmeans=None,
                 process_data=False):
        self.processed_data_path = processed_data_path
        self.extractor = extractor
        self.extractor_type = extractor_type
        self.raw_data_path = raw_data_path
        self.train_data = utils.load_pickle(raw_data_path + '/x_train.pkl')
        self.index = np.array(range(len(self.train_data)))
        self.kmeans = kmeans

        if process_data:
            # _process_data() extracts features and writes them to disk;
            # the cached pickle is then loaded below.
            self._process_data()

        self.train_data_proc = utils.load_pickle(processed_data_path + '/' +
                                                 self.extractor_type +
                                                 '/train_data.pkl')

        if kmeans is None:
            self.kmeans = self._cluster_images(n_clusters)
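
The get_knns call used in Example #1 is not shown in these snippets. A plausible reading of the class above is: assign the query's features to a k-means cluster, then rank the training items inside that cluster by distance. A minimal self-contained sketch of that idea on random vectors (the array names, shapes and neighbour count below are illustrative, not the project's API):

import numpy as np
from sklearn.cluster import KMeans

# Stand-ins for the processed training features and one processed query image.
rng = np.random.default_rng(0)
train_feats = rng.normal(size=(500, 64))
query = rng.normal(size=(64,))

# Cluster the training features, find the query's cluster, then rank the
# members of that cluster by Euclidean distance to the query.
kmeans = KMeans(n_clusters=10, n_init=10).fit(train_feats)
cluster_id = kmeans.predict(query[None, :])[0]
members = np.flatnonzero(kmeans.labels_ == cluster_id)
dists = np.linalg.norm(train_feats[members] - query, axis=1)
nearest = members[np.argsort(dists)[:5]]   # indices of the 5 closest items
print(nearest)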
Example #3
    def get_verb_embs(self, emb_type="init", save_path=None):
        if save_path is not None and os.path.exists(save_path):
            result = load_pickle(filename=save_path)
            return result["vocab"], result["vectors"]

        verb_dict = self.verb_dict.copy()
        verb_dict.pop(UNK)  # remove UNK

        if emb_type == "init":
            verb_embs = self.sess.run(self.verb_embeddings)[1:]  # remove UNK
            verb_vocab, verb_vectors = list(), list()

            for verb, idx in tqdm(verb_dict.items(), total=len(verb_dict), desc="extract verb embeddings"):
                verb_vocab.append(verb)
                verb_vectors.append(verb_embs[idx])

            result = {"vocab": verb_vocab, "vectors": np.asarray(verb_vectors)}

        elif emb_type == "target":
            verb_vocab, verb_vectors = list(), list()

            for verb, idx in tqdm(verb_dict.items(), total=len(verb_dict), desc="extract verb representations"):
                verb_vector = self.sess.run(self.target_verb, feed_dict={self.verb: [idx]})
                verb_vocab.append(verb)
                verb_vectors.append(np.reshape(verb_vector, newshape=(self.cfg.k, )))

            result = {"vocab": verb_vocab, "vectors": np.asarray(verb_vectors)}

        else:
            raise ValueError("Unknown emb type...")

        if save_path is not None:
            write_pickle(result, filename=save_path)

        return result["vocab"], result["vectors"]
Example #4
def clustering(vectors,
               vocab,
               num_clusters,
               cluster_method="kmeans",
               save_path=None,
               norm=True,
               norm_method="l2"):
    if save_path is not None and os.path.exists(save_path):
        return load_pickle(save_path)
    else:
        if norm:
            vectors = normalize_vectors(vectors, norm_method=norm_method)

        print("k-means clustering...")
        labels, centroids, score, silhouette_score = kmeans_clustering(
            vectors,
            clusters=num_clusters,
            init="k-means++",
            n_init=20,
            max_iter=10000,
            tol=1e-12,
            verbose=0)
        print("K-means score (the opposite of the value of the embeddings "
              "on the K-means objective): {}".format(score))
        print("Silhouette score: {}".format(silhouette_score))

        clusters = compute_distance(vocab=vocab,
                                    labels=labels,
                                    vectors=vectors,
                                    centroids=centroids,
                                    dist_method="cosine",
                                    keep_score=False)

        if cluster_method == "kmeans":
            write_pickle(clusters, filename=save_path)
            return clusters

        elif cluster_method == "knearest":
            clusters_dict = dict()
            for cluster_idx, verb in tqdm(clusters.items(),
                                          total=len(clusters),
                                          desc="compute k-nearest verbs"):
                # key_verb = next(iter(verb))
                key_verb = verb[0]
                sub_verbs = compute_knearest(verb=key_verb,
                                             vocab=vocab,
                                             vectors=vectors,
                                             dist_method="cosine",
                                             top_k=100)
                clusters_dict[cluster_idx] = [key_verb] + sub_verbs
            if save_path is not None:
                write_pickle(clusters_dict, filename=save_path)
            return clusters_dict

        else:
            raise ValueError(
                "Unsupported cluster_method, only [kmeans | knearest] are supported"
            )
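
kmeans_clustering itself is not part of these snippets. Assuming it wraps scikit-learn, an equivalent that returns the four values unpacked above (labels, centroids, K-means score, silhouette score) could look like this; the smoke-test data is illustrative:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def kmeans_clustering(vectors, clusters, init="k-means++", n_init=20,
                      max_iter=10000, tol=1e-12, verbose=0):
    # Fit K-means and return labels, centroids, the K-means score
    # (KMeans.score, i.e. the opposite of the inertia) and the mean
    # silhouette score over all samples.
    km = KMeans(n_clusters=clusters, init=init, n_init=n_init,
                max_iter=max_iter, tol=tol, verbose=verbose)
    labels = km.fit_predict(vectors)
    return (labels, km.cluster_centers_, km.score(vectors),
            silhouette_score(vectors, labels))

# Smoke test on random vectors.
vecs = np.random.default_rng(0).normal(size=(200, 16))
labels, centroids, score, sil = kmeans_clustering(vecs, clusters=5)
print(score, sil)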
Example #5
    def _process_data(self):
        # Extract features for both splits and cache them per extractor type.
        phases = ['train', 'test']
        for phase in phases:
            data = utils.load_pickle(self.raw_data_path + '/x_' + phase +
                                     '.pkl')
            feat = utils.apply_func_to_data(self.extract_features, data)
            utils.save_pickle(
                self.processed_data_path + '/' + self.extractor_type + '/' +
                phase + '_data.pkl', feat)
Example #6
def test(args):
    if args.thread_restrict:
        cfg_proto = tf.ConfigProto(intra_op_parallelism_threads=2)
    else:
        cfg_proto = None
    with tf.Session(config=cfg_proto) as sess:
        # Loading the vocabulary files
        vocab, rev_vocab = load_vocab(args)
        args.vocab_size = len(rev_vocab)
        # Creating test model

        # Hacky way to get seq_len
        test_set = load_pickle(args, split='test')
        args.config.seq_len = test_set[0]['sentence_len']

        # Optionally load the ELMo module from TF Hub
        if args.config.elmo:
            elmo = hub.Module("https://tfhub.dev/google/elmo/1",
                              trainable=True)
        else:
            elmo = None

        with tf.variable_scope("model", reuse=None):
            model_test = SentimentModel(args,
                                        queue=None,
                                        mode='eval',
                                        elmo=elmo)
        # Reload model from checkpoints, if any
        steps_done = initialize_weights(sess, model_test, args, mode='test')
        logger.info("loaded %d completed steps", steps_done)

        for split in args.eval_splits.split(','):
            test_set = load_pickle(args, split=split)
            results, losses = evaluate(sess, model_test, test_set, args)
            if args.mode != 'train':
                detailed_results(args, split, test_set, rev_vocab, results)
            percent_correct = float(len(
                results['correct'])) * 100.0 / len(test_set)
            logger.info("correct predictions on %s - %.4f. Eval Losses - %.4f",
                        split, percent_correct, losses)
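
The thread_restrict branch only caps TensorFlow's intra-op thread pool before the session is opened. A standalone illustration of that setup (requires the TensorFlow 1.x API):

import tensorflow as tf

# Cap intra-op parallelism at 2 threads, as the thread_restrict branch does.
cfg_proto = tf.ConfigProto(intra_op_parallelism_threads=2)
with tf.Session(config=cfg_proto) as sess:
    print(sess.run(tf.constant(1) + tf.constant(2)))  # prints 3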
Example #7
def analysis(args):
    if args.thread_restrict:
        cfg_proto = tf.ConfigProto(intra_op_parallelism_threads=2)
    else:
        cfg_proto = None
    with tf.Session(config=cfg_proto) as sess:
        # Loading the vocabulary files
        vocab, rev_vocab = load_vocab(args)
        args.vocab_size = len(rev_vocab)
        # Creating test model

        train_set = load_pickle(args, split='train')
        args.config.seq_len = train_set[0]['sentence_len']
        args.config.eval_batch_size = 1
        # Optionally load the ELMo module from TF Hub
        if args.config.elmo:
            elmo = hub.Module("https://tfhub.dev/google/elmo/1", trainable=True)
        else:
            elmo = None

        with tf.variable_scope("model", reuse=None):
            model_test = SentimentModel(args, queue=None, mode='eval', elmo=elmo)

        # Reload model from checkpoints, if any
        steps_done = initialize_weights(sess, model_test, args, mode='test')
        logger.info("loaded %d completed steps", steps_done)

        logicnn.append_features(args, train_set, model_test, vocab, rev_vocab)

        dev_set = load_pickle(args, split='dev')
        logicnn.append_features(args, dev_set, model_test, vocab, rev_vocab)

        test_set = load_pickle(args, split='test')
        logicnn.append_features(args, test_set, model_test, vocab, rev_vocab)

        if args.config.elmo:
            elmo_embedding_analysis(sess, model_test, test_set)
        else:
            w2v_embedding_analysis(sess, model_test, test_set)