def main(extractor_type, test_image_index, download, raw_data_path,
         processed_data_path, n_clusters, kmeans_data_path, process_data):
    raw_data_path = os.path.abspath(raw_data_path)
    processed_data_path = os.path.abspath(processed_data_path)
    if download:
        # Download CIFAR-10 and cache the raw splits as pickles.
        (x_train, y_train), (x_test, y_test) = cifar10.load_data()
        utils.save_pickle(raw_data_path + '/x_train.pkl', x_train)
        utils.save_pickle(raw_data_path + '/y_train.pkl', y_train)
        utils.save_pickle(raw_data_path + '/x_test.pkl', x_test)
        utils.save_pickle(raw_data_path + '/y_test.pkl', y_test)
    kmeans = None
    if kmeans_data_path:
        # Reuse a previously fitted k-means model instead of re-clustering.
        kmeans_data_path = os.path.abspath(kmeans_data_path)
        kmeans = utils.load_pickle(kmeans_data_path)
    extractor = get_extractor(extractor_type, raw_data_path, processed_data_path,
                              n_clusters, kmeans, process_data)
    test_data = utils.load_pickle(raw_data_path + '/x_test.pkl')
    im = test_data[test_image_index]
    # Retrieve the nearest training images for the query and display them.
    _, nns = extractor.get_knns(im)
    display_nn(im, nns)
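# A minimal sketch of how main() could be wired to a command line, assuming
# argparse; the flag names and defaults below are hypothetical, not taken
# from the repo.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Nearest-neighbour image search on CIFAR-10')
    parser.add_argument('--extractor-type', default='raw')
    parser.add_argument('--test-image-index', type=int, default=0)
    parser.add_argument('--download', action='store_true')
    parser.add_argument('--raw-data-path', default='data/raw')
    parser.add_argument('--processed-data-path', default='data/processed')
    parser.add_argument('--n-clusters', type=int, default=10)
    parser.add_argument('--kmeans-data-path', default=None)
    parser.add_argument('--process-data', action='store_true')
    a = parser.parse_args()
    main(a.extractor_type, a.test_image_index, a.download, a.raw_data_path,
         a.processed_data_path, a.n_clusters, a.kmeans_data_path, a.process_data)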
def __init__(self, extractor, extractor_type, raw_data_path, processed_data_path=None,
             n_clusters=10, kmeans=None, process_data=False):
    self.processed_data_path = processed_data_path
    self.extractor = extractor
    self.extractor_type = extractor_type
    self.raw_data_path = raw_data_path
    self.train_data = utils.load_pickle(raw_data_path + '/x_train.pkl')
    self.index = np.arange(len(self.train_data))
    self.kmeans = kmeans
    if process_data:
        # _process_data() writes the extracted features to disk and returns
        # nothing; the features are loaded back from the pickle below.
        self._process_data()
    self.train_data_proc = utils.load_pickle(
        processed_data_path + '/' + self.extractor_type + '/train_data.pkl')
    if kmeans is None:
        self.kmeans = self._cluster_images(n_clusters)
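# The utils.save_pickle / utils.load_pickle helpers used throughout are not
# shown in this section; a minimal sketch consistent with how they are called:
import os
import pickle

def save_pickle(path, obj):
    # Create parent directories on demand so callers can pass nested paths.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)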
def get_verb_embs(self, emb_type="init", save_path=None):
    # Return cached embeddings if they were extracted in a previous run.
    if save_path is not None and os.path.exists(save_path):
        result = load_pickle(filename=save_path)
        return result["vocab"], result["vectors"]
    verb_dict = self.verb_dict.copy()
    verb_dict.pop(UNK)  # remove UNK
    if emb_type == "init":
        verb_embs = self.sess.run(self.verb_embeddings)[1:]  # remove UNK
        verb_vocab, verb_vectors = list(), list()
        for verb, idx in tqdm(verb_dict.items(), total=len(verb_dict),
                              desc="extract verb embeddings"):
            verb_vocab.append(verb)
            verb_vectors.append(verb_embs[idx])
        result = {"vocab": verb_vocab, "vectors": np.asarray(verb_vectors)}
    elif emb_type == "target":
        verb_vocab, verb_vectors = list(), list()
        for verb, idx in tqdm(verb_dict.items(), total=len(verb_dict),
                              desc="extract verb representations"):
            verb_vector = self.sess.run(self.target_verb, feed_dict={self.verb: [idx]})
            verb_vocab.append(verb)
            verb_vectors.append(np.reshape(verb_vector, newshape=(self.cfg.k,)))
        result = {"vocab": verb_vocab, "vectors": np.asarray(verb_vectors)}
    else:
        raise ValueError("Unknown emb type...")
    if save_path is not None:
        write_pickle(result, filename=save_path)
    return result["vocab"], result["vectors"]
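# Example usage, assuming `model` is a hypothetical trained instance of this
# class with a live TF session; the cache path is illustrative. The learned
# target representations are extracted once and reused from disk afterwards.
verb_vocab, verb_vectors = model.get_verb_embs(emb_type="target",
                                               save_path="ckpt/verb_embs.pkl")
print(len(verb_vocab), verb_vectors.shape)  # vocab size x cfg.k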
def clustering(vectors, vocab, num_clusters, cluster_method="kmeans",
               save_path=None, norm=True, norm_method="l2"):
    # Return cached clusters if they were computed in a previous run.
    if save_path is not None and os.path.exists(save_path):
        return load_pickle(save_path)
    if norm:
        vectors = normalize_vectors(vectors, norm_method=norm_method)
    print("k-means clustering...")
    labels, centroids, score, silhouette_score = kmeans_clustering(
        vectors, clusters=num_clusters, init="k-means++", n_init=20,
        max_iter=10000, tol=1e-12, verbose=0)
    print("Score (opposite of the k-means objective value): {}".format(score))
    print("Silhouette score: {}".format(silhouette_score))
    clusters = compute_distance(vocab=vocab, labels=labels, vectors=vectors,
                                centroids=centroids, dist_method="cosine",
                                keep_score=False)
    if cluster_method == "kmeans":
        if save_path is not None:
            write_pickle(clusters, filename=save_path)
        return clusters
    elif cluster_method == "knearest":
        clusters_dict = dict()
        for cluster_idx, verbs in tqdm(clusters.items(), total=len(clusters),
                                       desc="compute k-nearest verbs"):
            # The first verb in each cluster (closest to the centroid) seeds
            # the k-nearest-neighbour expansion.
            key_verb = verbs[0]
            sub_verbs = compute_knearest(verb=key_verb, vocab=vocab, vectors=vectors,
                                         dist_method="cosine", top_k=100)
            clusters_dict[cluster_idx] = [key_verb] + sub_verbs
        if save_path is not None:
            write_pickle(clusters_dict, filename=save_path)
        return clusters_dict
    else:
        raise ValueError("Unsupported clustering method, only [kmeans | knearest] are supported!")
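# kmeans_clustering is defined elsewhere; a sketch of a compatible
# implementation on top of scikit-learn, matching the signature and the four
# return values consumed above (this is an assumption, not the repo's code):
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score as sk_silhouette_score

def kmeans_clustering(vectors, clusters, init="k-means++", n_init=20,
                      max_iter=10000, tol=1e-12, verbose=0):
    km = KMeans(n_clusters=clusters, init=init, n_init=n_init,
                max_iter=max_iter, tol=tol, verbose=verbose)
    labels = km.fit_predict(vectors)
    # score() returns the negative inertia, i.e. the opposite of the k-means objective.
    score = km.score(vectors)
    sil = sk_silhouette_score(vectors, labels, metric="cosine")
    return labels, km.cluster_centers_, score, sil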
def _process_data(self):
    # Extract features for both splits and persist them to disk; callers load
    # the resulting pickles rather than receiving a return value.
    phases = ['train', 'test']
    for phase in phases:
        data = utils.load_pickle(self.raw_data_path + '/x_' + phase + '.pkl')
        feat = utils.apply_func_to_data(self.extract_features, data)
        utils.save_pickle(
            self.processed_data_path + '/' + self.extractor_type + '/' + phase + '_data.pkl',
            feat)
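# utils.apply_func_to_data is assumed to map the extractor over every image
# and stack the results; a minimal sketch of that behaviour (the real helper
# may batch the work differently):
import numpy as np

def apply_func_to_data(func, data):
    return np.stack([func(x) for x in data])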
def test(args):
    if args.thread_restrict is True:
        cfg_proto = tf.ConfigProto(intra_op_parallelism_threads=2)
    else:
        cfg_proto = None
    with tf.Session(config=cfg_proto) as sess:
        # Loading the vocabulary files
        vocab, rev_vocab = load_vocab(args)
        args.vocab_size = len(rev_vocab)
        # Hacky way to get seq_len from the test set
        test_set = load_pickle(args, split='test')
        args.config.seq_len = test_set[0]['sentence_len']
        # Creating the evaluation model
        if args.config.elmo is True:
            elmo = hub.Module("https://tfhub.dev/google/elmo/1", trainable=True)
        else:
            elmo = None
        with tf.variable_scope("model", reuse=None):
            model_test = SentimentModel(args, queue=None, mode='eval', elmo=elmo)
        # Reload model from checkpoints, if any
        steps_done = initialize_weights(sess, model_test, args, mode='test')
        logger.info("loaded %d completed steps", steps_done)
        for split in args.eval_splits.split(','):
            test_set = load_pickle(args, split=split)
            results, losses = evaluate(sess, model_test, test_set, args)
            if args.mode != 'train':
                detailed_results(args, split, test_set, rev_vocab, results)
            percent_correct = float(len(results['correct'])) * 100.0 / len(test_set)
            logger.info("correct predictions on %s - %.4f. Eval Losses - %.4f",
                        split, percent_correct, losses)
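# initialize_weights is defined elsewhere; a sketch of the usual TF1 restore
# logic it is assumed to follow. `args.train_dir` and the step-suffix
# convention are assumptions for illustration, not the repo's confirmed API.
def initialize_weights(sess, model, args, mode='test'):
    saver = tf.train.Saver(tf.trainable_variables())
    ckpt = tf.train.get_checkpoint_state(args.train_dir)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        # Checkpoint filenames conventionally end in the global step count.
        return int(ckpt.model_checkpoint_path.split('-')[-1])
    sess.run(tf.global_variables_initializer())
    return 0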
def analysis(args):
    if args.thread_restrict is True:
        cfg_proto = tf.ConfigProto(intra_op_parallelism_threads=2)
    else:
        cfg_proto = None
    with tf.Session(config=cfg_proto) as sess:
        # Loading the vocabulary files
        vocab, rev_vocab = load_vocab(args)
        args.vocab_size = len(rev_vocab)
        # Derive seq_len from the training set and evaluate one example at a time
        train_set = load_pickle(args, split='train')
        args.config.seq_len = train_set[0]['sentence_len']
        args.config.eval_batch_size = 1
        # Creating the evaluation model
        if args.config.elmo is True:
            elmo = hub.Module("https://tfhub.dev/google/elmo/1", trainable=True)
        else:
            elmo = None
        with tf.variable_scope("model", reuse=None):
            model_test = SentimentModel(args, queue=None, mode='eval', elmo=elmo)
        # Reload model from checkpoints, if any
        steps_done = initialize_weights(sess, model_test, args, mode='test')
        logger.info("loaded %d completed steps", steps_done)
        # Append logic-rule features to every split before the embedding analysis
        logicnn.append_features(args, train_set, model_test, vocab, rev_vocab)
        dev_set = load_pickle(args, split='dev')
        logicnn.append_features(args, dev_set, model_test, vocab, rev_vocab)
        test_set = load_pickle(args, split='test')
        logicnn.append_features(args, test_set, model_test, vocab, rev_vocab)
        if args.config.elmo is True:
            elmo_embedding_analysis(sess, model_test, test_set)
        else:
            w2v_embedding_analysis(sess, model_test, test_set)