def seed_db(force=False):
    """
    Use the `config/db/sp.yml` to generate example datasets and store them into db.
    :return: None
    """
    config_dir = get_path('config/db', 'sp.yml')
    with open(config_dir, 'r') as f:
        config = yaml.safe_load(f)['datasets']
    for seed in config:
        print('seeding {:s} data'.format(seed['name']))
        data_dir = get_path('cached_data', seed['dir'])
        if not path_exists(data_dir):
            print('Directory "{:s}" not available!'.format(data_dir))
            continue
        seed['scheme'].update({'upsert': force})
        if seed['type'] == 'sst':
            store_sst(data_dir, seed['name'], **seed['scheme'])
        elif seed['type'] == 'imdb':
            store_imdb(data_dir, seed['name'], **seed['scheme'])
        elif seed['type'] == 'yelp':
            store_yelp(data_dir, seed['name'], **seed['scheme'])
        else:
            print('not able to seed datasets with type: {:s}'.format(seed['type']))
            continue
        dataset_inserted(seed['name'], 'sp')
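
# Example usage (a sketch, assuming MongoDB is running and the datasets listed
# in `config/db/sp.yml` have been downloaded into `cached_data/`):
#
#     seed_db()            # insert only datasets that are not yet in the db
#     seed_db(force=True)  # upsert: overwrite records that already exist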
def get_pos_statistics(data_name, model_name, top_k=500):
    """
    For each of the top_k most frequent words, compute the ratio of each POS tag
    assigned to that word. Results are cached in a .pkl file.
    :param data_name: name of the dataset
    :param model_name: name of the model
    :param top_k: number of most frequent words to return statistics for
    :return: a list of dicts of the form {'id': word_id, 'ratio': Counter({tag: ratio})}
    """
    top = 100 if top_k <= 100 else 500 if top_k <= 500 else 1000
    tmp_file = '-'.join([data_name, model_name, 'pos_ratio', str(top)]) + '.pkl'
    tmp_file = get_path(_tmp_dir, tmp_file)

    def cal_fn():
        word_ids, tags = load_words_and_state(data_name, model_name, 'pos', diff=False)
        ids_tags = sort_by_id(word_ids, tags)
        tags_counters = []
        for i, word_tags in enumerate(ids_tags):
            if word_tags is None:
                continue
            counter = Counter(word_tags)
            total = len(word_tags)
            # normalize counts into ratios
            for key, count in counter.items():
                counter[key] = count / total
            tags_counters.append({'id': i, 'ratio': counter})
        return tags_counters

    return maybe_calculate(tmp_file, cal_fn)[:top_k]
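
# Example (a sketch; 'ptb' and 'lstm-large' are hypothetical dataset/model names):
#
#     ratios = get_pos_statistics('ptb', 'lstm-large', top_k=100)
#     # each entry looks like {'id': word_id, 'ratio': Counter({'NN': 0.7, 'VB': 0.3, ...})}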
def _load_model(self, name, train=False):
    """
    Build a model from its config file and register it in self._models.
    :param name: the name of the model, should be in self._available_models
    :param train: if False, restore the trained variables from the model directory
    :return: True if the model is loaded, else False
    """
    if name in self._available_models:
        config_file = get_path(_config_dir, self._available_models[name]['config'])
        model, train_config = build_model(config_file)
        model.add_generator()
        model.add_evaluator(1, 1, 100, True, log_gates=True, log_pos=True)
        if not train:
            # If not training, the model should already be trained
            assert_path_exists(get_path(_model_dir, model.name))
            model.restore()
        self._models[name] = model
        self._train_configs[name] = train_config
        return True
    else:
        print('WARN: Cannot find model with name {:s}'.format(name))
        return False
def get_config_filename(self, name):
    """
    Get the config file path of a given model
    :param name: the name of the model, should be in _available_models
    :return: file path if the name is in _available_models, else None
    """
    if name in self._available_models:
        return get_path(_config_dir, self._available_models[name]['config'])
    else:
        return None
def seed_db(force=False):
    """
    Use the `config/db/lm.yml` to generate example datasets and store them into db.
    :return: None
    """
    config_dir = get_path('config/db', 'lm.yml')
    with open(config_dir, 'r') as f:
        config = yaml.safe_load(f)['datasets']
    for seed in config:
        data_dir = get_path('cached_data', seed['dir'])
        print('seeding {:s} data'.format(seed['name']))
        if seed['type'] == 'ptb':
            store_ptb(data_dir, seed['name'], force)
        elif seed['type'] == 'text':
            seed['scheme'].update({'upsert': force})
            store_plain_text(data_dir, seed['name'], **seed['scheme'])
        else:
            print('no seed function for type: {:s}'.format(seed['type']))
            continue
        dataset_inserted(seed['name'], 'lm', force)
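
# Example usage (a sketch, assuming MongoDB is running and the datasets listed
# in `config/db/lm.yml` have been downloaded into `cached_data/`):
#
#     seed_db()            # insert the configured language modeling datasets
#     seed_db(force=True)  # force re-insertion (upsert) of existing records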
def restore(self, path=None):
    """
    Restore the model variables from the latest checkpoint under a given path
    :param path: the directory to search for checkpoints, defaults to self.logdir
    :return: None
    """
    if not self.finalized:
        self.finalize()
    path = path if path is not None else self.logdir
    checkpoint = tf.train.latest_checkpoint(path)
    self._saver.restore(self.sess, checkpoint)
    print("Model variables restored from {}.".format(
        get_path(path, absolute=True)))
def get_state_signature(data_name, model_name, state_name, layer=None, sample_size=5000, dim=50):
    """
    A helper function that samples the state records and optionally does PCA
    (when `dim` is not None). The results will be cached on disk.
    :param data_name: str
    :param model_name: str
    :param state_name: str
    :param layer: a layer index (starting from 0) or a list of indices; None means all layers
    :param sample_size: the number of state records to sample
    :param dim: the target dimension of PCA, or None to skip PCA
    :return: the sampled (and possibly PCA-projected) state matrix
    """
    if layer is not None:
        if not isinstance(layer, list):
            layer = [layer]
    layer_str = 'all' if layer is None else ''.join([str(l) for l in layer])
    file_name = '-'.join([
        data_name, model_name, state_name, layer_str,
        str(sample_size), str(dim) if dim is not None else str(sample_size)
    ]) + '.pkl'
    file_name = get_path(_tmp_dir, file_name)

    def cal_fn(layers):
        words, states = load_words_and_state(data_name, model_name, state_name, diff=False)
        layers = layers if layers is not None else list(range(states[0].shape[0]))
        state_layers = []
        for l in layers:
            state_layers.append([state[l, :] for state in states])
        states_mat = np.hstack(state_layers).T
        print("sampling")
        sample_idx = np.random.randint(0, states_mat.shape[1], sample_size)
        sample = states_mat[:, sample_idx]
        if dim is not None:
            print("doing PCA...")
            sample, variance = tsne.pca(sample, dim)
            print("PCA kept {:f}% of variance".format(variance * 100))
        return sample

    return maybe_calculate(file_name, cal_fn, layer)
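
# Example (a sketch; 'ptb' and 'lstm-large' are hypothetical names):
#
#     # sample 5000 state vectors of layer 1 and project them to 50 dims with PCA
#     signature = get_state_signature('ptb', 'lstm-large', 'state_c', layer=1,
#                                     sample_size=5000, dim=50)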
def save(self, path=None):
    """
    Save the model to a given path
    :param path: the checkpoint path prefix, defaults to `<logdir>/model`
    :return: None
    """
    if not self.finalized:
        self.finalize()
    path = path if path is not None else os.path.join(self.logdir, 'model')
    before_save(path)
    self._saver.save(self.sess, path)
    print("Model variables saved to {}.".format(
        get_path(path, absolute=True)))
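
# Example (a sketch): `save()` and `restore()` are symmetric,
#
#     model.save()                     # checkpoints to <logdir>/model
#     model.restore()                  # restores from the latest checkpoint in logdir
#     model.restore('path/to/logdir')  # or from an explicit checkpoint directory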
def get_datasets_by_name(name, fields=None):
    """
    Fetch the cached fields of a dataset: the 'train'/'valid'/'test' splits are
    loaded from json files on disk, other fields from the corresponding db collections.
    :param name: the name of the dataset
    :param fields: a list of field names, defaults to
        ['word_to_id', 'id_to_word', 'train', 'valid', 'test']
    :return: a dict {field: data}, or None if any db field is missing
    """
    complete_data = {}
    fields = ['word_to_id', 'id_to_word', 'train', 'valid', 'test'] if fields is None else fields
    for c_name in fields:
        if c_name in ['train', 'test', 'valid']:
            with open(get_path(get_dataset_path(name), c_name)) as f:
                complete_data[c_name] = json.load(f)
            continue
        results = db_hdlr[c_name].find_one({'name': name})
        if results is None:
            print('WARN: No data in collection {:s} of db {:s} named {:s}'.format(c_name, _db_name, name))
            return None
        complete_data[c_name] = results['data']
    if 'word_to_id' in complete_data:
        complete_data['word_to_id'] = json.loads(complete_data['word_to_id'])
    return complete_data
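
# Example (a sketch; 'ptb' is a hypothetical dataset name):
#
#     data = get_datasets_by_name('ptb', ['word_to_id', 'train'])
#     if data is not None:
#         vocab = data['word_to_id']  # dict: word -> id
#         train_ids = data['train']   # loaded from the cached json file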
def __init__(self):
    """Load the model configs and try to load every available model."""
    with open(get_path('config', 'models.yml')) as f:
        try:
            self._available_models = yaml.safe_load(f)
        except yaml.YAMLError:
            raise ValueError("Malformed config file!")
    self._models = {}
    self._train_configs = {}
    self.record_flag = {}
    print("loading models...")
    for model_name in self._available_models.keys():
        try:
            self._load_model(model_name)
            print("Model {:s} successfully loaded\n".format(model_name))
        except Exception:
            print("Failed to load model {:s}\n".format(model_name))
def load_words_and_state(data_name, model_name, state_name, diff=True):
    """
    A wrapper around fetch_states that caches the results in a .pkl file for later use
    :param data_name:
    :param model_name:
    :param state_name:
    :param diff:
    :return: a pair of two lists (word_list, states_list)
    """
    states_file = '-'.join([data_name, model_name, 'words', state_name]) \
        + ('-diff' if diff else '') + '.pkl'
    states_file = get_path(_tmp_dir, states_file)

    def cal_fn():
        return fetch_states(data_name, model_name, state_name, diff)

    words, states = maybe_calculate(states_file, cal_fn)
    return words, states
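
# Example (a sketch; names are hypothetical):
#
#     words, states = load_words_and_state('ptb', 'lstm-large', 'state_h', diff=True)
#     # words[i] is the word id at position i, states[i] its [n_layers, state_size] record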
def __init__(self, name="RNN", initializer=None, logdir=None, graph=None, word_to_id=None):
    """
    :param name: a str, used to create the variable scope
    :param initializer: the default variable initializer, defaults to random_uniform(-0.1, 0.1)
    :param logdir: the directory for checkpoints, defaults to ./models/<name>
    :param graph: a tf.Graph to build the model in, defaults to the default graph
    :param word_to_id: the vocabulary mapping
    :return: an empty RNN model
    """
    self.name = name
    self.word_to_id = word_to_id
    self.initializer = initializer if initializer is not None \
        else tf.random_uniform_initializer(-0.1, 0.1)
    self.input_shape = None
    self.input_dtype = None
    self.output_shape = None
    self.output_dtype = None
    self.target_shape = None
    self.target_dtype = None
    self._cell = None
    self.cell_list = []
    self.trainer = None
    self.evaluator = None
    self.validator = None
    self.generator = None
    self.loss_func = None
    self.is_compiled = False
    self.embedding_size = None
    self.vocab_size = None
    self.target_size = None
    self.use_last_output = False
    self._sess = None
    self._saver = None
    self._init_op = None
    self.models = []
    self.graph = graph if isinstance(graph, tf.Graph) else tf.get_default_graph()
    self.logdir = logdir or get_path('./models', name)
    self._finalize = False
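
# Example (a sketch, assuming this is the constructor of the RNN class):
#
#     model = RNN(name='lstm-demo', logdir='./models/lstm-demo')
#     # shapes, cells, loss etc. are configured after construction; the model
#     # is then compiled before training or evaluation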
def load_sorted_words_states(data_name, model_name, state_name, diff=True):
    """
    A wrapper around fetch_states that sorts the states according to word ids,
    and caches the results in a .pkl file for later use
    :param data_name:
    :param model_name:
    :param state_name:
    :param diff:
    :return: a list mapping each word id to that word's state records
    """
    states_file = '-'.join([
        data_name, model_name, 'words', state_name, 'sorted'
    ]) + ('-diff' if diff else '') + '.pkl'
    states_file = get_path(_tmp_dir, states_file)

    def cal_fn():
        words, states = fetch_states(data_name, model_name, state_name, diff)
        return sort_by_id(words, states)

    id_states = maybe_calculate(states_file, cal_fn)
    return id_states
def get_empirical_strength(data_name, model_name, state_name, layer=-1, top_k=100):
    """
    A helper function that wraps cal_empirical_strength and caches the results
    in a .pkl file for later use
    :param data_name:
    :param model_name:
    :param state_name:
    :param layer: specify a layer or a list of layers, starting from 0
    :param top_k: get the strength of the top k frequent words
    :return: a list of strength mats (np.ndarray) of shape [len(layer), state_size]
    """
    if not isinstance(layer, list):
        layer = [layer]
    if top_k > 1000:
        raise ValueError(
            "selected words range too large, only supports top 1000 frequent words!")
    top = 100 if top_k <= 100 else 500 if top_k <= 500 else 1000
    tmp_file = '-'.join(
        [data_name, model_name, 'strength', state_name, str(top)]) + '.pkl'
    tmp_file = get_path(_tmp_dir, tmp_file)

    def cal_fn():
        id_to_states = load_sorted_words_states(data_name, model_name, state_name, diff=True)
        return cal_empirical_strength(
            id_to_states[:top], lambda state_mat: np.mean(state_mat, axis=0))

    id_strengths = maybe_calculate(tmp_file, cal_fn)
    return [id_strengths[i][layer] for i in range(top_k)]
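
# Example (a sketch; names are hypothetical):
#
#     # mean response strength of the last layer's states for the 100 most frequent words
#     strengths = get_empirical_strength('ptb', 'lstm-large', 'state_h', layer=-1, top_k=100)
#     # strengths[i] has shape [len(layer), state_size]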
def get_tsne_projection(data_name, model_name, state_name, layer=-1, sample_size=5000,
                        dim=50, perplexity=40.0):
    """
    A helper function that wraps get_state_signature and tsne_project;
    the results will be cached on disk for later use.
    :param data_name:
    :param model_name:
    :param state_name:
    :param layer: the layer to project, a single int
    :param sample_size:
    :param dim:
    :param perplexity: the perplexity parameter of t-SNE
    :return: the t-SNE solution
    """
    assert isinstance(layer, int), "t-SNE projection of only one layer is reasonable"
    tmp_file = '-'.join([
        data_name, model_name, state_name, 'tsne',
        str(layer), str(dim), str(int(perplexity))
    ]) + '.pkl'
    tmp_file = get_path(_tmp_dir, tmp_file)

    def cal_fn():
        sample = get_state_signature(data_name, model_name, state_name,
                                     layer, sample_size, dim) / 50
        print('Start doing t-SNE...')
        return tsne_project(sample, perplexity, dim, lr=50)

    tsne_solution = maybe_calculate(tmp_file, cal_fn)
    return tsne_solution
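
# Example (a sketch; names are hypothetical):
#
#     solution = get_tsne_projection('ptb', 'lstm-large', 'state_h',
#                                    layer=1, sample_size=5000, dim=50, perplexity=40.0)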
def get_dataset_path(name):
    return get_path('_cached/datasets', name)
""" Helper function to download and processing Stanford Sentiment Treebank datasets """ import os from rnnvis.utils.io_utils import download, unzip, get_path sst_url = "http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip" def download_sst(path): """ Download zip file from url and extract all the files under path :param path: :return: """ print("downloading ") local_file = os.path.join(path, 'stanfordSentimentTreebank.zip') download(sst_url, local_file) unzip(local_file, path) if __name__ == '__main__': download_sst(get_path('cached_data/'))
def f(self):
    """Lazily open and return the underlying h5py File."""
    if self._f is None:
        self._f = h5py.File(get_path(_root_dir, self.file_name), self.mode)
    return self._f
# create_animated_tsne(sample, 40.0, [600, 600], init_dim=50, lr=50, max_iter=1000, path='test.mp4')

###
# Scripts that calculate the mean empirical strength
###

strength_mat = get_empirical_strength(data_name, model_name, state_name, layer=-1, top_k=50)
id_to_word = get_dataset(data_name, ['id_to_word'])['id_to_word']
word_list = id_to_word[:50]
strength2json(strength_mat, word_list, path=get_path('_cached', 'gru-state-strength.json'))

###
# Scripts performing MDS
###

# sample = get_state_signature(data_name, model_name, state_name, [1], 5000, None)
# dist = squareform(pdist(sample, 'euclidean'))
# y, eigs = mds.mds(dist)
#
# color = np.vstack([
#     np.tile(np.array(color_scheme[0], np.float32), (600, 1))
# ])
# fig, ax = plt.subplots(figsize=[6, 6])
# ax.scatter(y[:600, 0], y[:600, 1], 8, c=color[:600, :])
# plt.show()
""" Application of RNNVis """ from flask import Flask from flask_cors import CORS from rnnvis.server.model_manager import ModelManager from rnnvis.utils.io_utils import get_path path = get_path('frontend/dist/static', absolute=True) print("Static folder: {:s}".format(path)) app = Flask(__name__) app.config['FRONT_END_ROOT'] = get_path('frontend/dist', absolute=True) app.config['STATIC_FOLDER'] = get_path('frontend/dist/static', absolute=True) _manager = ModelManager() CORS(app) from rnnvis.server.routes import *
def get_state_statistics(data_name, model_name, state_name, diff=True, layer=-1, top_k=500, k=None):
    """
    Get state statistics, i.e. the states' mean reaction, 25%~75% reaction range
    and 9%~91% reaction range regarding the top_k words
    :param data_name:
    :param model_name:
    :param state_name:
    :param diff:
    :param layer:
    :param top_k:
    :param k: if given, return the statistics of the k-th frequent word only
    :return: a dict containing statistics:
        {
            'mean': [top_k, n_states],
            'low1': [top_k, n_states],      # 25%
            'high1': [top_k, n_states],     # 75%
            'low2': [top_k, n_states],      # 9%
            'high2': [top_k, n_states],     # 91%
            'sort_idx': [top_k, n_states],  # each row is the sorted idx of mean reactions w.r.t. a word
            'freqs': [top_k],               # frequency of each of the top_k words
            'words': [top_k],               # a list of words
        }
    """
    if k is not None:
        start = (k // 100) * 100
        end = (k // 100 + 1) * 100
    else:
        start = 0
        end = 100 if top_k <= 100 else 500 if top_k <= 500 else 1000
    cal_range = range(start, end)
    tmp_file = '-'.join([data_name, model_name, state_name, 'statistics', str(start), str(end)]) \
        + ('-diff' if diff else '') + '.pkl'
    tmp_file = get_path(_tmp_dir, tmp_file)

    def cal_fn(data_name_, model_name_, state_name_, diff_, range_):
        id_to_states = load_sorted_words_states(data_name_, model_name_, state_name_, diff_)
        _words = get_datasets_by_name(data_name_, ['id_to_word'])['id_to_word']
        words = []
        state_shape = id_to_states[0][0].shape
        dtype = id_to_states[0][0].dtype
        layer_num = state_shape[0]
        stats_list = []
        for i in range_:
            id_to_state = id_to_states[i]
            if id_to_state is None:  # some words may only be seen in the test set
                states = [np.zeros(state_shape, dtype)]  # use zeros as placeholder
            else:
                states = id_to_state
            stats_list.append(cal_state_statistics(states))
            words.append(_words[i])
        stats_layer_wise = []
        for layer_ in range(layer_num):
            stats = {}
            for field in stats_list[0][layer_].keys():
                value = np.vstack([stat[layer_][field] for stat in stats_list])
                stats[field] = value
            stats['freqs'] = np.array([
                len(id_state) if id_state is not None else 0
                for id_state in id_to_states
            ])
            stats_layer_wise.append(stats)
        return stats_layer_wise, words

    layer_wise_stats, words = maybe_calculate(tmp_file, cal_fn,
                                              data_name, model_name, state_name, diff, cal_range)
    stats = layer_wise_stats[layer]
    if k is None:
        results = defaultdict(list)
        for i in range(end):
            if len(results['freqs']) == top_k:
                break
            if stats['freqs'][i] == 0:
                continue
            for key, value in stats.items():
                results[key].append(value[i].tolist())
            results['words'].append(words[i])
    else:
        results = {
            key: value[k - start].tolist()
            for key, value in stats.items()
        }
        results['words'] = words[k - start]
    return results
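
# Example (a sketch; names are hypothetical):
#
#     stats = get_state_statistics('ptb', 'lstm-large', 'state_h', layer=-1, top_k=200)
#     means = stats['mean']   # one [n_states] list per returned word
#     words = stats['words']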
""" if name is None: name = self.file_path ids = name + '.ids.csv' print('saving ids to {:s}'.format(ids)) lists2csv(self.ids, ids, delimiter=' ') dictionary = name + '.dict.csv' print('saving dictionary to {:s}'.format(dictionary)) lists2csv([[word, i] for word, i in self.word_to_id.items()], dictionary, " ", encoding='utf-8') if __name__ == "__main__": # processor = PlainTextProcessor('../../cached_data/tinyshakespeare.txt') # processor.tag_rare_word(2, 10000) # processor.save() processor = SSTProcessor( get_path('./cached_data/stanfordSentimentTreebank', 'datasetSentences.txt'), get_path('./cached_data/stanfordSentimentTreebank', 'dictionary.txt'), get_path('./cached_data/stanfordSentimentTreebank', 'sentiment_lables.txt')) # tokens = processor.tokens # processor.ids = None # word_to_id = processor.word_to_id # sentences = processor.sentence_tokens processor.save()
method = 'cocluster'
n_clusters = 20
n_str = str(n_clusters)
# n_str = '-'.join([str(n_cluster) for n_cluster in n_clusters])  # for a list of cluster numbers
top_k = 600
results = get_co_cluster(data_name, model_name, state_name, n_clusters, -1, top_k,
                         mode=mode, method=method)
data, row_labels, col_labels = results[:3]
id_to_word = get_dataset(data_name, ['id_to_word'])['id_to_word']
word_list = id_to_word[:top_k]
row_sort_idx = np.argsort(row_labels)
col_sort_idx = np.argsort(col_labels)
mat = data[row_sort_idx]
mat = mat[:, col_sort_idx]
print(mat.shape)
matshow(
    mat, col_sort_idx, [word_list[i] for i in row_sort_idx],
    get_path('co-cluster' + '-' + method + '-' + mode + '-' + n_str + '.png'))
# matshow(data, None, word_list, get_path('co-cluster' + '-' + 'raw' + '.png'))
# plt.show()