Example #1
def seed_db(force=False):
    """
    Use the `config/datasets/lm.yml` to generate example datasets and store them into db.
    :return: None
    """
    config_dir = get_path('config/db', 'sp.yml')
    with open(config_dir, 'r') as f:
        config = yaml.safe_load(f)['datasets']
    for seed in config:
        print('seeding {:s} data'.format(seed['name']))
        data_dir = get_path('cached_data', seed['dir'])
        if not path_exists(data_dir):
            print('Directory "{:s}" not available!'.format(data_dir))
            continue
        seed['scheme'].update({'upsert': force})
        if seed['type'] == 'sst':
            store_sst(data_dir, seed['name'], **seed['scheme'])
        elif seed['type'] == 'imdb':
            store_imdb(data_dir, seed['name'], **seed['scheme'])
        elif seed['type'] == 'yelp':
            store_yelp(data_dir, seed['name'], **seed['scheme'])
        else:
            print('Cannot seed datasets of type: {:s}'.format(seed['type']))
            continue
        dataset_inserted(seed['name'], 'sp')
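For reference, each entry that `seed_db` reads from the `datasets` list must carry the `name`, `type`, `dir` and `scheme` fields used above. A hypothetical entry and call (the values are illustrative, not taken from the actual config):

# Hypothetical entry in config/db/sp.yml:
#
# datasets:
#   - name: sst
#     type: sst
#     dir: stanfordSentimentTreebank
#     scheme: {}    # extra keyword arguments forwarded to store_sst
#
# Re-seed everything, upserting existing records:
seed_db(force=True)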
Example #2
def get_pos_statistics(data_name, model_name, top_k=500):
    # round top_k up to the nearest cached bucket size (100, 500 or 1000)
    top = 100 if top_k <= 100 else 500 if top_k <= 500 else 1000

    tmp_file = '-'.join([data_name, model_name, 'pos_ratio',
                         str(top)]) + '.pkl'
    tmp_file = get_path(_tmp_dir, tmp_file)

    def cal_fn():
        word_ids, tags = load_words_and_state(data_name,
                                              model_name,
                                              'pos',
                                              diff=False)
        ids_tags = sort_by_id(word_ids, tags)
        tags_counters = []
        for i, word_tags in enumerate(ids_tags):
            if word_tags is None:
                continue
            counter = Counter(word_tags)
            total = len(word_tags)
            for key, count in counter.items():
                counter[key] = count / total
            tags_counters.append({'id': i, 'ratio': counter})
        return tags_counters

    return maybe_calculate(tmp_file, cal_fn)[:top_k]
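`maybe_calculate` is not shown among these examples. Judging from how it is called here (a `.pkl` path, a compute function, optional extra arguments), a minimal sketch could look like the following; this is an assumption about its behavior, not the project's actual implementation:

import os
import pickle


def maybe_calculate(file_path, cal_fn, *args):
    # Load the cached result if the .pkl file already exists,
    # otherwise compute it with cal_fn(*args) and cache it.
    if os.path.isfile(file_path):
        with open(file_path, 'rb') as f:
            return pickle.load(f)
    result = cal_fn(*args)
    with open(file_path, 'wb') as f:
        pickle.dump(result, f)
    return result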
Example #3
def _load_model(self, name, train=False):
    if name in self._available_models:
        config_file = get_path(_config_dir,
                               self._available_models[name]['config'])
        model, train_config = build_model(config_file)
        model.add_generator()
        model.add_evaluator(1, 1, 100, True, log_gates=True, log_pos=True)
        if not train:
            # If not training, the model should already be trained
            assert_path_exists(get_path(_model_dir, model.name))
            model.restore()
        self._models[name] = model
        self._train_configs[name] = train_config
        return True
    else:
        print('WARN: Cannot find model with name {:s}'.format(name))
        return False
Example #4
def get_config_filename(self, name):
    """
    Get the config file path of a given model
    :param name: the name of the model, should be in _available_models
    :return: file path if the name is in _available_models, else None
    """
    if name in self._available_models:
        return get_path(_config_dir,
                        self._available_models[name]['config'])
    else:
        return None
Example #5
def seed_db(force=False):
    """
    Use the `config/datasets/lm.yml` to generate example datasets and store them into db.
    :return: None
    """
    config_dir = get_path('config/db', 'lm.yml')
    with open(config_dir, 'r') as f:
        config = yaml.safe_load(f)['datasets']
    for seed in config:
        data_dir = get_path('cached_data', seed['dir'])
        print('seeding {:s} data'.format(seed['name']))
        if seed['type'] == 'ptb':
            store_ptb(data_dir, seed['name'], force)
        elif seed['type'] == 'text':
            seed['scheme'].update({'upsert': force})
            store_plain_text(data_dir, seed['name'], **seed['scheme'])
        else:
            print('no seed function for dataset type: {:s}'.format(seed['type']))
            continue
        dataset_inserted(seed['name'], 'lm', force)
Example #6
def restore(self, path=None):
    if not self.finalized:
        self.finalize()
    path = path if path is not None else self.logdir
    checkpoint = tf.train.latest_checkpoint(path)
    self._saver.restore(self.sess, checkpoint)
    print("Model variables restored from {}.".format(
        get_path(path, absolute=True)))
Example #7
def get_state_signature(data_name,
                        model_name,
                        state_name,
                        layer=None,
                        sample_size=5000,
                        dim=50):
    """
    A helper function that sampled the states records,
        and maybe do PCA (if `sample size` is different from `dim`).
        The results will be cached on disk.
    :param data_name: str
    :param model_name: str
    :param state_name: str
    :param layer: start from 0
    :param sample_size:
    :param dim:
    :return:
    """
    if layer is not None:
        if not isinstance(layer, list):
            layer = [layer]
    layer_str = 'all' if layer is None else ''.join([str(l) for l in layer])
    file_name = '-'.join([
        data_name, model_name, state_name, layer_str,
        str(sample_size),
        str(dim) if dim is not None else str(sample_size)
    ]) + '.pkl'
    file_name = get_path(_tmp_dir, file_name)

    def cal_fn(layers):
        words, states = load_words_and_state(data_name,
                                             model_name,
                                             state_name,
                                             diff=False)
        layers = layers if layers is not None else list(
            range(states[0].shape[0]))
        state_layers = []
        for l in layers:
            state_layers.append([state[l, :] for state in states])
        states_mat = np.hstack(state_layers).T
        print("sampling")
        sample_idx = np.random.randint(0, states_mat.shape[1], sample_size)
        sample = states_mat[:, sample_idx]
        if dim is not None:
            print("doing PCA...")
            sample, variance = tsne.pca(sample, dim)
            print("PCA kept {:f}% of variance".format(variance * 100))
        return sample

    return maybe_calculate(file_name, cal_fn, layer)
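A hypothetical call (the data, model and state names are placeholders for entries that must already exist in the db):

# Sample 5000 state records of layer 1 and project them to 50 dimensions;
# the result is cached under _tmp_dir, so repeated calls are cheap.
signature = get_state_signature('ptb', 'lstm-large', 'state_c',
                                layer=1, sample_size=5000, dim=50)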
Example #8
def save(self, path=None):
    """
    Save the model to a given path
    :param path:
    :return:
    """
    if not self.finalized:
        self.finalize()
    path = path if path is not None else os.path.join(self.logdir, 'model')
    before_save(path)
    self._saver.save(self.sess, path)
    print("Model variables saved to {}.".format(
        get_path(path, absolute=True)))
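A typical save/restore round trip built from the two methods above (hedged usage; `model` stands for a built and finalized RNN instance):

model.save()       # writes a checkpoint under model.logdir/model
# ... later, possibly in a fresh process ...
model.restore()    # loads the latest checkpoint found in model.logdir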
Example #9
def get_datasets_by_name(name, fields=None):
    complete_data = {}
    fields = ['word_to_id', 'id_to_word', 'train', 'valid', 'test'] if fields is None else fields
    for c_name in fields:
        if c_name in ['train', 'test', 'valid']:
            with open(get_path(get_dataset_path(name), c_name)) as f:
                data = json.load(f)
            complete_data[c_name] = data
            continue
        results = db_hdlr[c_name].find_one({'name': name})
        if results is None:
            print('WARN: No data in collection {:s} of db {:s} named {:s}'.format(c_name, _db_name, name))
            return None
        complete_data[c_name] = results['data']
    if 'word_to_id' in complete_data:
        complete_data['word_to_id'] = json.loads(complete_data['word_to_id'])
    return complete_data
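A hypothetical call; 'ptb' stands for a dataset name that has already been seeded:

data = get_datasets_by_name('ptb', ['word_to_id', 'train'])
if data is not None:
    print('vocabulary size:', len(data['word_to_id']))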
Example #10
def __init__(self):
    with open(get_path('config', 'models.yml')) as f:
        try:
            self._available_models = yaml.safe_load(f)
        except yaml.YAMLError:
            raise ValueError("Malformed config file!")
    self._models = {}
    self._train_configs = {}
    self.record_flag = {}
    print("loading models...")
    for model_name in self._available_models.keys():
        try:
            self._load_model(model_name)
            print("Model {:s} successfully loaded\n".format(model_name))
        except Exception:
            print("Failed to load model {:s}\n".format(model_name))
Example #11
def load_words_and_state(data_name, model_name, state_name, diff=True):
    """
    A wrapper function that wraps fetch_states and cached the results in .pkl file for latter use
    :param data_name:
    :param model_name:
    :param state_name:
    :param diff:
    :return: a pair of two list (word_list, states_list)
    """

    states_file = '-'.join([data_name, model_name, 'words', state_name]) + (
        '-diff' if diff else '') + '.pkl'
    states_file = get_path(_tmp_dir, states_file)

    def cal_fn():
        return fetch_states(data_name, model_name, state_name, diff)

    words, states = maybe_calculate(states_file, cal_fn)
    return words, states
Example #12
    def __init__(self,
                 name="RNN",
                 initializer=None,
                 logdir=None,
                 graph=None,
                 word_to_id=None):
        """
        :param name: a str, used to create variable scope
        :return: a empty RNN model
        """
        self.name = name
        self.word_to_id = word_to_id
        self.initializer = initializer if initializer is not None else tf.random_uniform_initializer(
            -0.1, 0.1)
        self.input_shape = None
        self.input_dtype = None
        self.output_shape = None
        self.output_dtype = None
        self.target_shape = None
        self.target_dtype = None

        self._cell = None
        self.cell_list = []
        self.trainer = None
        self.evaluator = None
        self.validator = None
        self.generator = None
        self.loss_func = None
        self.is_compiled = False
        self.embedding_size = None
        self.vocab_size = None
        self.target_size = None
        self.use_last_output = False
        # self.supervisor = None
        self._sess = None
        self._saver = None
        self._init_op = None
        self.models = []
        self.graph = graph if isinstance(graph,
                                         tf.Graph) else tf.get_default_graph()
        self.logdir = logdir or get_path('./models', name)
        self._finalize = False
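A minimal instantiation under the signature above, assuming TensorFlow 1.x (the name and logdir are placeholders):

import tensorflow as tf

model = RNN(name='my-lstm',
            initializer=tf.random_uniform_initializer(-0.05, 0.05),
            logdir='./models/my-lstm')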
Example #13
def load_sorted_words_states(data_name, model_name, state_name, diff=True):
    """
    A wrapper function that wraps fetch_states and sort them according to ids,
        and cached the results in .pkl file for latter use
    :param data_name:
    :param model_name:
    :param state_name:
    :param diff:
    :return: a pair of two list (word_list, states_list)
    """
    states_file = '-'.join([
        data_name, model_name, 'words', state_name, 'sorted'
    ]) + ('-diff' if diff else '') + '.pkl'
    states_file = get_path(_tmp_dir, states_file)

    def cal_fn():
        words, states = fetch_states(data_name, model_name, state_name, diff)
        return sort_by_id(words, states)

    id_states = maybe_calculate(states_file, cal_fn)
    return id_states
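`sort_by_id` is also not among these examples. From the way its result is consumed elsewhere (a list indexed by word id, `None` entries for ids without records, each entry holding that word's state records), a plausible sketch is the following; treat it as an assumption, not the project's implementation:

def sort_by_id(word_ids, states):
    # Group state records by word id; ids that never occur map to None.
    buckets = [None] * (max(word_ids) + 1)
    for word_id, state in zip(word_ids, states):
        if buckets[word_id] is None:
            buckets[word_id] = []
        buckets[word_id].append(state)
    return buckets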
Example #14
def get_empirical_strength(data_name,
                           model_name,
                           state_name,
                           layer=-1,
                           top_k=100):
    """
    A helper function that wraps cal_empirical_strength and cached the results in .pkl file for latter use
    :param data_name:
    :param model_name:
    :param state_name:
    :param layer: specify a layer, start from 0
    :param top_k: get the strength of the top k frequent words
    :return: a list of strength mat (np.ndarray) of shape [len(layer), state_size]
    """
    if not isinstance(layer, list):
        layer = [layer]
    if top_k > 1000:
        raise ValueError(
            "selected words range too large, only support top 1000 frequent words!"
        )
    top = 100 if top_k <= 100 else 500 if top_k <= 500 else 1000
    tmp_file = '-'.join(
        [data_name, model_name, 'strength', state_name,
         str(top)]) + '.pkl'
    tmp_file = get_path(_tmp_dir, tmp_file)

    def cal_fn():
        # words, states = load_words_and_state(data_name, model_name, state_name, diff=True)
        id_to_states = load_sorted_words_states(data_name,
                                                model_name,
                                                state_name,
                                                diff=True)
        return cal_empirical_strength(
            id_to_states[:top], lambda state_mat: np.mean(state_mat, axis=0))

    id_strengths = maybe_calculate(tmp_file, cal_fn)

    return [id_strengths[i][layer] for i in range(top_k)]
Example #15
def get_tsne_projection(data_name,
                        model_name,
                        state_name,
                        layer=-1,
                        sample_size=5000,
                        dim=50,
                        perplexity=40.0):
    """
    A helper function that wraps get_state_signature and tsne_project,
        the results will be chached on disk for latter use.
    :param data_name:
    :param model_name:
    :param state_name:
    :param layer:
    :param sample_size:
    :param dim:
    :param perplexity:
    :return:
    """
    assert isinstance(layer, int), \
        "t-SNE projection only supports a single layer"
    tmp_file = '-'.join([
        data_name, model_name, state_name, 'tsne',
        str(layer),
        str(dim),
        str(int(perplexity))
    ]) + '.pkl'
    tmp_file = get_path(_tmp_dir, tmp_file)

    def cal_fn():
        sample = get_state_signature(data_name, model_name, state_name, layer,
                                     sample_size, dim) / 50
        print('Start doing t-SNE...')
        return tsne_project(sample, perplexity, dim, lr=50)

    tsne_solution = maybe_calculate(tmp_file, cal_fn)
    return tsne_solution
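A hypothetical call, assuming the returned solution is an array of 2-D coordinates (one row per projected point):

solution = get_tsne_projection('ptb', 'lstm-large', 'state_c',
                               layer=1, perplexity=40.0)
# e.g. inspect it with matplotlib:
# import matplotlib.pyplot as plt
# plt.scatter(solution[:, 0], solution[:, 1], s=4)
# plt.show()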
Example #16
def get_dataset_path(name):
    return get_path('_cached/datasets', name)
Example #17
"""
Helper function to download and processing Stanford Sentiment Treebank datasets
"""

import os
from rnnvis.utils.io_utils import download, unzip, get_path

sst_url = "http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip"


def download_sst(path):
    """
    Download zip file from url and extract all the files under path
    :param path:
    :return:
    """
    print("downloading ")
    local_file = os.path.join(path, 'stanfordSentimentTreebank.zip')
    download(sst_url, local_file)
    unzip(local_file, path)


if __name__ == '__main__':
    download_sst(get_path('cached_data/'))
Example #18
def f(self):
    # lazily open the HDF5 file on first access and cache the handle
    if self._f is None:
        self._f = h5py.File(get_path(_root_dir, self.file_name), self.mode)
    return self._f
Example #19
    #
    # create_animated_tsne(sample, 40.0, [600,600], init_dim=50, lr=50, max_iter=1000, path='test.mp4')

    ###
    # Scripts that calculate the mean
    ###
    strength_mat = get_empirical_strength(data_name,
                                          model_name,
                                          state_name,
                                          layer=-1,
                                          top_k=50)
    id_to_word = get_dataset(data_name, ['id_to_word'])['id_to_word']
    word_list = id_to_word[:50]
    strength2json(strength_mat,
                  word_list,
                  path=get_path('_cached', 'gru-state-strength.json'))

    ###
    # scripts performing mds
    ###
    # sample = get_state_signature(data_name, model_name, state_name, [1], 5000, None)
    # dist = squareform(pdist(sample, 'euclidean'))
    # y, eigs = mds.mds(dist)
    #
    # color = np.vstack([
    #     np.tile(np.array(color_scheme[0], np.float32), (600, 1))
    # ])
    # fig, ax = plt.subplots(figsize=[6, 6])
    # ax.scatter(y[:600, 0], y[:600, 1], 8, c=color[:600, :])
    # plt.show()
Example #20
"""
Application of RNNVis
"""
from flask import Flask
from flask_cors import CORS
from rnnvis.server.model_manager import ModelManager
from rnnvis.utils.io_utils import get_path
path = get_path('frontend/dist/static', absolute=True)
print("Static folder: {:s}".format(path))
app = Flask(__name__)
app.config['FRONT_END_ROOT'] = get_path('frontend/dist', absolute=True)
app.config['STATIC_FOLDER'] = get_path('frontend/dist/static', absolute=True)
_manager = ModelManager()
CORS(app)
from rnnvis.server.routes import *  # noqa: F401,F403 -- imported last so the route handlers can register on `app`
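A hypothetical development entry point (Flask's built-in server; host and port are placeholders):

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)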
Example #21
def get_state_statistics(data_name,
                         model_name,
                         state_name,
                         diff=True,
                         layer=-1,
                         top_k=500,
                         k=None):
    """
    Get state statistics, i.e. states mean reaction, 25~75 reaction range, 9~91 reaction range regarding top_k words
    :param data_name:
    :param model_name:
    :param state_name:
    :param diff:
    :param layer:
    :param top_k:
    :return: a dict containing statistics:
        {
            'mean': [top_k, n_states],
            'low1': [top_k, n_states], 25%
            'high1': [top_k, n_states], 75%
            'low2': [top_k, n_states], 9%
            'high2': [top_k, n_states], 91%
            'sort_idx': [top_k, n_states], each row represents sorted idx of mean reaction of states w.r.t. a word
            'freqs': [top_k,] frequency of each of the top_k words,
            'words': [top_k,], a list of words.
        }
    """
    if k is not None:
        # top_k = top_k if top_k > k else k
        start = (k // 100) * 100
        end = (k // 100 + 1) * 100
    else:
        start = 0
        end = 100 if top_k <= 100 else 500 if top_k <= 500 else 1000
    cal_range = range(start, end)

    tmp_file = '-'.join([data_name, model_name, state_name, 'statistics', str(start), str(end)]) \
               + ('-diff' if diff else '') + '.pkl'
    tmp_file = get_path(_tmp_dir, tmp_file)

    def cal_fn(data_name_, model_name_, state_name_, diff_, range_):
        # _words, states = load_words_and_state(data_name_, model_name_, state_name_, diff_)
        id_to_states = load_sorted_words_states(data_name_, model_name_,
                                                state_name_, diff_)
        _words = get_datasets_by_name(data_name_, ['id_to_word'])['id_to_word']
        words = []
        state_shape = id_to_states[0][0].shape
        dtype = id_to_states[0][0].dtype
        layer_num = state_shape[0]
        stats_list = []
        for i in range_:
            id_to_state = id_to_states[i]
            if id_to_state is None:  # some words may only appear in splits that were not recorded (e.g. the test set)
                states = [np.zeros(state_shape, dtype)]  # use zeros as a placeholder
            else:
                states = id_to_state
            stats_list.append(cal_state_statistics(states))
            words.append(_words[i])
        stats_layer_wise = []
        for layer_ in range(layer_num):
            stats = {}
            for field in stats_list[0][layer_].keys():
                value = np.vstack([stat[layer_][field] for stat in stats_list])
                stats[field] = value
            stats['freqs'] = np.array([
                len(id_state) if id_state is not None else 0
                for id_state in id_to_states
            ])
            stats_layer_wise.append(stats)
        return stats_layer_wise, words

    layer_wise_stats, words = maybe_calculate(tmp_file, cal_fn, data_name,
                                              model_name, state_name, diff,
                                              cal_range)
    stats = layer_wise_stats[layer]
    if k is None:
        # stats = {key: value[:(top_k)].tolist() for key, value in stats.items()}
        results = defaultdict(list)
        for i in range(end):
            if len(results['freqs']) == top_k:
                break
            if stats['freqs'][i] == 0:
                continue
            for key, value in stats.items():
                results[key].append(value[i].tolist())
            results['words'].append(words[i])

    else:
        results = {
            key: value[k - start].tolist()
            for key, value in stats.items()
        }
        results['words'] = words[k - start]
    return results
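A hypothetical call; the names are placeholders for seeded data and models:

stats = get_state_statistics('ptb', 'lstm-large', 'state_c',
                             diff=True, layer=-1, top_k=500)
print(stats['words'][:10])  # the ten most frequent words that have recorded states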
Example #22
        """
        if name is None:
            name = self.file_path
        ids = name + '.ids.csv'
        print('saving ids to {:s}'.format(ids))
        lists2csv(self.ids, ids, delimiter=' ')
        dictionary = name + '.dict.csv'
        print('saving dictionary to {:s}'.format(dictionary))
        lists2csv([[word, i] for word, i in self.word_to_id.items()],
                  dictionary,
                  " ",
                  encoding='utf-8')


if __name__ == "__main__":

    # processor = PlainTextProcessor('../../cached_data/tinyshakespeare.txt')
    # processor.tag_rare_word(2, 10000)
    # processor.save()
    processor = SSTProcessor(
        get_path('./cached_data/stanfordSentimentTreebank',
                 'datasetSentences.txt'),
        get_path('./cached_data/stanfordSentimentTreebank', 'dictionary.txt'),
        get_path('./cached_data/stanfordSentimentTreebank',
                 'sentiment_labels.txt'))
    # tokens = processor.tokens
    # processor.ids = None
    # word_to_id = processor.word_to_id
    # sentences = processor.sentence_tokens
    processor.save()
    # n_str = '-'.join([str(n_cluster) for n_cluster in n_clusters])
    method = 'cocluster'
    n_clusters = 20
    n_str = str(n_clusters)
    top_k = 600

    results = get_co_cluster(data_name,
                             model_name,
                             state_name,
                             n_clusters,
                             -1,
                             top_k,
                             mode=mode,
                             method=method)

    data, row_labels, col_labels = results[:3]
    id_to_word = get_dataset(data_name, ['id_to_word'])['id_to_word']
    word_list = id_to_word[:top_k]
    row_sort_idx = np.argsort(row_labels)
    col_sort_idx = np.argsort(col_labels)
    mat = data[row_sort_idx]
    mat = mat[:, col_sort_idx]
    print(mat.shape)
    matshow(
        mat, col_sort_idx, [word_list[i] for i in row_sort_idx],
        get_path('co-cluster' + '-' + method + '-' + mode + '-' + n_str +
                 '.png'))

    # matshow(data, None, word_list, get_path('co-cluster' + '-' + 'raw' + '.png'))
    # plt.show()