import os
import json
import time
import itertools

import numpy as np

# Note: project-internal helpers used below (e.g. dict2json, get_dataset_path, file_exists,
# split, replace_one_if_exists, insert_one_if_not_exists) are assumed to be defined or
# imported elsewhere in this module.


def store_dataset_by_default(name, data_dict, force=False):
    """
    Store each field of data_dict as a json file under the dataset's directory
    :param name: the name of the dataset
    :param data_dict: a dict mapping field names (e.g. 'train') to the data to store
    :param force: if True, overwrite files that already exist
    :return: None
    """
    dataset_path = get_dataset_path(name)
    for field, data in data_dict.items():
        target_path = os.path.join(dataset_path, field)
        if file_exists(target_path):
            if force:
                dict2json(data, target_path)
                print("{:s} data already exists, overwritten.".format(field))
            else:
                print("{:s} data already exists, if you want to overwrite, use force!".format(field))
        else:
            dict2json(data, target_path)
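
# Illustrative usage sketch: shows the kind of data_dict that store_dataset_by_default expects.
# The dataset name and the toy data below are assumptions, not values used by the project.
def _example_store_dataset_by_default():
    data_dict = {
        'train': {'data': [[1, 2, 3], [4, 5]], 'label': [0, 1], 'ids': [0, 1]},
        'valid': {'data': [[6, 7]], 'label': [1], 'ids': [0]},
    }
    # each field ('train', 'valid', ...) is dumped as a json file under the dataset's directory;
    # pass force=True to overwrite files that already exist
    store_dataset_by_default('my-dataset', data_dict, force=True)
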
def solution2json(solution, states_num, labels=None, path=None):
    """
    Convert the tsne solution to json format
    :param solution: the tsne solution, an (n_states, n_dims) ndarray or nested list
    :param states_num: a list specifying the number of states in each layer, should add up to the solution size
    :param labels: additional labels for each state
    :param path: the path to save the json file (passed to dict2json)
    :return: the result of dict2json(points, path)
    """
    if isinstance(solution, np.ndarray):
        solution = solution.tolist()
    if isinstance(labels, np.ndarray):
        labels = labels.tolist()
    if labels is None:
        labels = [0] * len(solution)
    layers = []
    state_ids = []
    for i, num in enumerate(states_num):
        layers += [i + 1] * num
        state_ids += list(range(num))
    points = [{'coords': s,
               'layer': layers[i],
               'state_id': state_ids[i],
               'label': labels[i]}
              for i, s in enumerate(solution)]
    return dict2json(points, path)
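
# Illustrative usage sketch: how solution2json might be called on a 2-D embedding of hidden
# states. The array shapes and layer sizes below are assumptions.
def _example_solution2json():
    solution = np.random.rand(60, 2)   # stand-in for a t-SNE embedding of 60 hidden states
    states_num = [20, 20, 20]          # three layers of 20 states each; must sum to len(solution)
    return solution2json(solution, states_num, labels=None, path=None)
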
def strength2json(strength_list, words, labels=None, path=None):
    """
    A helper function that converts the results of get_empirical_strength
    to the standard format used to serve the web requests.
    :param strength_list: a list of ndarrays of shape (n_layer, n_states)
    :param words: the word (str) corresponding to each strength
    :param labels: additional labels
    :param path: saving path
    :return: the result of dict2json(points, path)
    """
    if labels is None:
        labels = [0] * len(strength_list)
    points = [{'word': words[i],
               'strength': strength.tolist(),
               'label': labels[i]}
              for i, strength in enumerate(strength_list)]
    return dict2json(points, path)
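
# Illustrative usage sketch: the number of words, layers and states below are assumptions.
def _example_strength2json():
    words = ['the', 'movie']
    # one (n_layer, n_states) strength matrix per word; 2 layers of 4 states here
    strength_list = [np.zeros((2, 4)), np.ones((2, 4))]
    return strength2json(strength_list, words, labels=None, path=None)
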
def store_sst(data_path, name, split_scheme, upsert=False):
    """
    Process and store the SST dataset to db
    :param data_path: the directory containing the raw SST files
    :param name: the name under which the dataset is stored
    :param split_scheme: a dict mapping set names (e.g. 'train') to split fractions
    :param upsert: if True, overwrite existing records
    :return: None
    """
    if not path_exists(data_path):
        download_sst(os.path.abspath(os.path.join(data_path, '../')))
    if upsert:
        insertion = replace_one_if_exists
    else:
        insertion = insert_one_if_not_exists
    phrase_path = os.path.join(data_path, "dictionary.txt")
    sentence_path = os.path.join(data_path, "datasetSentences.txt")
    label_path = os.path.join(data_path, "sentiment_labels.txt")
    sentence_split_path = os.path.join(data_path, "datasetSplit.txt")
    processor = SSTProcessor(sentence_path, phrase_path, label_path, sentence_split_path)
    split_data = split(list(zip(processor.ids, processor.labels, range(1, len(processor.labels) + 1))),
                       split_scheme.values(), shuffle=True)
    split_data = dict(zip(split_scheme.keys(), split_data))
    sentence_data_ids = processor.split_sentence_ids
    word_to_id_json = dict2json(processor.word_to_id)
    insertion('word_to_id', {'name': name}, {'name': name, 'data': word_to_id_json})
    insertion('id_to_word', {'name': name}, {'name': name, 'data': processor.id_to_word})
    for i, set_name in enumerate(['train', 'valid', 'test']):
        data, ids = zip(*(sentence_data_ids[i]))
        insertion('sentences', {'name': name, 'set': set_name},
                  {'name': name, 'set': set_name, 'data': data, 'ids': ids})
    if 'train' not in split_data:
        print('WARN: there is no train data in the split data!')
    data_dict = {}
    for set_name in ['train', 'valid', 'test']:
        if set_name in split_data:
            data, label, ids = zip(*split_data[set_name])
            # bucket the raw sentiment scores into 5 classes: 0, 1, 2, 3, 4
            label = [float(i) for i in label]
            label = [(0 if i <= 0.2 else 1 if i <= 0.4 else 2 if i <= 0.6 else 3 if i <= 0.8 else 4)
                     for i in label]
            data_dict[set_name] = {'data': data, 'label': label, 'ids': ids}
    store_dataset_by_default(name, data_dict, upsert)
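
# Illustrative usage sketch: the data directory and split fractions below are assumptions;
# split_scheme maps set names to the fraction of phrases assigned to each set.
def _example_store_sst():
    store_sst('cached_data/sst', 'sst',
              split_scheme={'train': 0.8, 'valid': 0.1, 'test': 0.1},
              upsert=True)
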
def store_imdb(data_path, name, n_words=100000, upsert=False):
    """
    Process and store the IMDB dataset to db
    :param data_path: the directory containing imdb.pkl and imdb.dict.pkl.gz
    :param name: the name under which the dataset is stored
    :param n_words: the maximum vocabulary size
    :param upsert: if True, overwrite existing records
    :return: None
    """
    if upsert:
        insertion = replace_one_if_exists
    else:
        insertion = insert_one_if_not_exists
    word_to_id, id_to_word = imdb.load_dict(os.path.join(data_path, 'imdb.dict.pkl.gz'), n_words)
    data_label = imdb.load_data(os.path.join(data_path, 'imdb.pkl'), n_words)
    word_to_id_json = dict2json(word_to_id)
    insertion('word_to_id', {'name': name}, {'name': name, 'data': word_to_id_json})
    insertion('id_to_word', {'name': name}, {'name': name, 'data': id_to_word})
    data_dict = {}
    for i, set_name in enumerate(['train', 'valid', 'test']):
        data, label = data_label[i]
        ids = list(range(len(data)))
        data_dict[set_name] = {'data': data, 'label': label, 'ids': ids}
    store_dataset_by_default(name, data_dict, upsert)
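
# Illustrative usage sketch: assumes imdb.pkl and imdb.dict.pkl.gz already sit under the given
# directory; the path and vocabulary size are assumptions.
def _example_store_imdb():
    store_imdb('cached_data/imdb', 'imdb', n_words=100000, upsert=True)
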
def store_ptb(data_path, name='ptb', upsert=False):
    """
    Process and store the PTB dataset to db
    :param data_path: the directory containing ptb.train.txt, ptb.valid.txt and ptb.test.txt
    :param name: the name under which the dataset is stored
    :param upsert: if True, overwrite existing records
    :return: None
    """
    if upsert:
        if get_datasets_by_name(name, ['word_to_id']) is not None:
            print('dataset {:s} already exists. overwriting...'.format(name))
    elif get_datasets_by_name(name, ['word_to_id']) is not None:
        print('dataset {:s} already exists. skipped. If you want to overwrite, use upsert!'.format(name))
        return
    train_path = os.path.join(data_path, "ptb.train.txt")
    valid_path = os.path.join(data_path, "ptb.valid.txt")
    test_path = os.path.join(data_path, "ptb.test.txt")
    paths = [train_path, valid_path, test_path]
    data_list, word_to_id, id_to_word = load_data_as_ids(paths)
    tag_list = load_data_as_pos_tags(paths)
    train_tags, valid_tags, test_tags = tag_list
    train, valid, test = data_list
    word_to_id_json = dict2json(word_to_id)
    if upsert:
        insertion = replace_one_if_exists
    else:
        insertion = insert_one_if_not_exists
    insertion('word_to_id', {'name': name}, {'name': name, 'data': word_to_id_json})
    insertion('id_to_word', {'name': name}, {'name': name, 'data': id_to_word})
    data_dict = {
        'train': {'data': train},
        'valid': {'data': valid},
        'test': {'data': test},
    }
    tags_dict = {
        'train_pos': {'data': train_tags},
        'valid_pos': {'data': valid_tags},
        'test_pos': {'data': test_tags},
    }
    store_dataset_by_default(name, data_dict, upsert)
    store_dataset_by_default(name, tags_dict, upsert)
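
# Illustrative usage sketch: assumes the standard ptb.{train,valid,test}.txt files sit under the
# given directory; the path is an assumption.
def _example_store_ptb():
    store_ptb('cached_data/simple-examples/data', name='ptb', upsert=True)
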
def store_plain_text(data_path, name, split_scheme, min_freq=1, max_vocab=10000, remove_punct=False, upsert=False):
    """
    Process any plain text file and store it to db
    :param data_path: path of the plain text file
    :param name: the name under which the dataset is stored
    :param split_scheme: a dict mapping set names (e.g. 'train') to split fractions
    :param min_freq: words with frequency below min_freq are tagged as rare
    :param max_vocab: the maximum vocabulary size
    :param remove_punct: whether to remove punctuation when tokenizing
    :param upsert: if True, overwrite existing records
    :return: None
    """
    if upsert:
        if get_datasets_by_name(name, ['word_to_id']) is not None:
            print('dataset {:s} already exists. overwriting...'.format(name))
    elif get_datasets_by_name(name, ['word_to_id']) is not None:
        print('dataset {:s} already exists. skipped. If you want to overwrite, use upsert!'.format(name))
        return
    if upsert:
        insertion = replace_one_if_exists
    else:
        insertion = insert_one_if_not_exists
    processor = PlainTextProcessor(data_path, remove_punct=remove_punct)
    processor.tag_rare_word(min_freq, max_vocab)
    split_ids_tags = split(list(zip(processor.flat_ids, processor.flat_pos_tags)), split_scheme.values())
    split_ids = []
    split_tags = []
    for split_id_tag in split_ids_tags:
        ids, tags = zip(*split_id_tag)
        split_ids.append(ids)
        split_tags.append(tags)
    split_data = dict(zip(split_scheme.keys(), split_ids))
    split_tags = dict(zip(split_scheme.keys(), split_tags))
    if 'train' not in split_data:
        print('WARN: there is no train data in the split data!')
    data_dict = {}
    for set_name in ['train', 'valid', 'test']:
        if set_name in split_data:
            data_dict[set_name] = {'data': split_data[set_name]}
    tags_dict = {}
    for set_name in ['train', 'valid', 'test']:
        if set_name in split_tags:
            tags_dict[set_name + '_pos'] = {'data': split_tags[set_name]}
    store_dataset_by_default(name, data_dict, upsert)
    store_dataset_by_default(name, tags_dict, upsert)
    insertion('word_to_id', {'name': name}, {'name': name, 'data': dict2json(processor.word_to_id)})
    insertion('id_to_word', {'name': name}, {'name': name, 'data': processor.id_to_word})
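
# Illustrative usage sketch: the file path, vocabulary limits and split fractions are assumptions.
def _example_store_plain_text():
    store_plain_text('cached_data/tinyshakespeare.txt', 'shakespeare',
                     split_scheme={'train': 0.9, 'valid': 0.05, 'test': 0.05},
                     min_freq=2, max_vocab=10000, remove_punct=False, upsert=True)
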
def store_yelp(data_path, name, n_words=10000, upsert=False):
    """
    Process and store the Yelp review dataset to db
    :param data_path: the directory containing review_label.json
    :param name: the name under which the dataset is stored; use 'yelp-2' to drop 3-star
        reviews and binarize the labels (0: negative, 1: positive)
    :param n_words: the maximum vocabulary size (including <unk>)
    :param upsert: if True, overwrite existing records
    :return: None
    """
    if upsert:
        insertion = replace_one_if_exists
    else:
        insertion = insert_one_if_not_exists
    with open(os.path.join(data_path, 'review_label.json'), 'r') as file:
        data = json.load(file)
    if name == 'yelp-2':
        # binarize: drop neutral (3-star) reviews, label ratings below 3 as 0 and above 3 as 1
        tmp_ = []
        for item in data:
            if item['label'] == 3:
                continue
            item['label'] = 0 if item['label'] < 3 else 1
            tmp_.append(item)
        data = tmp_
    training_data, validate_data, test_data = split(data, fractions=[0.8, 0.1, 0.1], shuffle=True)
    all_words = []
    reviews = []
    stars = []
    for item in training_data:
        tokenized_review = list(itertools.chain.from_iterable(tokenize(item['review'], remove_punct=True)[0]))
        reviews.append(tokenized_review)
        stars.append(item['label'])
        all_words.extend(tokenized_review)
    word_to_id, counter, words = tokens2vocab(all_words)
    # reserve id 0 for the <unk> token and keep only the (n_words - 1) most frequent words
    n_words -= 1
    word_to_id = {k: v + 1 for k, v in word_to_id.items() if v < n_words}
    word_to_id['<unk>'] = 0
    id_to_word = [None] * len(word_to_id)
    for word, id_ in word_to_id.items():
        id_to_word[id_] = word
    # map tokens outside the vocabulary to <unk> (id 0)
    reviews = [[word_to_id[t] if word_to_id.get(t) else 0 for t in sentence] for sentence in reviews]
    training_data = (reviews, stars)
    tmp_data = []
    for _data in [validate_data, test_data]:
        reviews = []
        stars = []
        for item in _data:
            tokenized_review = list(itertools.chain.from_iterable(tokenize(item['review'])[0]))
            reviews.append([word_to_id[t] if word_to_id.get(t) else 0 for t in tokenized_review])
            stars.append(item['label'])
        tmp_data.append((reviews, stars))
    validate_data = tmp_data[0]
    test_data = tmp_data[1]
    word_to_id_json = dict2json(word_to_id)
    insertion('word_to_id', {'name': name}, {'name': name, 'data': word_to_id_json})
    insertion('id_to_word', {'name': name}, {'name': name, 'data': id_to_word})
    data_names = ['train', 'valid', 'test']
    data_dict = {}
    for i, data_set in enumerate([training_data, validate_data, test_data]):
        # sort each set by review length so that nearby reviews have similar lengths
        data_set = tuple(zip(*sorted(zip(*data_set), key=lambda x: len(x[0]))))
        data, label = data_set
        ids = list(range(len(data)))
        data_dict[data_names[i]] = {'data': data, 'label': label, 'ids': ids}
        insertion('sentences', {'name': name, 'set': data_names[i]},
                  {'name': name, 'set': data_names[i], 'data': data, 'label': label, 'ids': ids})
    store_dataset_by_default(name, data_dict, upsert)
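
# Illustrative usage sketch: assumes review_label.json sits under the given directory, which is an
# assumption. Passing the name 'yelp-2' triggers the binary (positive/negative) labeling above.
def _example_store_yelp():
    store_yelp('cached_data/yelp', 'yelp-2', n_words=10000, upsert=True)
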
def generate(self, sess, seeds, logdir=None, max_branch=3, accum_cond_prob=0.9,
             min_cond_prob=0.1, min_prob=0.001, max_step=10, neg_word_ids=None):
    """
    Generate a sequence tree from the given seeds under certain constraints.
    Note that the method always tries to generate as many branches as possible.
    :param sess: the session used to run the model
    :param seeds: a list of word_ids or a list of words
    :param logdir: the file path to save the generated tree
    :param max_branch: the maximum number of branches at each node
    :param accum_cond_prob: the maximum accumulated conditional probability of the following branches
    :param min_cond_prob: the minimum conditional probability of each branch
    :param min_prob: the minimum probability of a branch (note that this is a product along the tree)
    :param max_step: the maximum number of steps to generate
    :param neg_word_ids: an iterable of words (str) or word ids (int) that should never be generated
    :return: if logdir is None, returns a dict object representing the tree; otherwise returns None
    """
    model = self.model
    model.reset_state()
    # Initialize the tree and insert the seed nodes
    tree = Tree()
    if (not isinstance(seeds, list)) or len(seeds) < 1:
        raise ValueError("seeds should be a list of words or ids")
    # convert words into ids
    if isinstance(seeds[0], str):
        seeds = self.get_id_from_word(seeds)
    parent = GenerateNode(seeds[0], 1.0, 1.0)
    tree.add_node(parent, None)
    for seed in seeds[1:]:
        node = GenerateNode(seed, 1.0, 1.0)
        tree.add_node(node, parent)
        parent = node
    if neg_word_ids is None:
        neg_word_ids = []
    else:
        # accept any iterable (list, set, ...) of word tokens or word ids
        neg_word_ids = list(neg_word_ids)
    if len(neg_word_ids) > 0 and isinstance(neg_word_ids[0], str):
        neg_word_ids = self.get_id_from_word(neg_word_ids)
    elif len(neg_word_ids) > 0 and not isinstance(neg_word_ids[0], int):
        raise TypeError("neg_word_ids should be an iterable of word tokens (str) or word ids (int)!")
    neg_word_ids = set(neg_word_ids)  # convert to a set for faster membership checks
    buffer_size = self.model.batch_size

    def _generate(_buffer, step):
        if step > max_step:  # already at the maximum generating step
            return []
        if len(_buffer) > buffer_size:
            # process the buffer in chunks that fit into one batch
            _b = []
            for j in range(0, len(_buffer), buffer_size):
                _b += _generate(_buffer[j:(j + buffer_size)], step)
            return _b
        nodes, states = zip(*_buffer)
        word_ids = [n.word_id for n in nodes]
        prev_probs = [n.prob for n in nodes]
        states = _pack_list_to_states(states, buffer_size)
        # pad with -1s to make sure the shapes match
        word_ids += [-1] * (buffer_size - len(word_ids))
        # The second input is just a placeholder. See the implementation of model.run()
        model.current_state = states
        evals, _ = model.run(np.array(word_ids).reshape(buffer_size, 1), None, 1, sess,
                             eval_ops={'projected': model.projected_outputs})
        new_buffer = []
        # shape: [batch_size * num_steps, project_size]
        batch_outputs = evals['projected'][0]
        current_states = _convert_state_to_list(model.current_state, len(_buffer))

        def _filter_and_append(outputs, pos):
            # apply softmax so that outputs represent probabilities
            outputs = losses.softmax(outputs)
            # Get the ids of the largest probabilities. Since some of them may be neglected later,
            # we first fetch a few more than the top max_branch.
            n_candidates = max_branch + len(neg_word_ids)
            max_id = np.argpartition(-outputs, n_candidates)[:n_candidates]
            del_ids = []
            for i, id_ in enumerate(max_id):
                if int(id_) in neg_word_ids:
                    del_ids.append(i)
            max_id = np.delete(max_id, del_ids)
            max_id = max_id[:max_branch]
            cond_probs = outputs[max_id]
            # sort the cond_probs (descending) for the filtering below
            sort_indice = np.argsort(-cond_probs)
            max_id = max_id[sort_indice]
            cond_probs = cond_probs[sort_indice]
            prob_sum = np.sum(cond_probs)
            # filter according to accum_cond_prob
            while len(cond_probs) > 0:
                if accum_cond_prob > prob_sum:
                    break
                prob_sum -= cond_probs[-1]
                cond_probs = cond_probs[:-1]
            # filter according to min_cond_prob
            while len(cond_probs) > 0:
                if cond_probs[-1] > min_cond_prob:
                    break
                cond_probs = cond_probs[:-1]
            # filter according to min_prob
            while len(cond_probs) > 0:
                if cond_probs[-1] * prev_probs[pos] > min_prob:
                    break
                # the probability of this branch is too small
                cond_probs = cond_probs[:-1]
            if len(cond_probs) == 0:
                # no available nodes to generate
                return
            max_id = max_id[:len(cond_probs)]
            for word_id, cond_prob in zip(max_id, cond_probs):
                new_node = GenerateNode(int(word_id), float(cond_prob * prev_probs[pos]), float(cond_prob))
                tree.add_node(new_node, nodes[pos])
            for child in tree.get_children(nodes[pos]):
                new_buffer.append((child, current_states[pos]))

        for j in range(len(_buffer)):
            _filter_and_append(batch_outputs[j], j)
        return new_buffer

    start_time = time.time()
    model.init_state(sess)
    buffer = [(parent, _convert_state_to_list(model.current_state, 1)[0])]
    for i in range(len(seeds), max_step):
        buffer = _generate(buffer, i)
        if len(buffer) == 0:
            break
    print("total_time: {:f}s, speed: {:f}wps".format(time.time() - start_time,
                                                     len(tree) / (time.time() - start_time)))
    for node in tree.nodes():
        node.word = self.get_word_from_id(node.word_id)
    return dict2json(tree.as_dict(), logdir)
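
# Illustrative usage sketch: `evaluator` stands for an instance of the class that defines
# generate(), and `sess` for an active session with the model's variables loaded; both names,
# the seed words, the neglected tokens and the output path are assumptions.
def _example_generate(evaluator, sess):
    return evaluator.generate(sess, seeds=['the'], logdir='generated_tree.json',
                              max_branch=3, accum_cond_prob=0.9,
                              min_cond_prob=0.1, min_prob=0.001, max_step=10,
                              neg_word_ids=['<unk>', '<eos>'])
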