Example #1
def store_dataset_by_default(name, data_dict, force=False):
    dataset_path = get_dataset_path(name)
    for field, data in data_dict.items():
        target_path = os.path.join(dataset_path, field)
        if file_exists(target_path):
            if force:
                dict2json(data, target_path)
                print("{:s} data already exists, overwritten.".format(field))
            else:
                print("{:s} data already exists, if you want to overwrite, use force!".format(field))
        else:
            dict2json(data, target_path)
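The helpers get_dataset_path, file_exists and dict2json used above are defined elsewhere in the project and not shown on this page. A minimal sketch of what file_exists and dict2json might look like, assuming dict2json writes to the given path and returns the serialized JSON string when no path is supplied:

import json
import os

def dict2json(obj, path=None):
    # Hypothetical reconstruction: serialize obj to JSON; write it to `path`
    # when a path is given, otherwise just return the JSON string.
    text = json.dumps(obj)
    if path is not None:
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
        with open(path, 'w') as f:
            f.write(text)
    return text

def file_exists(path):
    # Hypothetical reconstruction: check whether a regular file already exists.
    return os.path.isfile(path)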
Example #2
def solution2json(solution, states_num, labels=None, path=None):
    """
    Convert the t-SNE solution to json format
    :param solution: an array or nested list of state coordinates
    :param states_num: a list specifying the number of states in each layer, should add up to the solution size
    :param labels: additional labels, one for each state
    :param path: the path to save the json file
    :return: the result of dict2json(points, path)
    """
    if isinstance(solution, np.ndarray):
        solution = solution.tolist()
    if isinstance(labels, np.ndarray):
        labels = labels.tolist()
    if labels is None:
        labels = [0] * len(solution)
    layers = []
    state_ids = []
    for i, num in enumerate(states_num):
        layers += [i + 1] * num
        state_ids += list(range(num))
    points = [{
        'coords': s,
        'layer': layers[i],
        'state_id': state_ids[i],
        'label': labels[i]
    } for i, s in enumerate(solution)]
    return dict2json(points, path)
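A possible call, assuming a 2-D t-SNE layout for a small network with two layers of 3 and 2 states (shapes and values here are purely illustrative):

import numpy as np

solution = np.random.randn(5, 2)          # 5 states laid out in 2-D
points_json = solution2json(solution, states_num=[3, 2],
                            labels=[0, 1, 0, 1, 1], path=None)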
Example #3
def strength2json(strength_list, words, labels=None, path=None):
    """
    A helper function that converts the results of get_empirical_strength
        to a standard format for serving web requests.
    :param strength_list: a list of ndarray (n_layer, n_states)
    :param words: word (str) for each strength
    :param labels: additional labels
    :param path: saving path
    :return:
    """
    if labels is None:
        labels = [0] * len(strength_list)
    points = [{
        'word': words[i],
        'strength': strength.tolist(),
        'label': labels[i]
    } for i, strength in enumerate(strength_list)]
    return dict2json(points, path)
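For reference, a hypothetical call with two words and a model of 2 layers with 4 states each:

import numpy as np

strengths = [np.random.rand(2, 4) for _ in range(2)]   # one (n_layer, n_states) array per word
strength_json = strength2json(strengths, words=['good', 'bad'], path=None)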
Example #4
def store_sst(data_path, name, split_scheme, upsert=False):
    """
    Process and store the SST (Stanford Sentiment Treebank) dataset to db
    :param data_path: path of the directory containing the SST data files
    :param name: the name of the dataset
    :param split_scheme: a dict mapping set names ('train', 'valid', 'test') to split fractions
    :param upsert: whether to overwrite existing records
    :return:
    """
    if not path_exists(data_path):
        download_sst(os.path.abspath(os.path.join(data_path, '../')))
    if upsert:
        insertion = replace_one_if_exists
    else:
        insertion = insert_one_if_not_exists
    phrase_path = os.path.join(data_path, "dictionary.txt")
    sentence_path = os.path.join(data_path, "datasetSentences.txt")
    label_path = os.path.join(data_path, "sentiment_labels.txt")
    sentence_split_path = os.path.join(data_path, "datasetSplit.txt")
    processor = SSTProcessor(sentence_path, phrase_path, label_path, sentence_split_path)

    split_data = split(list(zip(processor.ids, processor.labels, range(1, len(processor.labels)+1))),
                       split_scheme.values(), shuffle=True)
    split_data = dict(zip(split_scheme.keys(), split_data))
    sentence_data_ids = processor.split_sentence_ids

    word_to_id_json = dict2json(processor.word_to_id)
    insertion('word_to_id', {'name': name}, {'name': name, 'data': word_to_id_json})
    insertion('id_to_word', {'name': name}, {'name': name, 'data': processor.id_to_word})
    for i, set_name in enumerate(['train', 'valid', 'test']):
        data, ids = zip(*(sentence_data_ids[i]))
        insertion('sentences', {'name': name, 'set': set_name},
                  {'name': name, 'set': set_name, 'data': data, 'ids': ids})

    if 'train' not in split_data:
        print('WARN: there is no train data in the split data!')
    data_dict = {}
    for set_name in ['train', 'valid', 'test']:
        if set_name in split_data:
            data, label, ids = zip(*split_data[set_name])
            # bucket sentiment scores in [0, 1] into 5 classes: 0, 1, 2, 3, 4
            label = [float(i) for i in label]
            label = [(0 if i <= 0.2 else 1 if i <= 0.4 else 2 if i <= 0.6 else 3 if i <= 0.8 else 4) for i in label]
            data_dict[set_name] = {'data': data, 'label': label, 'ids': ids}
    store_dataset_by_default(name, data_dict, upsert)
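split_scheme maps set names to fractions, and split (defined elsewhere) divides the data accordingly. A rough sketch of such a helper, under the assumption that it returns one chunk per fraction, in order:

import random

def split(data, fractions, shuffle=False):
    # Hypothetical reconstruction: cut data into consecutive chunks whose
    # sizes are proportional to fractions, e.g. [0.8, 0.1, 0.1].
    data = list(data)
    if shuffle:
        random.shuffle(data)
    fractions = list(fractions)
    total = sum(fractions)
    chunks, start = [], 0
    for i, frac in enumerate(fractions):
        end = len(data) if i == len(fractions) - 1 else start + round(len(data) * frac / total)
        chunks.append(data[start:end])
        start = end
    return chunks

# e.g. store_sst('data/sst', 'sst', split_scheme={'train': 0.8, 'valid': 0.1, 'test': 0.1})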
Example #5
def store_imdb(data_path, name, n_words=100000, upsert=False):
    if upsert:
        insertion = replace_one_if_exists
    else:
        insertion = insert_one_if_not_exists
    word_to_id, id_to_word = imdb.load_dict(os.path.join(data_path, 'imdb.dict.pkl.gz'), n_words)
    data_label = imdb.load_data(os.path.join(data_path, 'imdb.pkl'), n_words)
    word_to_id_json = dict2json(word_to_id)
    insertion('word_to_id', {'name': name}, {'name': name, 'data': word_to_id_json})
    insertion('id_to_word', {'name': name}, {'name': name, 'data': id_to_word})

    data_dict = {}
    for i, set_name in enumerate(['train', 'valid', 'test']):
        data, label = data_label[i]
        ids = list(range(len(data)))
        # insertion('sentences', {'name': name, 'set': set_name},
        #           {'name': name, 'set': set_name, 'data': data, 'label': label, 'ids': ids})
        data_dict[set_name] = {'data': data, 'label': label, 'ids': ids}
    store_dataset_by_default(name, data_dict, upsert)
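replace_one_if_exists and insert_one_if_not_exists are the two insertion strategies used throughout these examples. Assuming the backing store is MongoDB (a guess, as is the database name), they could look roughly like this on top of pymongo:

from pymongo import MongoClient

db = MongoClient()['rnnvis']  # hypothetical database handle

def insert_one_if_not_exists(collection, filter_, doc):
    # Insert doc only when no existing document matches filter_.
    if db[collection].find_one(filter_) is None:
        return db[collection].insert_one(doc)
    return None

def replace_one_if_exists(collection, filter_, doc):
    # Replace the matching document, inserting doc if none exists yet.
    return db[collection].replace_one(filter_, doc, upsert=True)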
Example #6
def store_ptb(data_path, name='ptb', upsert=False):
    """
    Process and store the ptb datasets to db
    :param data_path:
    :param name:
    :param upsert:
    :return:
    """
    if upsert:
        if get_datasets_by_name(name, ['word_to_id']) is not None:
            print('dataset {:s} already exists. overwriting...'.format(name))
    elif get_datasets_by_name(name, ['word_to_id']) is not None:
        print(
            'dataset {:s} already exists. skipped. If you want to overwrite, set upsert=True!'
            .format(name))
        return
    train_path = os.path.join(data_path, "ptb.train.txt")
    valid_path = os.path.join(data_path, "ptb.valid.txt")
    test_path = os.path.join(data_path, "ptb.test.txt")
    paths = [train_path, valid_path, test_path]

    data_list, word_to_id, id_to_word = load_data_as_ids(paths)
    tag_list = load_data_as_pos_tags(paths)
    train_tags, valid_tags, test_tags = tag_list
    train, valid, test = data_list
    word_to_id_json = dict2json(word_to_id)
    if upsert:
        insertion = replace_one_if_exists
    else:
        insertion = insert_one_if_not_exists

    insertion('word_to_id', {'name': name}, {
        'name': name,
        'data': word_to_id_json
    })
    insertion('id_to_word', {'name': name}, {'name': name, 'data': id_to_word})
    data_dict = {
        'train': {
            'data': train
        },
        'valid': {
            'data': valid
        },
        'test': {
            'data': test
        }
    }
    tags_dict = {
        'train_pos': {
            'data': train_tags
        },
        'valid_pos': {
            'data': valid_tags
        },
        'test_pos': {
            'data': test_tags
        }
    }
    store_dataset_by_default(name, data_dict, upsert)
    store_dataset_by_default(name, tags_dict, upsert)
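A hypothetical call; the standard ptb.train.txt, ptb.valid.txt and ptb.test.txt files are expected under the given directory:

store_ptb('data/ptb', name='ptb', upsert=True)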
Example #7
def store_plain_text(data_path,
                     name,
                     split_scheme,
                     min_freq=1,
                     max_vocab=10000,
                     remove_punct=False,
                     upsert=False):
    """
    Process any plain text and store to db
    :param data_path: path of the plain text file
    :param name: the name of the dataset
    :param split_scheme: a dict mapping set names ('train', 'valid', 'test') to split fractions
    :param min_freq: words with frequency lower than this are tagged as rare words
    :param max_vocab: the maximum vocabulary size
    :param remove_punct: whether to remove punctuation
    :param upsert: whether to overwrite existing records
    :return:
    """
    if upsert:
        if get_datasets_by_name(name, ['word_to_id']) is not None:
            print('dataset {:s} already exists. overwriting...'.format(name))
    elif get_datasets_by_name(name, ['word_to_id']) is not None:
        print(
            'dataset {:s} already exists. skipped. If you want to overwrite, set upsert=True!'
            .format(name))
        return
    if upsert:
        insertion = replace_one_if_exists
    else:
        insertion = insert_one_if_not_exists
    processor = PlainTextProcessor(data_path, remove_punct=remove_punct)
    processor.tag_rare_word(min_freq, max_vocab)
    split_ids_tags = split(
        list(zip(processor.flat_ids, processor.flat_pos_tags)),
        split_scheme.values())
    split_ids = []
    split_tags = []
    for split_id_tag in split_ids_tags:
        ids, tags = zip(*split_id_tag)
        split_ids.append(ids)
        split_tags.append(tags)
    split_data = dict(zip(split_scheme.keys(), split_ids))
    split_tags = dict(zip(split_scheme.keys(), split_tags))
    if 'train' not in split_data:
        print('WARN: there is no train data in the split data!')
    data_dict = {}
    for set_name in ['train', 'valid', 'test']:
        if set_name in split_data:
            data_dict[set_name] = {'data': split_data[set_name]}
    tags_dict = {}
    for set_name in ['train', 'valid', 'test']:
        if set_name in split_tags:
            tags_dict[set_name + '_pos'] = {'data': split_tags[set_name]}
    store_dataset_by_default(name, data_dict, upsert)
    store_dataset_by_default(name, tags_dict, upsert)
    insertion('word_to_id', {'name': name}, {
        'name': name,
        'data': dict2json(processor.word_to_id)
    })
    insertion('id_to_word', {'name': name}, {
        'name': name,
        'data': processor.id_to_word
    })
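A possible call on an arbitrary plain-text corpus (the file name and fractions are only illustrative):

store_plain_text('data/tinyshakespeare.txt', name='shakespeare',
                 split_scheme={'train': 0.8, 'valid': 0.1, 'test': 0.1},
                 min_freq=2, max_vocab=10000, remove_punct=True, upsert=True)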
Example #8
def store_yelp(data_path, name, n_words=10000, upsert=False):
    if upsert:
        insertion = replace_one_if_exists
    else:
        insertion = insert_one_if_not_exists
    with open(os.path.join(data_path, 'review_label.json'), 'r') as file:
        data = json.load(file)
    if name == 'yelp-2':
        tmp_ = []
        for item in data:
            if item['label'] == 3:
                continue
            item['label'] = 0 if item['label'] < 3 else 1
            tmp_.append(item)
        data = tmp_
    training_data, validate_data, test_data = split(data, fractions=[0.8, 0.1, 0.1], shuffle=True)
    all_words = []
    reviews = []
    stars = []
    for item in training_data:
        tokenized_review = list(itertools.chain.from_iterable(tokenize(item['review'], remove_punct=True)[0]))
        reviews.append(tokenized_review)
        stars.append(item['label'])
        all_words.extend(tokenized_review)
    # for w in all_words:
    #     if isinstance(w, list):
    #         print("found a list" + str(w))
    word_to_id, counter, words = tokens2vocab(all_words)
    n_words -= 1
    word_to_id = {k: v+1 for k, v in word_to_id.items() if v < n_words}
    word_to_id['<unk>'] = 0

    id_to_word = [None] * len(word_to_id)
    for word, id_ in word_to_id.items():
        id_to_word[id_] = word

    reviews = [[word_to_id.get(t, 0) for t in sentence] for sentence in reviews]
    training_data = (reviews, stars)

    tmp_data = []
    for _data in [validate_data, test_data]:
        reviews = []
        stars = []
        for item in _data:
            tokenized_review = list(itertools.chain.from_iterable(tokenize(item['review'])[0]))
            reviews.append([word_to_id.get(t, 0) for t in tokenized_review])
            stars.append(item['label'])
        tmp_data.append((reviews, stars))
    validate_data = tmp_data[0]
    test_data = tmp_data[1]

    word_to_id_json = dict2json(word_to_id)
    insertion('word_to_id', {'name': name}, {'name': name, 'data': word_to_id_json})
    insertion('id_to_word', {'name': name}, {'name': name, 'data': id_to_word})

    data_names = ['train', 'valid', 'test']
    data_dict = {}
    for i, data_set in enumerate([training_data, validate_data, test_data]):
        data_set = tuple(zip(*sorted(zip(*data_set), key=lambda x: len(x[0]))))
        data, label = data_set
        ids = list(range(len(data)))
        data_dict[data_names[i]] = {'data': data, 'label': label, 'ids': ids}
        insertion('sentences', {'name': name, 'set': data_names[i]},
                  {'name': name, 'set': data_names[i], 'data': data, 'label': label, 'ids': ids})
    store_dataset_by_default(name, data_dict, upsert)
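tokens2vocab is assumed to build a frequency-ranked vocabulary from the flat token list, so that the v < n_words filter above keeps the most frequent words. One way it might be written:

import collections

def tokens2vocab(tokens):
    # Hypothetical reconstruction: ids are assigned by descending frequency.
    counter = collections.Counter(tokens)
    words = [w for w, _ in counter.most_common()]
    word_to_id = {w: i for i, w in enumerate(words)}
    return word_to_id, counter, words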
Example #9
    def generate(self, sess, seeds, logdir=None, max_branch=3, accum_cond_prob=0.9,
                 min_cond_prob=0.1, min_prob=0.001, max_step=10, neg_word_ids=None):
        """
        Generate sequence tree with given seed (a word_id) and certain requirements
        Note that the method always try to generate as much branches as possible.
        :param sess: the sess to run the model
        :param seeds: a list of word_id or a list of words
        :param logdir: the file path to save the generating tree
        :param max_branch: the maximum number of branches at each node
        :param accum_cond_prob: the maximum accumulate conditional probability of the following branches
        :param min_cond_prob: the minimum conditional probability of each branch
        :param min_prob: the minimum probability of a branch (note that this indicates a multiplication along the tree)
        :param max_step: the step to generate
        :param neg_word_ids: a set of neglected words or words' ids.
        :return: if logdir is None, returns a dict object representing the tree. if logdir is not None, return None
        """

        model = self.model
        model.reset_state()
        # Initialize the tree and insert the seed nodes
        tree = Tree()
        # convert words into ids if needed
        if (not isinstance(seeds, list)) or len(seeds) < 1:
            raise ValueError("seeds should be a list of words or ids")
        if isinstance(seeds[0], str):
            _seeds = self.get_id_from_word(seeds)
            seeds = _seeds
        parent = GenerateNode(seeds[0], 1.0, 1.0)
        tree.add_node(parent, None)
        for seed in seeds[1:]:
            node = GenerateNode(seed, 1.0, 1.0)
            tree.add_node(node, parent)
            parent = node
        if neg_word_ids is None:
            neg_word_ids = []
        else:
            neg_word_ids = list(neg_word_ids)  # accept any iterable, e.g. a set
            if isinstance(neg_word_ids[0], str):
                neg_word_ids = self.get_id_from_word(neg_word_ids)
            elif not isinstance(neg_word_ids[0], int):
                raise TypeError("neg_word_ids should be an iterable containing word tokens (str) or word ids (int)!")
        neg_word_ids = set(neg_word_ids)  # converts to set for easier `in` statement
        # print(neg_word_ids)

        buffer_size = self.model.batch_size

        def _generate(_buffer, step):
            if step > max_step:  # Already at the maximum generating step
                return
            if len(_buffer) > buffer_size:
                _b = []
                for j in range(0, len(_buffer), buffer_size):
                    _b += _generate(_buffer[j:(j+buffer_size)], step)
                return _b
            nodes, states = zip(*_buffer)
            word_ids = [n.word_id for n in nodes]
            prev_probs = [n.prob for n in nodes]
            states = _pack_list_to_states(states, buffer_size)
            # pad with -1s so that the input shape matches the batch size
            word_ids += [-1] * (buffer_size - len(word_ids))

            # prev_prob = node.prob
            # The second input is just a placeholder. See the implementation of model.run()
            model.current_state = states
            evals, _ = model.run(np.array(word_ids).reshape(buffer_size, 1), None, 1, sess,
                                 eval_ops={'projected': model.projected_outputs})
            new_buffer = []
            # shape: [batch_size * num_steps, project_size]
            batch_outputs = evals['projected'][0]
            current_states = _convert_state_to_list(model.current_state, len(_buffer))

            def _filter_and_append(outputs, pos):

                # do softmax so that outputs represents probs
                outputs = losses.softmax(outputs)
                # Get the top-k probabilities and their ids; since some of them
                # may be excluded later, fetch a few more than the top k
                max_id = np.argpartition(-outputs, max_branch)[:(max_branch+len(neg_word_ids))]
                del_ids = []
                for i, id_ in enumerate(max_id):
                    if int(id_) in neg_word_ids:
                        del_ids.append(i)
                max_id = np.delete(max_id, del_ids)
                max_id = max_id[:max_branch]
                cond_probs = outputs[max_id]
                # Sort the cond_probs for later filtering use
                sort_indice = np.argsort(-cond_probs)
                max_id = max_id[sort_indice]
                cond_probs = cond_probs[sort_indice]
                prob_sum = np.sum(cond_probs)
                # do filtering according to accum_prob
                while len(cond_probs) > 0:
                    if accum_cond_prob > prob_sum:
                        break
                    prob_sum -= cond_probs[-1]
                    cond_probs = cond_probs[:-1]
                # do filtering according to min_cond_prob
                while len(cond_probs) > 0:
                    if cond_probs[-1] > min_cond_prob:
                        break
                    cond_probs = cond_probs[:-1]
                while len(cond_probs) > 0:
                    if cond_probs[-1] * prev_probs[pos] > min_prob:
                        break
                    # the probability of this branch is too small
                    cond_probs = cond_probs[:-1]
                if len(cond_probs) == 0:  # No available nodes to generate
                    return
                max_id = max_id[:len(cond_probs)]
                for word_id, cond_prob in zip(max_id, cond_probs):
                    new_node = GenerateNode(int(word_id), float(cond_prob*prev_probs[pos]), float(cond_prob))
                    tree.add_node(new_node, nodes[pos])
                for child in tree.get_children(nodes[pos]):
                    new_buffer.append((child, current_states[pos]))

            for j in range(len(_buffer)):
                _filter_and_append(batch_outputs[j], j)

            return new_buffer

        start_time = time.time()
        model.init_state(sess)
        buffer = [(parent, _convert_state_to_list(model.current_state, 1)[0])]
        for i in range(len(seeds), max_step):
            buffer = _generate(buffer, i)
            if len(buffer) == 0:
                break
        print("total_time: {:f}s, speed: {:f}wps".format(time.time() - start_time, len(tree)/(time.time()-start_time)))
        for node in tree.nodes():
            node.word = self.get_word_from_id(node.word_id)
        # print(tree.as_dict())
        return dict2json(tree.as_dict(), logdir)
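A sketch of how this method might be driven, assuming generator is an instance of the surrounding class wrapping a trained model and sess is an open TensorFlow session (all names here are placeholders):

tree = generator.generate(sess, seeds=['the'], logdir=None,
                          max_branch=3, min_cond_prob=0.05,
                          max_step=8, neg_word_ids=['<unk>', '<eos>'])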