import collections

# word_tokenize comes from the repo's tokenizer_utils module
# (see the import in the main() example further below).


def fit_text(data_file_path, max_vocab_size=None):
    if max_vocab_size is None:
        max_vocab_size = 5000

    counter = collections.Counter()
    max_len = 0
    labels = dict()
    # Each line of the training file is expected to be "label<TAB>sentence".
    with open(data_file_path, mode='rt', encoding='utf8') as file:
        for line in file:
            label, sentence = line.strip().split('\t')
            tokens = [x.lower() for x in word_tokenize(sentence)]
            for token in tokens:
                counter[token] += 1
            max_len = max(max_len, len(tokens))
            if label not in labels:
                labels[label] = len(labels)

    # Index the most frequent words; because word2idx is a defaultdict(int),
    # any word outside this vocabulary silently maps to index 0.
    word2idx = collections.defaultdict(int)
    for idx, (word, _) in enumerate(counter.most_common(max_vocab_size)):
        word2idx[word] = idx
    idx2word = {v: k for k, v in word2idx.items()}
    vocab_size = len(word2idx) + 1

    model = dict()

    model['word2idx'] = word2idx
    model['idx2word'] = idx2word
    model['vocab_size'] = vocab_size
    model['max_len'] = max_len
    model['labels'] = labels

    return model
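A minimal usage sketch for fit_text: the training-file path matches the one used in the main() example further down, while the sample sentence and the Keras 2.x pad_sequences import are illustrative assumptions. It shows how the returned dictionary drives sentence encoding.

# Sketch only; fit_text and word_tokenize are assumed to be in scope.
from keras.preprocessing.sequence import pad_sequences

text_model = fit_text('./data/umich-sentiment-train.txt')
word2idx = text_model['word2idx']
tokens = [w.lower() for w in word_tokenize('I loved this movie')]
wids = [word2idx[w] if w in word2idx else 0 for w in tokens]  # unknown word -> 0
x = pad_sequences([wids], maxlen=text_model['max_len'])
print(x.shape, text_model['labels'])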
    # predict() and fit() below are methods of the WordVecMultiChannelCnn classifier.
    def predict(self, sentence):
        xs = []
        tokens = [w.lower() for w in word_tokenize(sentence)]
        # Out-of-vocabulary tokens are mapped to index len(self.word2idx).
        wid = [self.word2idx[token] if token in self.word2idx else len(self.word2idx)
               for token in tokens]
        xs.append(wid)
        x = pad_sequences(xs, self.max_len)
        output = self.model.predict(x)
        return output[0]
    def fit(self, text_data_model, text_label_pairs, model_dir_path,
            test_size=None, random_state=None,
            epochs=None, batch_size=None):
        if epochs is None:
            epochs = 10
        if batch_size is None:
            batch_size = 16
        if test_size is None:
            test_size = 0.3
        if random_state is None:
            random_state = 42

        self.config = text_data_model
        self.idx2word = self.config['idx2word']
        self.word2idx = self.config['word2idx']
        self.max_len = self.config['max_len']
        self.vocab_size = self.config['vocab_size']
        self.labels = self.config['labels']

        verbose = 1

        config_file_path = WordVecMultiChannelCnn.get_config_file_path(model_dir_path)
        np.save(config_file_path, text_data_model)

        max_input_tokens = len(self.word2idx)
        self.model = self.define_model(self.max_len, max_input_tokens)
        with open(self.get_architecture_file_path(model_dir_path), 'wt') as architecture_file:
            architecture_file.write(self.model.to_json())

        xs = []
        ys = []
        for text, label in text_label_pairs:
            tokens = [x.lower() for x in word_tokenize(text)]
            # Out-of-vocabulary words are encoded as 0 at training time.
            wid_list = [self.word2idx[w] if w in self.word2idx else 0 for w in tokens]
            xs.append(wid_list)
            ys.append(self.labels[label])

        X = pad_sequences(xs, maxlen=self.max_len)
        Y = np_utils.to_categorical(ys, len(self.labels))

        weight_file_path = WordVecMultiChannelCnn.get_weight_file_path(model_dir_path)
        checkpoint = ModelCheckpoint(weight_file_path)

        # The same padded input is fed to each of the model's three channels.
        history = self.model.fit([X, X, X], Y, epochs=epochs, batch_size=batch_size,
                                 validation_split=test_size,
                                 verbose=verbose, callbacks=[checkpoint])
        # save the model
        self.model.save(weight_file_path)

        np.save(model_dir_path + '/' + WordVecMultiChannelCnn.model_name + '-history.npy', history.history)

        return history
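For context, a hedged end-to-end sketch of how these pieces fit together: fit_text builds the vocabulary model, fit() trains the classifier on (text, label) pairs loaded with the repo's load_text_label_pairs helper, and predict() scores a new sentence. The constructor call, directory paths, and sample sentence are illustrative assumptions.

# Sketch only; WordVecMultiChannelCnn is assumed to be importable from the repo
# and to expose fit()/predict() exactly as shown above.
from keras_sentiment_analysis.library.utility.simple_data_loader import load_text_label_pairs

data_file_path = './data/umich-sentiment-train.txt'
model_dir_path = './models'  # assumed output directory

text_data_model = fit_text(data_file_path)
text_label_pairs = load_text_label_pairs(data_file_path)

classifier = WordVecMultiChannelCnn()
classifier.fit(text_data_model, text_label_pairs, model_dir_path,
               epochs=10, batch_size=16)

print(classifier.predict('I loved this movie'))  # per-label probabilities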
Example #4
def main():
    random_state = 42
    np.random.seed(random_state)

    current_dir = os.path.dirname(__file__)
    sys.path.append(os.path.join(current_dir, '..'))
    current_dir = current_dir if current_dir != '' else '.'

    data_file_path = current_dir + '/data/umich-sentiment-train.txt'

    from keras_sentiment_analysis.library.utility.simple_data_loader import load_text_label_pairs
    from keras_sentiment_analysis.library.utility.tokenizer_utils import word_tokenize

    text_label_pairs = load_text_label_pairs(data_file_path)

    shuffle(text_label_pairs)

    config_file_path = current_dir + '/models/tf/wordvec_cnn_lstm.csv'
    first_line = True
    max_len = 0
    word2idx = dict()
    # The exported config stores max_len on the first line, followed by
    # tab-separated word/index pairs; lines starting with 'label' are skipped.
    with open(config_file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            if first_line:
                first_line = False
                max_len = int(line.strip())
            elif not line.startswith('label'):
                word, idx = line.strip().split('\t')
                word2idx[word] = int(idx)

    # Load the frozen graph using TensorFlow 1.x graph APIs
    # (tf.gfile / tf.GraphDef / tf.Session).
    with tf.gfile.FastGFile(current_dir + '/models/tf/wordvec_cnn_lstm.pb',
                            'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        _ = tf.import_graph_def(graph_def, name='')

    with tf.Session() as sess:
        # Print every node name in the imported graph to locate the I/O tensors.
        for n in sess.graph.as_graph_def().node:
            print(n.name)
        predict_op = sess.graph.get_tensor_by_name('output_node0:0')

        for i in range(20):
            sentence, label = text_label_pairs[i]

            xs = []
            tokens = [w.lower() for w in word_tokenize(sentence)]
            wid = [
                word2idx[token] if token in word2idx else len(word2idx)
                for token in tokens
            ]
            xs.append(wid)
            x = pad_sequences(xs, max_len)

            predicted = sess.run(
                predict_op,
                feed_dict={
                    "embedding_1_input:0": x,
                    'spatial_dropout1d_1/keras_learning_phase:0': 0
                })

            print(predicted)
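The loading code above depends on TensorFlow 1.x APIs. For reference, here is a minimal sketch of the same graph-loading step on TensorFlow 2.x through the compat.v1 layer; the .pb path and tensor name are taken from the example above, everything else is an assumption.

# Sketch: loading the same frozen graph on TensorFlow 2.x via tf.compat.v1.
import tensorflow as tf

tf.compat.v1.disable_eager_execution()

with tf.io.gfile.GFile('./models/tf/wordvec_cnn_lstm.pb', 'rb') as f:
    graph_def = tf.compat.v1.GraphDef()
    graph_def.ParseFromString(f.read())
tf.compat.v1.import_graph_def(graph_def, name='')

with tf.compat.v1.Session() as sess:
    predict_op = sess.graph.get_tensor_by_name('output_node0:0')
    # Build x with pad_sequences and call sess.run(predict_op, feed_dict=...)
    # exactly as in main() above.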