Example #1
    def fit(self, text_data_model, text_label_pairs, model_dir_path, batch_size=None, epochs=None,
            test_size=None, random_state=None):
        if batch_size is None:
            batch_size = 64
        if epochs is None:
            epochs = 20
        if test_size is None:
            test_size = 0.3
        if random_state is None:
            random_state = 42

        self.config = text_data_model
        self.idx2word = self.config['idx2word']
        self.word2idx = self.config['word2idx']
        self.max_len = self.config['max_len']
        self.vocab_size = self.config['vocab_size']
        self.labels = self.config['labels']

        np.save(self.get_config_file_path(model_dir_path), self.config)

        self.create_model()
        architecture_json = self.model.to_json()  # avoid shadowing the json module
        with open(self.get_architecture_file_path(model_dir_path), 'w') as architecture_file:
            architecture_file.write(architecture_json)

        xs = []
        ys = []
        for text, label in text_label_pairs:
            tokens = [x.lower() for x in word_tokenize(text)]
            wid_list = list()
            for w in tokens:
                wid = 0  # index 0 is reserved for padding and out-of-vocabulary tokens
                if w in self.word2idx:
                    wid = self.word2idx[w]
                wid_list.append(wid)
            xs.append(wid_list)
            ys.append(self.labels[label])

        X = pad_sequences(xs, maxlen=self.max_len)
        Y = np_utils.to_categorical(ys, len(self.labels))

        x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=random_state)
        print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

        weight_file_path = self.get_weight_file_path(model_dir_path)

        checkpoint = ModelCheckpoint(weight_file_path)

        history = self.model.fit(x=x_train, y=y_train, batch_size=batch_size, epochs=epochs,
                                 validation_data=(x_test, y_test), callbacks=[checkpoint],
                                 verbose=1)

        self.model.save_weights(weight_file_path)

        np.save(model_dir_path + '/' + WordVecCnnLstm.model_name + '-history.npy', history.history)

        score = self.model.evaluate(x=x_test, y=y_test, batch_size=batch_size, verbose=1)
        print('score: ', score[0])
        print('accuracy: ', score[1])

        return history
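A minimal usage sketch for the method above (not part of the original example): it assumes WordVecCnnLstm takes no constructor arguments and that fit_text (Example #7 below) built the text_data_model dict; the directory paths and the loader are placeholders.

    # Hypothetical usage: train the classifier on (text, label) pairs.
    text_data_model = fit_text('./data')                 # './data' is a placeholder path
    text_label_pairs = load_text_label_pairs('./data')   # hypothetical loader returning [(text, label), ...]

    classifier = WordVecCnnLstm()
    history = classifier.fit(text_data_model=text_data_model,
                             text_label_pairs=text_label_pairs,
                             model_dir_path='./models',
                             batch_size=64, epochs=20)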
Example #2
    def fit(self, text_data_model, text_label_pairs, model_dir_path, batch_size=None, epochs=None,
            test_size=None, random_state=None):
        if batch_size is None:
            batch_size = 64
        if epochs is None:
            epochs = 20
        if test_size is None:
            test_size = 0.3
        if random_state is None:
            random_state = 42

        self.config = text_data_model
        self.idx2word = self.config['idx2word']
        self.word2idx = self.config['word2idx']
        self.max_len = self.config['max_len']
        self.vocab_size = self.config['vocab_size']
        self.labels = self.config['labels']

        np.save(self.get_config_file_path(model_dir_path), self.config)

        self.create_model()
        architecture_json = self.model.to_json()  # avoid shadowing the json module
        with open(self.get_architecture_file_path(model_dir_path), 'w') as architecture_file:
            architecture_file.write(architecture_json)

        ys = []
        X = np.zeros(shape=(len(text_label_pairs), self.glove_model.embedding_dim))
        for i, (text, label) in enumerate(text_label_pairs):
            words = [w.lower() for w in word_tokenize(text)]
            E = np.zeros(shape=(self.glove_model.embedding_dim, self.max_len))
            # only the first max_len tokens fit into E; longer documents are truncated
            for j in range(min(len(words), self.max_len)):
                word = words[j]
                try:
                    E[:, j] = self.glove_model.encode_word(word)
                except KeyError:
                    pass  # out-of-vocabulary words stay as zero columns
            X[i, :] = np.sum(E, axis=1)
            ys.append(self.labels[label])
        Y = np_utils.to_categorical(ys, len(self.labels))

        x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=random_state)
        print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

        weight_file_path = self.get_weight_file_path(model_dir_path)

        checkpoint = ModelCheckpoint(weight_file_path)

        history = self.model.fit(x=x_train, y=y_train, batch_size=batch_size, epochs=epochs,
                                 validation_data=(x_test, y_test), callbacks=[checkpoint],
                                 verbose=1)

        self.model.save_weights(weight_file_path)

        np.save(model_dir_path + '/' + WordVecGloveFFN.model_name + '-history.npy', history.history)

        score = self.model.evaluate(x=x_test, y=y_test, batch_size=batch_size, verbose=1)
        print('score: ', score[0])
        print('accuracy: ', score[1])

        return history
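The only substantive difference from Example #1 is the input encoding: instead of padded word-index sequences, each document is collapsed into one fixed-size vector by summing the GloVe embeddings of its words, with out-of-vocabulary words contributing nothing. A standalone sketch of that encoding, with word2em assumed to map words to NumPy vectors of length embedding_dim:

    import numpy as np
    from nltk.tokenize import word_tokenize

    def sum_of_embeddings(text, word2em, embedding_dim):
        # Collapse a document into a single vector by summing its word embeddings;
        # words missing from word2em are simply skipped (zero contribution).
        doc_vec = np.zeros(embedding_dim)
        for word in (w.lower() for w in word_tokenize(text)):
            if word in word2em:
                doc_vec += word2em[word]
        return doc_vec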
Example #3
    def predict(self, sentence):
        xs = []
        tokens = [w.lower() for w in word_tokenize(sentence)]
        # unknown tokens map to index 0, matching the convention used in fit/fit_text
        wid = [self.word2idx[token] if token in self.word2idx else 0 for token in tokens]
        xs.append(wid)
        x = pad_sequences(xs, self.max_len)
        output = self.model.predict(x)
        return output[0]
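A usage sketch for predict (the trained classifier and the sentence are placeholders): the returned vector holds one probability per class, so np.argmax recovers the label index, and inverting the labels dict from fit_text gives the label name.

    import numpy as np

    probs = classifier.predict('the plot was thin but the acting was superb')
    idx2label = {idx: name for name, idx in classifier.labels.items()}  # labels: name -> index
    print(idx2label[int(np.argmax(probs))])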
Example #4
    def encode_docs(self, docs, max_allowed_doc_length=None):
        if max_allowed_doc_length is None:
            max_allowed_doc_length = 500
        doc_count = len(docs)
        X = np.zeros(shape=(doc_count, self.embedding_dim))
        max_len = 0
        for doc in docs:
            max_len = max(max_len, len(word_tokenize(doc)))
        max_len = min(max_len, max_allowed_doc_length)
        for i in range(0, doc_count):
            doc = docs[i]
            words = [w.lower() for w in word_tokenize(doc)]
            E = np.zeros(shape=(self.embedding_dim, max_len))
            for j in range(min(len(words), max_len)):
                word = words[j]
                try:
                    E[:, j] = self.word2em[word]
                except KeyError:
                    pass  # words without an embedding stay as zero columns
            X[i, :] = np.sum(E, axis=1)

        return X
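A short usage sketch for encode_docs, assuming the surrounding class is a GloVe-style wrapper whose word2em table and embedding_dim are already loaded (glove_model is a placeholder name):

    docs = ['the food was wonderful',
            'service was slow and the room was noisy']
    X = glove_model.encode_docs(docs, max_allowed_doc_length=100)
    print(X.shape)  # (2, embedding_dim): one summed-embedding row per document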
Example #5
    def predict(self, sentence):

        tokens = [w.lower() for w in word_tokenize(sentence)]

        X = np.zeros(shape=(1, self.glove_model.embedding_dim))
        E = np.zeros(shape=(self.glove_model.embedding_dim, self.max_len))
        for j in range(min(len(tokens), self.max_len)):
            word = tokens[j]
            try:
                E[:, j] = self.glove_model.encode_word(word)
            except KeyError:
                pass  # out-of-vocabulary words contribute a zero column
        X[0, :] = np.sum(E, axis=1)
        output = self.model.predict(X)
        return output[0]
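Usage mirrors Example #3; a sketch assuming a trained WordVecGloveFFN instance named ffn:

    import numpy as np

    output = ffn.predict('an instant classic')
    print('predicted class index:', int(np.argmax(output)))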
Example #6
    def encode_doc(self, doc, max_allowed_doc_length=None):
        if max_allowed_doc_length is None:
            max_allowed_doc_length = 500

        words = [w.lower() for w in word_tokenize(doc)]
        max_len = min(len(words), max_allowed_doc_length)
        E = np.zeros(shape=(self.embedding_dim, max_len))
        X = np.zeros(shape=(self.embedding_dim, ))
        for j in range(max_len):
            word = words[j]
            try:
                E[:, j] = self.word2em[word]
            except KeyError:
                pass
        X[:] = np.sum(E, axis=1)
        return X
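encode_doc is the single-document counterpart of encode_docs in Example #4; a one-line usage sketch, with glove_model again a placeholder for the loaded wrapper:

    vec = glove_model.encode_doc('a quiet, moving film')
    print(vec.shape)  # (embedding_dim,)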
Example #7
def fit_text(data_dir_path, max_vocab_size=None, label_type=None):
    if label_type is None:
        label_type = 'line_type'
    if max_vocab_size is None:
        max_vocab_size = 5000
    counter = collections.Counter()
    max_len = 0
    labels = dict()
    for f in os.listdir(data_dir_path):
        data_file_path = os.path.join(data_dir_path, f)
        if os.path.isfile(data_file_path) and f.lower().endswith('.txt'):
            with open(data_file_path, mode='rt', encoding='utf8') as file:
                for line in file:
                    line_type, line_label, sentence = line.strip().split('\t')
                    tokens = [x.lower() for x in word_tokenize(sentence)]
                    for token in tokens:
                        counter[token] += 1
                    max_len = max(max_len, len(tokens))
                    # pick the label column according to label_type
                    label = line_label if label_type == 'line_label' else line_type
                    if label not in labels:
                        labels[label] = len(labels)

    word2idx = collections.defaultdict(int)
    for idx, word in enumerate(counter.most_common(max_vocab_size)):
        word2idx[word[0]] = idx + 1  # reserve index 0 for padding and unknown words
    idx2word = {v: k for k, v in word2idx.items()}
    vocab_size = len(word2idx) + 1

    model = dict()

    model['word2idx'] = word2idx
    model['idx2word'] = idx2word
    model['vocab_size'] = vocab_size
    model['max_len'] = max_len
    model['labels'] = labels

    return model
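fit_text expects a directory of .txt files whose lines are tab-separated triples (line_type, line_label, sentence); the dict it returns is the text_data_model consumed by the fit methods above. A sketch with a placeholder directory:

    text_data_model = fit_text('./data', max_vocab_size=5000, label_type='line_type')
    print('vocab size:', text_data_model['vocab_size'])
    print('longest sentence (tokens):', text_data_model['max_len'])
    print('labels:', text_data_model['labels'])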
Example #8
    def fit(self,
            text_data_model,
            text_label_pairs,
            model_dir_path,
            test_size=None,
            random_state=None,
            epochs=None,
            batch_size=None):
        if epochs is None:
            epochs = 10
        if batch_size is None:
            batch_size = 16
        if test_size is None:
            test_size = 0.3
        if random_state is None:
            random_state = 42

        self.config = text_data_model
        self.idx2word = self.config['idx2word']
        self.word2idx = self.config['word2idx']
        self.max_len = self.config['max_len']
        self.vocab_size = self.config['vocab_size']
        self.labels = self.config['labels']

        verbose = 1

        config_file_path = WordVecMultiChannelCnn.get_config_file_path(
            model_dir_path)
        np.save(config_file_path, text_data_model)

        max_input_tokens = len(self.word2idx) + 1  # +1 so index 0 (padding/unknown) fits in the embedding
        self.model = self.define_model(self.max_len, max_input_tokens)
        with open(self.get_architecture_file_path(model_dir_path), 'wt') as architecture_file:
            architecture_file.write(self.model.to_json())

        xs = []
        ys = []
        for text, label in text_label_pairs:
            tokens = [x.lower() for x in word_tokenize(text)]
            wid_list = list()
            for w in tokens:
                wid = 0  # index 0 is reserved for padding and out-of-vocabulary tokens
                if w in self.word2idx:
                    wid = self.word2idx[w]
                wid_list.append(wid)
            xs.append(wid_list)
            ys.append(self.labels[label])

        X = pad_sequences(xs, maxlen=self.max_len)
        Y = np_utils.to_categorical(ys, len(self.labels))

        weight_file_path = WordVecMultiChannelCnn.get_weight_file_path(
            model_dir_path)
        checkpoint = ModelCheckpoint(weight_file_path)

        history = self.model.fit([X, X, X],
                                 Y,
                                 epochs=epochs,
                                 batch_size=batch_size,
                                 validation_split=test_size,
                                 verbose=verbose,
                                 callbacks=[checkpoint])
        # save the model
        self.model.save(weight_file_path)

        np.save(
            model_dir_path + '/' + WordVecMultiChannelCnn.model_name +
            '-history.npy', history.history)

        return history
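Because the multi-channel model feeds the same padded sequence to each of its three input branches ([X, X, X] above), training is driven exactly like Example #1; a sketch, with text_data_model and text_label_pairs prepared as in the earlier examples:

    classifier = WordVecMultiChannelCnn()
    history = classifier.fit(text_data_model=text_data_model,
                             text_label_pairs=text_label_pairs,
                             model_dir_path='./models',
                             epochs=10, batch_size=16)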