def predict(self, sentence):
    xs = []
    # tokens = [w.lower() for w in word_tokenize(sentence)]
    tokens = [w for w in word_tokenize(sentence)]
    # map each token to its vocabulary index; tokens not in word2idx fall back to index 1
    wid = [self.word2idx[token] if token in self.word2idx else 1 for token in tokens]
    xs.append(wid)
    x = pad_sequences(xs, self.max_len)
    output = self.model.predict(x)
    return output[0]
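# Illustrative, self-contained sketch (not from the original source) of the token-to-index
# mapping and padding step that predict() performs, using a toy vocabulary. The import path
# below is an assumption matching the classic Keras API used elsewhere in this code.
from keras.preprocessing.sequence import pad_sequences

toy_word2idx = {'the': 2, 'dog': 3, 'barked': 4}            # hypothetical vocabulary
toy_tokens = ['the', 'dog', 'barked', 'loudly']             # 'loudly' is out of vocabulary
toy_wid = [toy_word2idx[t] if t in toy_word2idx else 1 for t in toy_tokens]
print(pad_sequences([toy_wid], maxlen=8))                   # -> [[0 0 0 0 2 3 4 1]]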
def fit_text(data_file_path, label_type, max_vocab_size=None):
    if max_vocab_size is None:
        max_vocab_size = 5000
    counter = collections.Counter()
    file = open(data_file_path, mode='rt', encoding='utf8')
    next(file)  # skip header
    max_len = 0
    labels = dict()
    for line in file:
        lst = line.strip().split(',')
        sentence = lst[0]
        if label_type == 'Predicate':
            label = lst[1]
        elif label_type == 'FrameNet':
            label = lst[2]
        # tokens = [x.lower() for x in word_tokenize(sentence)]
        tokens = [x for x in word_tokenize(sentence)]  # keep the original casing of words
        for token in tokens:
            counter[token] += 1
        max_len = max(max_len, len(tokens))
        if label not in labels:
            # assign each new label an index starting from 0 (this is not a per-label count)
            labels[label] = len(labels)
    file.close()
    word2idx = collections.defaultdict(int)
    for idx, word in enumerate(counter.most_common(max_vocab_size)):
        word2idx[word[0]] = idx
    idx2word = {v: k for k, v in word2idx.items()}
    vocab_size = len(word2idx) + 1
    model = dict()
    model['word2idx'] = word2idx
    model['idx2word'] = idx2word
    model['vocab_size'] = vocab_size
    model['max_len'] = max_len
    model['labels'] = labels
    return model
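# Usage sketch for fit_text() (illustrative, not from the original source): write a tiny CSV
# in the expected layout (sentence, Predicate label, FrameNet label) to a temporary file and
# inspect the returned text model. The rows below are made up purely for demonstration.
import tempfile

_demo_rows = [
    'sentence,predicate,frame',                    # header line is skipped by fit_text
    'He bought a car,buy,Commerce_buy',
    'She sold the house,sell,Commerce_sell',
]
with tempfile.NamedTemporaryFile('w', suffix='.csv', delete=False, encoding='utf8') as f:
    f.write('\n'.join(_demo_rows))
    demo_csv_path = f.name

text_model = fit_text(demo_csv_path, label_type='FrameNet')
print(text_model['vocab_size'], text_model['max_len'], text_model['labels'])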
def fit(self, text_data_model, text_label_pairs, model_dir_path,
        batch_size=None, epochs=None, test_size=None, random_state=None):
    if batch_size is None:
        batch_size = 64
    if epochs is None:
        epochs = 20
    if test_size is None:
        test_size = 0.3
    if random_state is None:
        random_state = 42

    self.config = text_data_model
    self.idx2word = self.config['idx2word']
    self.word2idx = self.config['word2idx']
    self.max_len = self.config['max_len']
    self.vocab_size = self.config['vocab_size']
    self.labels = self.config['labels']

    np.save(self.get_config_file_path(model_dir_path), self.config)

    self.create_model()
    json = self.model.to_json()
    open(self.get_architecture_file_path(model_dir_path), 'w').write(json)

    xs = []
    ys = []
    for text, label in text_label_pairs:
        # tokens = [x.lower() for x in word_tokenize(text)]
        tokens = [x for x in word_tokenize(text)]
        wid_list = list()
        for w in tokens:
            wid = 0
            if w in self.word2idx:
                wid = self.word2idx[w]
            wid_list.append(wid)
        xs.append(wid_list)
        ys.append(self.labels[str(label)])

    X = pad_sequences(xs, maxlen=self.max_len)
    Y = np_utils.to_categorical(ys, len(self.labels))

    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, stratify=Y, random_state=random_state)

    print('===========================================')
    print('Below are the shapes of the train/test datasets.')
    print('===========================================')
    print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
    print('===========================================')

    weight_file_path = self.get_weight_file_path(model_dir_path)
    checkpoint = ModelCheckpoint(weight_file_path)

    print('===========================================')
    print('========= Now training the model... =======')
    print('===========================================')

    history = self.model.fit(x=x_train, y=y_train,
                             batch_size=batch_size, epochs=epochs,
                             validation_data=(x_test, y_test),
                             callbacks=[checkpoint],
                             verbose=1)
    self.model.save_weights(weight_file_path)

    np.save(model_dir_path + '/' + WordVecCnn.model_name + '-history.npy', history.history)

    # score = self.model.evaluate(x=x_test, y=y_test, batch_size=batch_size, verbose=1)
    # print('score: ', score[0])
    # print('accuracy: ', score[1])
    # print('f1: ', score[2])
    # print('precision: ', score[3])
    # print('recall: ', score[4])

    return history
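# Hypothetical end-to-end sketch for WordVecCnn (not from the original source): the
# WordVecCnn() constructor, the load_text_label_pairs() helper and the paths below are
# assumptions made only for illustration of how fit_text() and fit() are wired together.
# text_model = fit_text('./data/frame_data.csv', label_type='FrameNet')
# text_label_pairs = load_text_label_pairs('./data/frame_data.csv', label_type='FrameNet')
# classifier = WordVecCnn()
# history = classifier.fit(text_data_model=text_model,
#                          text_label_pairs=text_label_pairs,
#                          model_dir_path='./models',
#                          batch_size=64, epochs=20)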
def fit(self, text_data_model, text_label_pairs, model_dir_path,
        test_size=None, random_state=None, epochs=None, batch_size=None):
    if epochs is None:
        epochs = 20
    if batch_size is None:
        batch_size = 32
    if test_size is None:
        test_size = 0.3
    if random_state is None:
        random_state = 42

    self.config = text_data_model
    self.idx2word = self.config['idx2word']
    self.word2idx = self.config['word2idx']
    self.max_len = self.config['max_len']
    self.vocab_size = self.config['vocab_size']
    self.labels = self.config['labels']

    verbose = 1

    config_file_path = WordVecMultiChannelCnn.get_config_file_path(model_dir_path)
    np.save(config_file_path, text_data_model)

    max_input_tokens = len(self.word2idx)
    self.model = self.define_model(self.max_len, max_input_tokens)
    open(self.get_architecture_file_path(model_dir_path), 'wt').write(self.model.to_json())

    xs = []
    ys = []
    for text, label in text_label_pairs:
        # tokens = [x.lower() for x in word_tokenize(text)]
        tokens = [x for x in word_tokenize(text)]
        wid_list = list()
        for w in tokens:
            wid = 0
            if w in self.word2idx:
                wid = self.word2idx[w]
            wid_list.append(wid)
        xs.append(wid_list)
        ys.append(self.labels[str(label)])

    X = pad_sequences(xs, maxlen=self.max_len)
    Y = np_utils.to_categorical(ys, len(self.labels))

    weight_file_path = WordVecMultiChannelCnn.get_weight_file_path(model_dir_path)
    checkpoint = ModelCheckpoint(weight_file_path)

    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, stratify=Y, random_state=random_state)

    print('===========================================')
    print('Below are the shapes of the train/test datasets.')
    print('===========================================')
    print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
    print('===========================================')
    print()
    print('===========================================')
    print('========= Now training the model... =======')
    print('===========================================')

    # the same padded input is fed to each of the three channels of the multi-channel CNN
    history = self.model.fit([x_train, x_train, x_train], y_train,
                             epochs=epochs, batch_size=batch_size,
                             validation_data=([x_test, x_test, x_test], y_test),
                             verbose=verbose, callbacks=[checkpoint])

    # save the model
    self.model.save(weight_file_path)

    np.save(model_dir_path + '/' + WordVecMultiChannelCnn.model_name + '-history.npy',
            history.history)

    return history
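# Sketch (not from the original source) of reading back the training history saved above;
# the './models' directory is an assumed model_dir_path used only for illustration. np.save
# stores the history dict as a pickled object, hence allow_pickle=True and .item().
# hist = np.load('./models/' + WordVecMultiChannelCnn.model_name + '-history.npy',
#                allow_pickle=True).item()
# print(hist.get('val_acc', hist.get('val_accuracy')))   # key name depends on the Keras version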