Example #1
    def test_load_embd(self):
        sentences = [
            ['All', 'work', 'and', 'no', 'play'],
            ['makes', 'Jack', 'a', 'dull', 'boy', '.'],
        ]
        dict_generator = get_dicts_generator(
            word_min_freq=1,
            char_min_freq=1,
            word_ignore_case=False,
            char_ignore_case=False,
        )
        for sentence in sentences:
            dict_generator(sentence)
        word_dict, char_dict, _ = dict_generator(return_dict=True)

        current = os.path.dirname(os.path.abspath(__file__))
        word_embd_file_path = os.path.join(current, 'demo_word_embd.txt')
        weights = get_embedding_weights_from_file(word_dict,
                                                  word_embd_file_path,
                                                  ignore_case=True)
        self.assertEqual((len(word_dict), 3), weights.shape)
        self.assertEqual([0.1, 0.2, 0.3], weights[word_dict['All']].tolist())
        self.assertEqual([0.4, 0.5, 0.6], weights[word_dict['work']].tolist())
        self.assertEqual([0.7, 0.8, 0.9], weights[word_dict['and']].tolist())

        char_embd_file_path = os.path.join(current, 'demo_char_embd.txt')
        weights = get_embedding_weights_from_file(char_dict,
                                                  char_embd_file_path,
                                                  ignore_case=True)
        self.assertEqual((len(char_dict), 3), weights.shape)
        self.assertEqual([0.1, 0.2, 0.3], weights[char_dict['A']].tolist())
        self.assertEqual([0.4, 0.5, 0.6], weights[char_dict['l']].tolist())
        self.assertEqual([0.7, 0.8, 0.9], weights[char_dict['w']].tolist())
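
The demo embedding files read by this test are not shown on this page. Judging from the asserted vectors and the whitespace-splitting parser in Example #5 below, demo_word_embd.txt presumably uses the plain GloVe text format (token followed by its vector components); a hypothetical fixture that would satisfy the word assertions:

# Hypothetical contents of demo_word_embd.txt, inferred from the
# assertions above; not the actual test fixture.
with open('demo_word_embd.txt', 'w') as writer:
    writer.write('All 0.1 0.2 0.3\n')
    writer.write('work 0.4 0.5 0.6\n')
    writer.write('and 0.7 0.8 0.9\n')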
Example #2
    def test_ignore_case(self):
        sentences = [
            ['All', 'work', 'and', 'no', 'play', ''],
            ['all', 'worK', 'and', 'no', 'play', '.'],
        ]
        dict_generator = get_dicts_generator(
            word_min_freq=2,
            char_min_freq=2,
            word_ignore_case=True,
            char_ignore_case=True,
        )
        for sentence in sentences:
            dict_generator(sentence)
        word_dict, char_dict, max_word_len = dict_generator(return_dict=True)
        # 'work' and 'play' are the longest case-folded words kept.
        self.assertEqual(4, max_word_len)
        # Five case-folded words occur at least twice, plus the reserved
        # entries for padding ('') and unknown words ('<UNK>').
        self.assertEqual(7, len(word_dict))
        self.assertTrue('all' in word_dict)
        self.assertTrue('work' in word_dict)
        self.assertTrue('k' in char_dict)
Example #3
    def test_no_word(self):
        sentences = [
            ['All', 'work', 'and', 'no', 'play'],
            ['makes', 'Jack', 'a', 'dull', 'boy', '.'],
        ]
        dict_generator = get_dicts_generator(
            word_min_freq=2,
            char_min_freq=2,
            word_ignore_case=False,
            char_ignore_case=False,
        )
        for sentence in sentences:
            dict_generator(sentence)
        word_dict, char_dict, max_word_len = dict_generator(return_dict=True)
        # No word occurs twice (case-sensitive), so only the reserved
        # padding and '<UNK>' entries remain and max_word_len stays 0.
        self.assertEqual(0, max_word_len)
        self.assertEqual(2, len(word_dict))
        self.assertTrue('u' not in char_dict)
        self.assertTrue('A' not in char_dict)
        self.assertTrue('n' in char_dict)
        self.assertTrue('a' in char_dict)
Example #4
        targets_test)
# one-hot encoding for class labels
onehot_train = to_categorical(encoded_labels_train)
onehot_dev = to_categorical(encoded_labels_dev)
onehot_test = to_categorical(encoded_labels_test)

print("max sequence length:", max(len(s) for s in sentences_train))
print("min sequence length:", min(len(s) for s in sentences_train))
s = sorted(len(s) for s in sentences_train)
print("median sequence length:", s[len(s) // 2])
del train

#--- Generate dictionaries for words and characters
dicts_generator = get_dicts_generator(
    word_min_freq=5,
    char_min_freq=2,
    word_ignore_case=True,
    char_ignore_case=False,
)
for sentence in sentences_train:
    dicts_generator(get_word_list_eng(sentence))
word_dict, char_dict, max_word_len = dicts_generator(
    return_dict=True
)  # word_dict and char_dict map each word (or character) to its index in the vocabulary
print('Word dict size: %d  Char dict size: %d  Max word len: %d' %
      (len(word_dict), len(char_dict), max_word_len))

#--- Write word and char dicts to JSON files, one object per line
with open(WORD_DICT, 'a') as output_wd:
    json.dump(word_dict, output_wd, ensure_ascii=False)
    output_wd.write('\n')
with open(CHAR_DICT, 'a') as output_cd:
    json.dump(char_dict, output_cd, ensure_ascii=False)
    output_cd.write('\n')
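
Since each dict is written as a single JSON line, it can be restored later by parsing one line per dict. A minimal sketch, assuming the same WORD_DICT path:

# Minimal sketch: restore the word dict written above (assumes one JSON
# object per line, as produced by the json.dump calls).
with open(WORD_DICT) as reader:
    word_dict = json.loads(reader.readline())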
Example #5
    def train(self, data=None, *args, **kwargs):
        """
        This method is for training the cnn model. After training procedure, the model will be saved in model.h5 file
        :param data: is not used in this method since the training and validating files has been given in read_dataset() method
        :return: None
        """
        dicts_generator = get_dicts_generator(word_min_freq=2,
                                              char_min_freq=2,
                                              word_ignore_case=True,
                                              char_ignore_case=False)
        for sentence in self.train_sentences:
            dicts_generator(sentence)
        self.word_dict, self.char_dict, self.max_word_len = dicts_generator(
            return_dict=True)
        if os.path.exists(self.WORD_EMBD_PATH):
            print('Embedding...')
            self.word_dict = {
                '': 0,
                '<UNK>': 1,
            }
            with codecs.open(self.WORD_EMBD_PATH, 'r', 'utf8') as reader:
                print('Embedding open file')
                for line in reader:
                    line = line.strip()
                    if not line:
                        continue
                    word = line.split()[0].lower()
                    if word not in self.word_dict:
                        self.word_dict[word] = len(self.word_dict)
                print('Embedding for loop')
            self.word_embd_weights = get_embedding_weights_from_file(
                self.word_dict,
                self.WORD_EMBD_PATH,
                ignore_case=True,
            )
            print('Embedding done')
        else:
            # A pretrained embedding file is required for this model.
            raise FileNotFoundError('embedding file is not found')
        print('Embedding all done')
        train_steps = (len(self.train_sentences) + self.BATCH_SIZE -
                       1) // self.BATCH_SIZE
        valid_steps = (len(self.valid_sentences) + self.BATCH_SIZE -
                       1) // self.BATCH_SIZE

        self.model = build_model(rnn_num=self.RNN_NUM,
                                 rnn_units=self.RNN_UNITS,
                                 word_dict_len=len(self.word_dict),
                                 char_dict_len=len(self.char_dict),
                                 max_word_len=self.max_word_len,
                                 output_dim=len(self.TAGS),
                                 word_embd_weights=self.word_embd_weights)
        self.model.summary()

        if os.path.exists(self.MODEL_PATH):
            print("loading model from: ", self.MODEL_PATH)
            self.model.load_weights(self.MODEL_PATH, by_name=True)
        else:
            print('Fitting...')
            self.model.fit_generator(
                generator=self.batch_generator(self.train_sentences,
                                               self.train_taggings,
                                               train_steps),
                steps_per_epoch=train_steps,
                epochs=self.EPOCHS,
                validation_data=self.batch_generator(self.valid_sentences,
                                                     self.valid_taggings,
                                                     valid_steps),
                validation_steps=valid_steps,
                callbacks=[
                    keras.callbacks.EarlyStopping(monitor='val_loss',
                                                  patience=2),
                    keras.callbacks.EarlyStopping(
                        monitor='val_categorical_accuracy', patience=2),
                ],
                verbose=True,
            )

            self.model.save_weights(self.MODEL_PATH)
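
The batch_generator method used above is not part of this excerpt. A hypothetical sketch of such a method, assuming keras_wc_embd's get_batch_input helper (sentences, max_word_len, word_dict, char_dict, plus case flags) pads a batch into word and character index arrays, and reusing the BATCH_SIZE and TAGS attributes seen in this class:

import numpy
from keras_wc_embd import get_batch_input

def batch_generator(self, sentences, taggings, steps):
    # Loop forever; fit_generator pulls steps_per_epoch batches per epoch.
    while True:
        for i in range(steps):
            batch_sentences = sentences[i * self.BATCH_SIZE:(i + 1) * self.BATCH_SIZE]
            batch_taggings = taggings[i * self.BATCH_SIZE:(i + 1) * self.BATCH_SIZE]
            word_input, char_input = get_batch_input(
                batch_sentences,
                self.max_word_len,
                self.word_dict,
                self.char_dict,
                word_ignore_case=True,
                char_ignore_case=False,
            )
            # Pad each tagging to the padded sentence length, then
            # one-hot encode against the tag set.
            sentence_len = word_input.shape[1]
            batch_taggings = [
                tags + [0] * (sentence_len - len(tags))
                for tags in batch_taggings
            ]
            batch_taggings = keras.utils.to_categorical(
                numpy.asarray(batch_taggings), len(self.TAGS))
            yield [word_input, char_input], batch_taggings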
Example #6
    return Model(
        inputs=[premise_word_input_layer, hypothesis_word_input_layer],
        outputs=x)


training = get_data('../data/snli_1.0_train.jsonl')
validation = get_data('../data/snli_1.0_dev.jsonl')
test = get_data('../data/snli_1.0_test.jsonl')
sentences = (training[0] + training[1] + validation[0] + validation[1] +
             test[0] + test[1])

from keras_wc_embd import get_dicts_generator

dict_generator = get_dicts_generator(
    word_min_freq=1,
    char_min_freq=1,
    word_ignore_case=False,
    char_ignore_case=False,
)
for sentence in sentences:
    dict_generator(sentence)

word_dict, char_dict, _ = dict_generator(return_dict=True)


def get_input(sentences,
              word_unknown=1,
              char_unknown=1,
              word_ignore_case=False,
              char_ignore_case=False):
    sentence_num = len(sentences)
    max_sentence_len = params['max_length']
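
The get_input excerpt cuts off here. A hypothetical sketch of the word-indexing step such a function typically performs (get_word_input is a made-up name, not the project's actual code): every sentence is padded to max_sentence_len with index 0, and out-of-vocabulary words map to the word_unknown index, as the parameters above suggest:

import numpy

def get_word_input(sentences, word_dict, max_sentence_len,
                   word_unknown=1, word_ignore_case=False):
    # Pad each sentence with index 0; unknown words map to word_unknown.
    word_input = numpy.zeros((len(sentences), max_sentence_len), dtype='int32')
    for i, sentence in enumerate(sentences):
        for j, word in enumerate(sentence[:max_sentence_len]):
            if word_ignore_case:
                word = word.lower()
            word_input[i, j] = word_dict.get(word, word_unknown)
    return word_input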
Example #7
            if parts[0] != '-DOCSTART-':
                sentences[-1].append(parts[0])
                taggings[-1].append(TAGS[parts[-1]])
    if not sentences[-1]:
        sentences.pop()
        taggings.pop()
    return sentences, taggings


print('Loading...')
train_sentences, train_taggings = load_data(DATA_TRAIN_PATH)
valid_sentences, valid_taggings = load_data(DATA_VALID_PATH)

dicts_generator = get_dicts_generator(
    word_min_freq=2,
    char_min_freq=2,
    word_ignore_case=True,
    char_ignore_case=False
)
for sentence in train_sentences:
    dicts_generator(sentence)
word_dict, char_dict, max_word_len = dicts_generator(return_dict=True)

if os.path.exists(WORD_EMBD_PATH):
    print('Embedding...')
    word_dict = {
        '': 0,
        '<UNK>': 1,
    }
    with codecs.open(WORD_EMBD_PATH, 'r', 'utf8') as reader:
        for line in reader:
            line = line.strip()
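
The excerpt ends mid-loop, but Example #5 above contains this same embedding-reload block, which suggests the likely continuation (inferred from Example #5, not shown in the original):

            if not line:
                continue
            word = line.split()[0].lower()
            if word not in word_dict:
                word_dict[word] = len(word_dict)
    word_embd_weights = get_embedding_weights_from_file(
        word_dict,
        WORD_EMBD_PATH,
        ignore_case=True,
    )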