Example no. 1
 def test_negative01(self):
     y = [
         {"LOCATION": [(55, 63), (66, 84), (87, 93)], "PERSON": [(281, 289)]}
     ]
     true_err_msg = re.escape('There are too few samples in the data set! Minimal number of samples is 2.')
     with self.assertRaisesRegex(ValueError, true_err_msg):
         _, _ = split_dataset(y, 0.3333, n_restarts=4)
Example no. 2
 def test_negative02(self):
     X = np.array([
         '01abc', '02def', '03ghi', '04jkl', '05mno', '06pqr', '07stu',
         '08vwx', '09yza', '10bcd', '11efg', '12hij'
     ], dtype=str)
     y_tokenized = np.array([
         [0, 0, 2, 1, 1, 0, 0, 0, 2, 0, 4, 3, 0],
         [2, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 4, 0, 0, 4, 3, 0, 0, 0, 0, 0],
         [4, 3, 0, 4, 3, 3, 0, 0, 0, 0, 0, 2, 1],
         [0, 0, 0, 2, 1, 0, 0, 0, 4, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 6, 5, 5, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 6, 5, 4, 3, 0, 0, 0],
         [0, 0, 0, 4, 4, 0, 0, 0, 0, 0, 2, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 3, 0],
         [0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 2, 1, 1],
     ],
                            dtype=np.int32)
     true_err_msg = re.escape(
         '{0} is too small value of the test part! '
         'There are no samples for testing subset!'.format(0.01))
     with self.assertRaisesRegex(ValueError, true_err_msg):
         _, _ = split_dataset(X,
                              y_tokenized,
                              0.01,
                              n_restarts=4,
                              random_seed=0)
Example no. 3
 def test_negative04(self):
     X = np.array([
         '01abc', '02def', '03ghi', '04jkl', '05mno', '06pqr', '07stu',
         '08vwx', '09yza', '10bcd', '11efg', '12hij'
     ], dtype=str)
     y_tokenized = np.array([
         [0, 0, 2, 1, 1, 0, 0, 0, 2, 0, 4, 3, 0],
         [2, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 4, 0, 0, 4, 3, 0, 0, 0, 0, 0],
         [4, 3, 0, 4, 3, 3, 0, 0, 0, 0, 0, 2, 1],
         [0, 0, 0, 2, 1, 0, 0, 0, 4, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 6, 5, 5, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 6, 5, 4, 3, 0, 0, 0],
         [0, 0, 0, 4, 4, 0, 0, 0, 0, 0, 2, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 3, 0],
         [0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 2, 1, 1],
     ],
                            dtype=np.int32)
     true_err_msg = re.escape(
         '1 is too small value of restarts number. It must be greater than 1.'
     )
     with self.assertRaisesRegex(ValueError, true_err_msg):
         _, _ = split_dataset(X,
                              y_tokenized,
                              0.3333,
                              n_restarts=1,
                              random_seed=0)
Example no. 4
 def test_negative04(self):
     y = [
         {"LOCATION": [(55, 63), (66, 84), (87, 93)], "PERSON": [(281, 289)]},
         {"PERSON": [(33, 44)], "LOCATION": [(198, 204), (189, 197), (168, 185)], "ORG": [(230, 249)]},
         {"PERSON": [(87, 98)], "ORG": [(18, 42), (18, 56)]},
         {"LOCATION": [(151, 157)], "PERSON": [(130, 140)]}
     ]
     true_err_msg = re.escape('1 is too small value of restarts number. It must be greater than 1.')
     with self.assertRaisesRegex(ValueError, true_err_msg):
         _, _ = split_dataset(y, 0.3333, n_restarts=1)
Example no. 5
 def test_negative03(self):
     y = [
         {"LOCATION": [(55, 63), (66, 84), (87, 93)], "PERSON": [(281, 289)]},
         {"PERSON": [(33, 44)], "LOCATION": [(198, 204), (189, 197), (168, 185)], "ORG": [(230, 249)]},
         {"PERSON": [(87, 98)], "ORG": [(18, 42), (18, 56)]},
         {"LOCATION": [(151, 157)], "PERSON": [(130, 140)]}
     ]
     true_err_msg = re.escape('{0} is too large value of the test part! '
                              'There are no samples for training subset!'.format(0.99))
     with self.assertRaisesRegex(ValueError, true_err_msg):
         _, _ = split_dataset(y, 0.99, n_restarts=4)
Example no. 6
 def test_negative01(self):
     X = np.array([
         '01abc',
     ], dtype=str)
     y_tokenized = np.array([
         [0, 0, 2, 1, 1, 0, 0, 0, 2, 0, 4, 3, 0],
     ],
                            dtype=np.int32)
     true_err_msg = re.escape(
         'There are too few samples in the data set! Minimal number of samples is 2.'
     )
     with self.assertRaisesRegex(ValueError, true_err_msg):
         _, _ = split_dataset(X,
                              y_tokenized,
                              0.3333,
                              n_restarts=4,
                              random_seed=0)
Example no. 7
 def test_positive01(self):
     base_dir = os.path.join(os.path.dirname(__file__), 'testdata')
     _, y = load_dataset_from_json(os.path.join(base_dir, 'true_named_entities.json'))
     train_index, test_index = split_dataset(y, 0.3, 10)
     self.assertIsInstance(train_index, np.ndarray)
     self.assertIsInstance(test_index, np.ndarray)
     self.assertEqual(len(y), len(train_index) + len(test_index))
     self.assertEqual(len(train_index), len(set(train_index.tolist())))
     self.assertEqual(len(test_index), len(set(test_index.tolist())))
     self.assertEqual(0, len(set(train_index.tolist()) & set(test_index.tolist())))
     true_set_of_classes = {'ORG', 'PERSON', 'LOCATION'}
     set_of_classes_for_training = set()
     for idx in train_index:
         set_of_classes_for_training |= set(y[idx].keys())
     set_of_classes_for_testing = set()
     for idx in test_index:
         set_of_classes_for_testing |= set(y[idx].keys())
     self.assertEqual(set_of_classes_for_training, set_of_classes_for_testing)
     self.assertEqual(true_set_of_classes, set_of_classes_for_training)
     self.assertEqual(true_set_of_classes, set_of_classes_for_testing)
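The positive test above fixes the contract of split_dataset for annotation dictionaries: the returned train and test index arrays are disjoint NumPy arrays that together cover the whole dataset, and both subsets contain every entity class. A minimal sketch of one way to satisfy that contract is shown below (a random split retried until every class appears on both sides); the name naive_stratified_split and the restart loop are illustrative assumptions, not the library's actual implementation.

import numpy as np

def naive_stratified_split(y, test_part=0.3, n_restarts=10, random_seed=None):
    # y is a list of dicts mapping an entity class to a list of (start, end) spans
    rng = np.random.RandomState(random_seed)
    n_samples = len(y)
    n_test = max(1, int(round(n_samples * test_part)))
    all_classes = set()
    for sample in y:
        all_classes |= set(sample.keys())
    for _ in range(n_restarts):
        indices = rng.permutation(n_samples)
        test_index = np.sort(indices[:n_test])
        train_index = np.sort(indices[n_test:])
        train_classes = set()
        for idx in train_index:
            train_classes |= set(y[idx].keys())
        test_classes = set()
        for idx in test_index:
            test_classes |= set(y[idx].keys())
        # accept the split only if both subsets cover every entity class
        if train_classes == all_classes and test_classes == all_classes:
            return train_index, test_index
    raise ValueError('No admissible split was found after {0} restarts.'.format(n_restarts))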
Example no. 8
 def test_positive01(self):
     X = np.array([
         '01abc', '02def', '03ghi', '04jkl', '05mno', '06pqr', '07stu',
         '08vwx', '09yza', '10bcd', '11efg', '12hij'
     ], dtype=str)
     y_tokenized = np.array(
         [
             [0, 0, 2, 1, 1, 0, 0, 0, 2, 0, 4, 3, 0],  # 0 1 2 3 4 # 0 2 3 4
             [0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # 0 2 6     # 0
             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # 0         # 0
             [0, 0, 0, 4, 0, 0, 4, 3, 0, 0, 0, 0, 0],  # 0 3 4     # 0 3 4
             [4, 3, 0, 4, 3, 3, 0, 0, 0, 0, 0, 2, 1],  # 0 1 2 3 4 # 0 2 3 4
             [0, 0, 0, 2, 1, 0, 0, 0, 4, 0, 0, 0, 0],  # 0 1 2 4   # 0 2 4
             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # 0         # 0
             [0, 0, 0, 6, 5, 5, 0, 0, 0, 0, 0, 0, 0],  # 0 5 6     # 0
             [0, 0, 0, 0, 0, 0, 6, 5, 4, 3, 0, 0, 0],  # 0 3 4 5 6 # 0 3 4
             [0, 0, 0, 4, 4, 0, 0, 0, 0, 0, 2, 0, 0],  # 0 2 4     # 0 2 4
             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 3, 0],  # 0 3 4     # 0 3 4
             [0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 2, 1, 1],  # 0 1 2 4   # 0 2 4
         ],
         dtype=np.int32)
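     # Overall frequency of each label value in y_tokenized (label: count):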
     # 0: 120
     # 1: 6
     # 2: 7
     # 3: 7
     # 4: 11
     # 5: 3
     # 6: 3
     true_indices_for_training = np.array([1, 4, 5, 6, 7, 8, 10, 11],
                                          dtype=np.int32)
     true_indices_for_testing = np.array([0, 2, 3, 9], dtype=np.int32)
     calc_indices_for_training, calc_indices_for_testing = split_dataset(
         X, y_tokenized, 0.3333, n_restarts=4, random_seed=0)
     self.assertIsInstance(calc_indices_for_training, np.ndarray)
     self.assertIsInstance(calc_indices_for_testing, np.ndarray)
     self.assertEqual(true_indices_for_training.tolist(),
                      calc_indices_for_training.tolist())
     self.assertEqual(true_indices_for_testing.tolist(),
                      calc_indices_for_testing.tolist())
Example no. 9
def train(factrueval2016_devset_dir: str,
          split_by_paragraphs: bool,
          bert_will_be_tuned: bool,
          use_lang_features: bool,
          use_shapes: bool,
          lstm_layer_size: Union[int, None],
          l2: float,
          max_epochs: int,
          patience: int,
          batch_size: int,
          gpu_memory_frac: float,
          model_name: str,
          collection3_dir: Union[str, None] = None,
          n_max_samples: int = 0) -> BERT_NER:
    if os.path.isfile(model_name):
        with open(model_name, 'rb') as fp:
            recognizer = pickle.load(fp)
        assert isinstance(recognizer, BERT_NER)
        print('The NER has been successfully loaded from the file `{0}`...'.
              format(model_name))
        print('')
    else:
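        # Convert the FactRuEval-2016 devset into a temporary JSON file, load it
        # as (X, y), and always remove the temporary file afterwards.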
        temp_json_name = tempfile.NamedTemporaryFile(mode='w').name
        try:
            factrueval2016_to_json(factrueval2016_devset_dir, temp_json_name,
                                   split_by_paragraphs)
            X, y = load_dataset_from_json(temp_json_name)
        finally:
            if os.path.isfile(temp_json_name):
                os.remove(temp_json_name)
        print('The FactRuEval-2016 data for training have been loaded...')
        print('Number of samples is {0}.'.format(len(y)))
        print('')
        if BERT_NER.PATH_TO_BERT is None:
            bert_hub_module_handle = 'https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1'
        else:
            bert_hub_module_handle = None
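        # Build the BERT-based recognizer; a lower learning rate (3e-6 instead of
        # 1e-4) is used when the BERT encoder itself is fine-tuned.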
        recognizer = BERT_NER(finetune_bert=bert_will_be_tuned,
                              batch_size=batch_size,
                              l2_reg=l2,
                              bert_hub_module_handle=bert_hub_module_handle,
                              lstm_units=lstm_layer_size,
                              validation_fraction=0.25,
                              max_epochs=max_epochs,
                              patience=patience,
                              gpu_memory_frac=gpu_memory_frac,
                              verbose=True,
                              random_seed=42,
                              lr=3e-6 if bert_will_be_tuned else 1e-4,
                              udpipe_lang='ru',
                              use_nlp_features=use_lang_features,
                              use_shapes=use_shapes)
        if collection3_dir is None:
            if n_max_samples > 0:
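                # Hold out a validation subset with split_dataset, then subsample
                # the remaining training part down to n_max_samples.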
                train_index, test_index = split_dataset(
                    y=y, test_part=recognizer.validation_fraction)
                X_train = np.array(X, dtype=object)[train_index]
                y_train = np.array(y, dtype=object)[train_index]
                X_val = np.array(X, dtype=object)[test_index]
                y_val = np.array(y, dtype=object)[test_index]
                del train_index, test_index
                index = sample_from_dataset(y=y_train, n=n_max_samples)
                recognizer.fit(X_train[index],
                               y_train[index],
                               validation_data=(X_val, y_val))
            else:
                recognizer.fit(X, y)
        else:
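            # Train on Collection3 (loaded from brat annotations), renaming its
            # PER and LOC tags to PERSON and LOCATION, and use the FactRuEval-2016
            # data as the validation set.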
            X_train, y_train = load_dataset_from_brat(collection3_dir,
                                                      split_by_paragraphs=True)
            if not split_by_paragraphs:
                X_train, y_train = divide_dataset_by_sentences(
                    X_train, y_train, sent_tokenize_func=ru_sent_tokenize)
            for sample_idx in range(len(y_train)):
                new_y_sample = dict()
                for ne_type in sorted(list(y_train[sample_idx].keys())):
                    if ne_type == 'PER':
                        new_y_sample['PERSON'] = y_train[sample_idx][ne_type]
                    elif ne_type == 'LOC':
                        new_y_sample['LOCATION'] = y_train[sample_idx][ne_type]
                    else:
                        new_y_sample[ne_type] = y_train[sample_idx][ne_type]
                y_train[sample_idx] = new_y_sample
                del new_y_sample
            print('The Collection3 data for training have been loaded...')
            print('Number of samples is {0}.'.format(len(y_train)))
            print('')
            if n_max_samples > 0:
                index = sample_from_dataset(y=y_train, n=n_max_samples)
                X_train = np.array(X_train, dtype=object)[index]
                y_train = np.array(y_train, dtype=object)[index]
                del index
            recognizer.fit(X_train, y_train, validation_data=(X, y))
        with open(model_name, 'wb') as fp:
            pickle.dump(recognizer, fp)
        print('')
        print(
            'The NER has been successfully fitted and saved into the file `{0}`...'
            .format(model_name))
        print('')
    return recognizer
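For context, a call to this training helper might look as follows; every path, hyperparameter value, and the model file name here are purely illustrative.

recognizer = train(
    factrueval2016_devset_dir='factRuEval-2016/devset',  # hypothetical path
    split_by_paragraphs=True,
    bert_will_be_tuned=False,
    use_lang_features=False,
    use_shapes=False,
    lstm_layer_size=256,
    l2=1e-3,
    max_epochs=10,
    patience=3,
    batch_size=16,
    gpu_memory_frac=0.9,
    model_name='bert_ner.pkl'  # hypothetical file name
)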
Example no. 10
def train(factrueval2016_devset_dir: str,
          split_by_paragraphs: bool,
          elmo_will_be_tuned: bool,
          use_lang_features: bool,
          use_shapes: bool,
          max_epochs: int,
          patience: int,
          batch_size: int,
          lr: float,
          l2: float,
          gpu_memory_frac: float,
          model_name: str,
          collection3_dir: Union[str, None] = None,
          n_max_samples: int = 0) -> ELMo_NER:
    if os.path.isfile(model_name):
        with open(model_name, 'rb') as fp:
            recognizer = pickle.load(fp)
        assert isinstance(recognizer, ELMo_NER)
        print('The NER has been successfully loaded from the file `{0}`...'.
              format(model_name))
        print('')
    else:
        temp_json_name = tempfile.NamedTemporaryFile(mode='w').name
        try:
            factrueval2016_to_json(factrueval2016_devset_dir, temp_json_name,
                                   split_by_paragraphs)
            X, y = load_dataset_from_json(temp_json_name)
        finally:
            if os.path.isfile(temp_json_name):
                os.remove(temp_json_name)
        print('The FactRuEval-2016 data for training have been loaded...')
        print('Number of samples is {0}.'.format(len(y)))
        print('')
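        # Tokenize every training text with a Russian UDPipe pipeline to find the
        # longest sample, then round that length up to the next power of two and
        # use it as the ELMo max_seq_length.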
        max_number_of_tokens = 0
        pipeline = create_udpipe_pipeline('ru')
        for cur in X:
            spacy_doc = pipeline(cur)
            n_tokens = 0
            for _ in spacy_doc:
                n_tokens += 1
            del spacy_doc
            if n_tokens > max_number_of_tokens:
                max_number_of_tokens = n_tokens
        del pipeline
        print('Maximal number of tokens is {0}.'.format(max_number_of_tokens))
        n_tokens = 2
        while n_tokens < max_number_of_tokens:
            n_tokens *= 2
        elmo_hub_module_handle = 'http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-news_wmt11-16_1.5M_steps.tar.gz'
        recognizer = ELMo_NER(finetune_elmo=elmo_will_be_tuned,
                              batch_size=batch_size,
                              l2_reg=l2,
                              max_seq_length=n_tokens,
                              elmo_hub_module_handle=elmo_hub_module_handle,
                              validation_fraction=0.25,
                              max_epochs=max_epochs,
                              patience=patience,
                              gpu_memory_frac=gpu_memory_frac,
                              verbose=True,
                              random_seed=42,
                              lr=lr,
                              udpipe_lang='ru',
                              use_nlp_features=use_lang_features,
                              use_shapes=use_shapes)
        if collection3_dir is None:
            if n_max_samples > 0:
                train_index, test_index = split_dataset(
                    y=y, test_part=recognizer.validation_fraction)
                X_train = np.array(X, dtype=object)[train_index]
                y_train = np.array(y, dtype=object)[train_index]
                X_val = np.array(X, dtype=object)[test_index]
                y_val = np.array(y, dtype=object)[test_index]
                del train_index, test_index
                index = sample_from_dataset(y=y_train, n=n_max_samples)
                recognizer.fit(X_train[index],
                               y_train[index],
                               validation_data=(X_val, y_val))
            else:
                recognizer.fit(X, y)
        else:
            X_train, y_train = load_dataset_from_brat(collection3_dir,
                                                      split_by_paragraphs=True)
            if not split_by_paragraphs:
                X_train, y_train = divide_dataset_by_sentences(
                    X_train, y_train, sent_tokenize_func=ru_sent_tokenize)
            for sample_idx in range(len(y_train)):
                new_y_sample = dict()
                for ne_type in sorted(list(y_train[sample_idx].keys())):
                    if ne_type == 'PER':
                        new_y_sample['PERSON'] = y_train[sample_idx][ne_type]
                    elif ne_type == 'LOC':
                        new_y_sample['LOCATION'] = y_train[sample_idx][ne_type]
                    else:
                        new_y_sample[ne_type] = y_train[sample_idx][ne_type]
                y_train[sample_idx] = new_y_sample
                del new_y_sample
            print('The Collection3 data for training have been loaded...')
            print('Number of samples is {0}.'.format(len(y_train)))
            print('')
            if n_max_samples > 0:
                index = sample_from_dataset(y=y_train, n=n_max_samples)
                X_train = np.array(X_train, dtype=object)[index]
                y_train = np.array(y_train, dtype=object)[index]
                del index
            recognizer.fit(X_train, y_train, validation_data=(X, y))
        with open(model_name, 'wb') as fp:
            pickle.dump(recognizer, fp)
        print('')
        print(
            'The NER has been successfully fitted and saved into the file `{0}`...'
            .format(model_name))
        print('')
    return recognizer