Example #1
import os
import pickle
import tempfile

from nltk.tokenize.nist import NISTTokenizer

# ELMo_NER, factrueval2016_to_json and load_dataset are project-specific
# helpers that are assumed to be importable from the surrounding package.


def train(factrueval2016_devset_dir: str, split_by_paragraphs: bool,
          elmo_will_be_tuned: bool, max_epochs: int, batch_size: int,
          lr: float, gpu_memory_frac: float, model_name: str) -> ELMo_NER:
    if os.path.isfile(model_name):
        with open(model_name, 'rb') as fp:
            recognizer = pickle.load(fp)
        assert isinstance(recognizer, ELMo_NER)
        print('The NER has been successfully loaded from the file `{0}`...'.
              format(model_name))
        print('')
    else:
        # Take a unique temporary path; only the generated name is needed, so
        # the NamedTemporaryFile object itself is discarded immediately.
        temp_json_name = tempfile.NamedTemporaryFile(mode='w').name
        try:
            factrueval2016_to_json(factrueval2016_devset_dir, temp_json_name,
                                   split_by_paragraphs)
            X, y = load_dataset(temp_json_name)
        finally:
            if os.path.isfile(temp_json_name):
                os.remove(temp_json_name)
        print('Data for training have been loaded...')
        print('Number of samples is {0}.'.format(len(y)))
        print('')
        max_number_of_tokens = 0
        tokenizer = NISTTokenizer()
        for cur in X:
            n_tokens = len(tokenizer.international_tokenize(cur))
            if n_tokens > max_number_of_tokens:
                max_number_of_tokens = n_tokens
        del tokenizer
        print('Maximal number of tokens is {0}.'.format(max_number_of_tokens))
        # Round the longest text length up to the nearest power of two and
        # use it as the fixed sequence length of the recognizer.
        n_tokens = 2
        while n_tokens < max_number_of_tokens:
            n_tokens *= 2
        elmo_hub_module_handle = 'http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-news_wmt11-16_1.5M_steps.tar.gz'
        recognizer = ELMo_NER(finetune_elmo=elmo_will_be_tuned,
                              batch_size=batch_size,
                              l2_reg=1e-3,
                              max_seq_length=n_tokens,
                              elmo_hub_module_handle=elmo_hub_module_handle,
                              validation_fraction=0.25,
                              max_epochs=max_epochs,
                              patience=5,
                              gpu_memory_frac=gpu_memory_frac,
                              verbose=True,
                              random_seed=42,
                              lr=lr)
        recognizer.fit(X, y)
        with open(model_name, 'wb') as fp:
            pickle.dump(recognizer, fp)
        print('')
        print(
            'The NER has been successfully fitted and saved into the file `{0}`...'
            .format(model_name))
        print('')
    return recognizer
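A minimal usage sketch (not part of the original example): the dataset directory, model file name and hyperparameter values below are hypothetical placeholders.
recognizer = train(
    factrueval2016_devset_dir='data/factRuEval-2016/devset',  # hypothetical
    split_by_paragraphs=True,
    elmo_will_be_tuned=False,
    max_epochs=100,
    batch_size=16,
    lr=1e-3,
    gpu_memory_frac=0.9,
    model_name='elmo_ner.pkl'  # hypothetical
)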
Example #2
import codecs
import copy
import csv
from typing import List, Set, Tuple

from nltk.tokenize.nist import NISTTokenizer


def load_testset_for_toxic_comments_2017(texts_file_name: str, labels_file_name: str) -> \
        Tuple[List[tuple], List[Set[int]], List[str]]:
    tokenizer = NISTTokenizer()
    header = None
    line_idx = 1
    all_texts = []
    with codecs.open(texts_file_name,
                     mode='r',
                     encoding='utf-8',
                     errors='ignore') as fp:
        reader = csv.reader(fp, quotechar='"', delimiter=',')
        for row in reader:
            if len(row) > 0:
                if header is None:
                    if len(row) != 2:
                        raise ValueError(
                            'File `{0}`, line {1}: header must consist of 2 columns.'
                            .format(texts_file_name, line_idx))
                    if row[0] != 'id':
                        raise ValueError(
                            'File `{0}`, line {1}: first column of table must be an identifier, '
                            'i.e. `id`.'.format(texts_file_name, line_idx))
                    if row[1] != 'comment_text':
                        raise ValueError(
                            'File `{0}`, line {1}: second column of table must be a source text, '
                            'i.e. `comment_text`.'.format(
                                texts_file_name, line_idx))
                    header = copy.copy(row)
                else:
                    if len(row) != len(header):
                        raise ValueError(
                            'File `{0}`, line {1}: this line does not correspond to the header.'
                            .format(texts_file_name, line_idx))
                    new_text = ' '.join(
                        list(
                            filter(
                                lambda it2: len(it2) > 0,
                                map(lambda it1: it1.strip(), row[1].split()))))
                    all_texts.append(
                        tuple(
                            filter(
                                lambda it2: len(it2) > 0,
                                map(
                                    lambda it1: it1.strip().lower(),
                                    tokenizer.international_tokenize(
                                        text=new_text)))))
            line_idx += 1
    # Reset the parsing state before reading the labels file.
    header = None
    line_idx = 1
    all_labels = []
    classes_list = []
    indices = []
    with codecs.open(labels_file_name,
                     mode='r',
                     encoding='utf-8',
                     errors='ignore') as fp:
        reader = csv.reader(fp, quotechar='"', delimiter=',')
        for row in reader:
            if len(row) > 0:
                if header is None:
                    if len(row) < 2:
                        raise ValueError(
                            'File `{0}`, line {1}: header must consist of at '
                            'least 2 columns.'.format(labels_file_name, line_idx))
                    if row[0] != 'id':
                        raise ValueError(
                            'File `{0}`, line {1}: first column of table must be an identifier, '
                            'i.e. `id`.'.format(labels_file_name, line_idx))
                    header = copy.copy(row)
                    classes_list = copy.copy(row[1:])
                    if len(classes_list) != len(set(classes_list)):
                        raise ValueError(
                            'File `{0}`, line {1}: names of classes are duplicated.'
                            .format(labels_file_name, line_idx))
                else:
                    if len(row) != len(header):
                        raise ValueError(
                            'File `{0}`, line {1}: this line does not correspond to the header.'
                            .format(labels_file_name, line_idx))
                    if not all(map(lambda it: it in {'0', '1', '-1'},
                                   row[1:])):
                        raise ValueError(
                            'File `{0}`, line {1}: all labels must be 0, 1 or -1.'
                            .format(labels_file_name, line_idx))
                    new_label = set(
                        filter(lambda idx: row[1 + idx] == '1',
                               range(len(classes_list))))
                    # Rows whose labels are all -1 were left unscored in the
                    # test set, so only scored rows are added to `indices`.
                    if len(new_label) == 0:
                        if '-1' not in row[1:]:
                            indices.append(len(all_labels))
                    else:
                        indices.append(len(all_labels))
                    all_labels.append(new_label)
            line_idx += 1
    if len(all_texts) != len(all_labels):
        raise ValueError(
            'Number of texts is not equal to number of labels! {0} != {1}'.
            format(len(all_texts), len(all_labels)))
    print('Size of source dataset for final testing is {0}.'.format(
        len(all_texts)))
    print(
        'Size of dataset for final testing after its filtering is {0}.'.format(
            len(indices)))
    return ([all_texts[idx] for idx in indices],
            [all_labels[idx] for idx in indices], classes_list)
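A minimal usage sketch, assuming the two CSV files follow the Kaggle Toxic Comment Classification Challenge layout; both file names are hypothetical.
texts, labels, classes_list = load_testset_for_toxic_comments_2017(
    'test.csv', 'test_labels.csv')  # hypothetical file names
print('Classes: {0}'.format(classes_list))
print('First tokenized comment: {0}'.format(texts[0][:10]))
print('Indices of its positive classes: {0}'.format(labels[0]))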
Example #3
import os
import pickle
import tempfile
from typing import Union

from nltk.tokenize.nist import NISTTokenizer

# ELMo_NER, factrueval2016_to_json, load_dataset_from_json,
# load_dataset_from_brat and divide_dataset_by_sentences are
# project-specific helpers that are assumed to be importable
# from the surrounding package.


def train(factrueval2016_devset_dir: str,
          split_by_paragraphs: bool,
          elmo_will_be_tuned: bool,
          max_epochs: int,
          batch_size: int,
          lr: float,
          l2: float,
          gpu_memory_frac: float,
          model_name: str,
          collection3_dir: Union[str, None] = None) -> ELMo_NER:
    if os.path.isfile(model_name):
        with open(model_name, 'rb') as fp:
            recognizer = pickle.load(fp)
        assert isinstance(recognizer, ELMo_NER)
        print('The NER has been successfully loaded from the file `{0}`...'.
              format(model_name))
        print('')
    else:
        # Take a unique temporary path; only the generated name is needed, so
        # the NamedTemporaryFile object itself is discarded immediately.
        temp_json_name = tempfile.NamedTemporaryFile(mode='w').name
        try:
            factrueval2016_to_json(factrueval2016_devset_dir, temp_json_name,
                                   split_by_paragraphs)
            X, y = load_dataset_from_json(temp_json_name)
        finally:
            if os.path.isfile(temp_json_name):
                os.remove(temp_json_name)
        print('The FactRuEval-2016 data for training have been loaded...')
        print('Number of samples is {0}.'.format(len(y)))
        print('')
        max_number_of_tokens = 0
        tokenizer = NISTTokenizer()
        for cur in X:
            n_tokens = len(tokenizer.international_tokenize(cur))
            if n_tokens > max_number_of_tokens:
                max_number_of_tokens = n_tokens
        del tokenizer
        print('Maximal number of tokens is {0}.'.format(max_number_of_tokens))
        # Round the longest text length up to the nearest power of two and
        # use it as the fixed sequence length of the recognizer.
        n_tokens = 2
        while n_tokens < max_number_of_tokens:
            n_tokens *= 2
        elmo_hub_module_handle = 'http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-news_wmt11-16_1.5M_steps.tar.gz'
        recognizer = ELMo_NER(finetune_elmo=elmo_will_be_tuned,
                              batch_size=batch_size,
                              l2_reg=l2,
                              max_seq_length=n_tokens,
                              elmo_hub_module_handle=elmo_hub_module_handle,
                              validation_fraction=0.25,
                              max_epochs=max_epochs,
                              patience=5,
                              gpu_memory_frac=gpu_memory_frac,
                              verbose=True,
                              random_seed=42,
                              lr=lr)
        if collection3_dir is None:
            recognizer.fit(X, y)
        else:
            X_train, y_train = load_dataset_from_brat(collection3_dir,
                                                      split_by_paragraphs=True)
            if not split_by_paragraphs:
                X_train, y_train = divide_dataset_by_sentences(
                    X_train, y_train)
            # Collection3 tags entities as PER and LOC; rename them to the
            # PERSON and LOCATION labels used in the FactRuEval-2016 data.
            for sample_idx in range(len(y_train)):
                new_y_sample = dict()
                for ne_type in sorted(list(y_train[sample_idx].keys())):
                    if ne_type == 'PER':
                        new_y_sample['PERSON'] = y_train[sample_idx][ne_type]
                    elif ne_type == 'LOC':
                        new_y_sample['LOCATION'] = y_train[sample_idx][ne_type]
                    else:
                        new_y_sample[ne_type] = y_train[sample_idx][ne_type]
                y_train[sample_idx] = new_y_sample
                del new_y_sample
            print('The Collection3 data for training have been loaded...')
            print('Number of samples is {0}.'.format(len(y_train)))
            print('')
            recognizer.fit(X_train, y_train, validation_data=(X, y))
        with open(model_name, 'wb') as fp:
            pickle.dump(recognizer, fp)
        print('')
        print(
            'The NER has been successfully fitted and saved into the file `{0}`...'
            .format(model_name))
        print('')
    return recognizer
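A minimal sketch of the transfer-learning path (not part of the original example): when a Collection3 directory is supplied, the recognizer is fitted on Collection3 and validated on the FactRuEval-2016 devset. All paths and hyperparameter values are hypothetical placeholders.
recognizer = train(
    factrueval2016_devset_dir='data/factRuEval-2016/devset',  # hypothetical
    split_by_paragraphs=False,
    elmo_will_be_tuned=True,
    max_epochs=100,
    batch_size=16,
    lr=1e-3,
    l2=1e-3,
    gpu_memory_frac=0.9,
    model_name='elmo_ner_collection3.pkl',  # hypothetical
    collection3_dir='data/Collection3'  # hypothetical
)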
Example #4
import codecs
import copy
import csv
from typing import List, Set, Tuple

from nltk.tokenize.nist import NISTTokenizer


def load_trainset_for_toxic_comments_2017(file_name: str) -> \
        Tuple[List[tuple], List[Set[int]], List[str]]:
    tokenizer = NISTTokenizer()
    header = None
    line_idx = 1
    all_texts = []
    all_labels = []
    classes_list = []
    with codecs.open(file_name, mode='r', encoding='utf-8',
                     errors='ignore') as fp:
        reader = csv.reader(fp, quotechar='"', delimiter=',')
        for row in reader:
            if len(row) > 0:
                if header is None:
                    if len(row) < 3:
                        raise ValueError(
                            'File `{0}`, line {1}: header must consist of at '
                            'least 3 columns.'.format(file_name, line_idx))
                    if row[0] != 'id':
                        raise ValueError(
                            'File `{0}`, line {1}: first column of table must be an identifier, '
                            'i.e. `id`.'.format(file_name, line_idx))
                    if row[1] != 'comment_text':
                        raise ValueError(
                            'File `{0}`, line {1}: second column of table must be a source text, '
                            'i.e. `comment_text`.'.format(file_name, line_idx))
                    header = copy.copy(row)
                    classes_list = copy.copy(row[2:])
                    if len(classes_list) != len(set(classes_list)):
                        raise ValueError(
                            'File `{0}`, line {1}: names of classes are duplicated.'
                            .format(file_name, line_idx))
                else:
                    if len(row) != len(header):
                        raise ValueError(
                            'File `{0}`, line {1}: this line does not correspond to the header.'
                            .format(file_name, line_idx))
                    new_text = ' '.join(
                        list(
                            filter(
                                lambda it2: len(it2) > 0,
                                map(lambda it1: it1.strip(), row[1].split()))))
                    all_texts.append(
                        tuple(
                            filter(
                                lambda it2: len(it2) > 0,
                                map(
                                    lambda it1: it1.strip().lower(),
                                    tokenizer.international_tokenize(
                                        text=new_text)))))
                    if not all(map(lambda it: it in {'0', '1'}, row[2:])):
                        raise ValueError(
                            'File `{0}`, line {1}: all labels must be 0 or 1.'.
                            format(file_name, line_idx))
                    new_label = set(
                        filter(lambda idx: row[2 + idx] == '1',
                               range(len(classes_list))))
                    all_labels.append(new_label)
            line_idx += 1
    print('Size of the training set is {0}.'.format(len(all_texts)))
    return all_texts, all_labels, classes_list
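A minimal usage sketch, assuming the CSV file follows the Kaggle Toxic Comment Classification Challenge layout (id, comment_text, then one column per class); the file name is hypothetical.
texts, labels, classes_list = load_trainset_for_toxic_comments_2017(
    'train.csv')  # hypothetical file name
print('Classes: {0}'.format(classes_list))
print('First tokenized comment: {0}'.format(texts[0][:10]))
print('Its positive class indices: {0}'.format(labels[0]))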