Esempio n. 1
0
def load_token_embeddings(sess, W, dataset, parameters):
    """Overwrite rows of the embedding variable W with pretrained vectors.

    The pretrained file named by
    ``parameters['token_pretrained_embedding_filepath']`` is read as UTF-8
    text with one entry per line: the token followed by its space-separated
    vector components. For every token in ``dataset.token_to_index`` the
    vector is looked up by, in order, the token as-is, its lowercase form,
    and its lowercase form with every digit replaced by ``0``; the first hit
    replaces the matching row of W. Rows of tokens without any hit keep the
    value W currently holds.

    Args:
        sess: session object; ``sess.run`` is used to read and assign W.
        W: embedding variable exposing ``read_value()`` and ``assign()``.
        dataset: object exposing ``token_to_index`` and ``index_to_token``.
        parameters: dict holding 'token_pretrained_embedding_filepath'.

    See https://github.com/dennybritz/cnn-text-classification-tf/issues/17
    """
    print('Load embeddings')
    token_to_vector = {}
    # The context manager closes the file even when reading raises.
    with codecs.open(parameters['token_pretrained_embedding_filepath'],
                     'r', 'UTF-8') as file_input:
        for cur_line in file_input:
            cur_line = cur_line.strip()
            # Skip blank lines before splitting: ''.split(' ') yields [''],
            # which would otherwise register an empty token with an empty
            # vector.
            if not cur_line:
                continue
            fields = cur_line.split(' ')
            # Components are kept as strings, exactly as read from the file.
            token_to_vector[fields[0]] = fields[1:]

    # Start from the current value of W so unmatched rows are left untouched.
    initial_weights = sess.run(W.read_value())

    number_of_loaded_word_vectors = 0
    number_of_token_original_case_found = 0
    number_of_token_lowercase_found = 0
    number_of_token_lowercase_normalized_found = 0
    for token, token_index in dataset.token_to_index.items():
        lowercase_token = token.lower()
        if token in token_to_vector:
            initial_weights[token_index] = token_to_vector[token]
            number_of_token_original_case_found += 1
        elif lowercase_token in token_to_vector:
            initial_weights[token_index] = token_to_vector[lowercase_token]
            number_of_token_lowercase_found += 1
        else:
            normalized_token = re.sub(r'\d', '0', lowercase_token)
            if normalized_token not in token_to_vector:
                continue
            initial_weights[token_index] = token_to_vector[normalized_token]
            number_of_token_lowercase_normalized_found += 1
        number_of_loaded_word_vectors += 1

    print("number_of_token_original_case_found: {0}".format(number_of_token_original_case_found))
    print("number_of_token_lowercase_found: {0}".format(number_of_token_lowercase_found))
    print("number_of_token_lowercase_normalized_found: {0}".format(number_of_token_lowercase_normalized_found))
    print('number_of_loaded_word_vectors: {0}'.format(number_of_loaded_word_vectors))
    print("len(dataset.token_to_index): {0}".format(len(dataset.token_to_index)))
    print("len(dataset.index_to_token): {0}".format(len(dataset.index_to_token)))
    sess.run(W.assign(initial_weights))
    print('Load embeddings completed')
Esempio n. 2
0
def printtoken(type, token, srow_scol, erow_ecol, line):
    """Append the lowercased text of STRING, NAME and NUMBER tokens to a file.

    The output file is the one named by the module-level ``filename``; it is
    opened in append mode for every call. Tokens of any other kind are not
    written. The two position arguments are unpacked as (row, column) pairs
    but are otherwise unused, as is ``line``.
    """
    _start_row, _start_col = srow_scol
    _end_row, _end_col = erow_ecol

    written_kinds = ("STRING", "NAME", "NUMBER")
    with open(filename, "a+") as outfile:
        kind = tok_name[type]
        if kind in written_kinds:
            outfile.write(token.lower() + '\n')
Esempio n. 3
0
    def load_dataset(self,
                     dataset_filepaths,
                     parameters,
                     token_to_vector=None):
        '''
        Parse every dataset split and build the token, character and label
        vocabularies together with their index mappings, storing the results
        as attributes of this object.

        dataset_filepaths : dictionary with keys 'train', 'valid', 'test', 'deploy'
            (a missing split is handed to _parse_dataset as None)
        parameters : dictionary of settings; the keys read here are
            'token_pretrained_embedding_filepath', 'use_pretrained_model',
            'pretrained_model_folder', 'load_all_pretrained_token_embeddings',
            'remap_unknown_tokens_to_unk',
            'load_only_pretrained_token_embeddings' and 'tagging_format'
        token_to_vector : optional pretrained embeddings that were already
            loaded; when None they are loaded through utils_nlp, and when no
            embedding filepath is configured an empty dictionary is used

        Returns the token_to_vector dictionary that was used.
        Raises AssertionError when a pretrained model is used and the data
        contains a label that is unknown to the pretraining dataset.
        '''
        start_time = time.time()
        print('Load dataset... ', end='', flush=True)
        if parameters['token_pretrained_embedding_filepath'] != '':
            if token_to_vector is None:
                token_to_vector = utils_nlp.load_pretrained_token_embeddings(
                    parameters)
        else:
            token_to_vector = {}
        if self.verbose:
            print("len(token_to_vector): {0}".format(len(token_to_vector)))

        # Load pretraining dataset to ensure that index to label is compatible to the pretrained model,
        #   and that token embeddings that are learned in the pretrained model are loaded properly.
        all_tokens_in_pretraining_dataset = []
        all_characters_in_pretraining_dataset = []
        if parameters['use_pretrained_model']:
            pretraining_dataset_filepath = os.path.join(
                parameters['pretrained_model_folder'], 'dataset.pickle')
            # NOTE(security): unpickling executes arbitrary code, so the
            # pretrained model folder must come from a trusted source.
            try:
                with open(pretraining_dataset_filepath, 'rb') as pickle_file:
                    pretraining_dataset = pickle.load(pickle_file)
            except Exception:
                # Fall back to the project's alternative loader when the
                # plain unpickling fails.
                with open(pretraining_dataset_filepath, 'rb') as pickle_file:
                    pretraining_dataset = utils.renamed_load(pickle_file)
            all_tokens_in_pretraining_dataset = pretraining_dataset.index_to_token.values()
            all_characters_in_pretraining_dataset = pretraining_dataset.index_to_character.values()
        # A set makes the per-token membership test below O(1) instead of a
        # linear scan over the dictionary values.
        pretraining_token_set = set(all_tokens_in_pretraining_dataset)

        remap_to_unk_count_threshold = 1
        self.UNK_TOKEN_INDEX = 0
        self.PADDING_CHARACTER_INDEX = 0
        self.tokens_mapped_to_unk = []
        self.UNK = 'UNK'
        self.unique_labels = []
        labels = {}
        tokens = {}
        label_count = {}
        token_count = {}
        character_count = {}
        for dataset_type in ['train', 'valid', 'test', 'deploy']:
            labels[dataset_type], tokens[dataset_type], token_count[dataset_type], label_count[dataset_type], character_count[dataset_type] \
                = self._parse_dataset(dataset_filepaths.get(dataset_type, None))

            if self.verbose:
                print("dataset_type: {0}".format(dataset_type))
                print("len(token_count[dataset_type]): {0}".format(
                    len(token_count[dataset_type])))

        # NOTE(review): the aggregations below index every split with keys
        # coming from the other splits, which assumes the per-split counters
        # tolerate missing keys (e.g. defaultdict) - confirm against
        # _parse_dataset.
        token_count['all'] = {}
        for token in list(token_count['train'].keys()) + list(
                token_count['valid'].keys()) + list(
                    token_count['test'].keys()) + list(
                        token_count['deploy'].keys()):
            token_count['all'][token] = token_count['train'][
                token] + token_count['valid'][token] + token_count['test'][
                    token] + token_count['deploy'][token]

        # Tokens known only from pretrained resources get the count -1.
        if parameters['load_all_pretrained_token_embeddings']:
            for token in token_to_vector:
                if token not in token_count['all']:
                    token_count['all'][token] = -1
                    token_count['train'][token] = -1
            for token in all_tokens_in_pretraining_dataset:
                if token not in token_count['all']:
                    token_count['all'][token] = -1
                    token_count['train'][token] = -1

        character_count['all'] = {}
        for character in list(character_count['train'].keys()) + list(
                character_count['valid'].keys()) + list(
                    character_count['test'].keys()) + list(
                        character_count['deploy'].keys()):
            character_count['all'][character] = character_count['train'][
                character] + character_count['valid'][
                    character] + character_count['test'][
                        character] + character_count['deploy'][character]

        for character in all_characters_in_pretraining_dataset:
            if character not in character_count['all']:
                character_count['all'][character] = -1
                character_count['train'][character] = -1

        if self.verbose:
            for dataset_type in dataset_filepaths.keys():
                print("dataset_type: {0}".format(dataset_type))
                print("len(token_count[dataset_type]): {0}".format(
                    len(token_count[dataset_type])))

        label_count['all'] = {}
        for label in list(label_count['train'].keys()) + list(
                label_count['valid'].keys()) + list(
                    label_count['test'].keys()) + list(
                        label_count['deploy'].keys()):
            label_count['all'][label] = label_count['train'][
                label] + label_count['valid'][label] + label_count[
                    'test'][label] + label_count['deploy'][label]

        token_count['all'] = utils.order_dictionary(token_count['all'],
                                                    'value_key',
                                                    reverse=True)
        label_count['all'] = utils.order_dictionary(label_count['all'],
                                                    'key',
                                                    reverse=False)
        character_count['all'] = utils.order_dictionary(character_count['all'],
                                                        'value',
                                                        reverse=True)
        if self.verbose:
            print("character_count['all']: {0}".format(
                character_count['all']))

        token_to_index = {}
        token_to_index[self.UNK] = self.UNK_TOKEN_INDEX
        iteration_number = 0
        number_of_unknown_tokens = 0
        if self.verbose:
            print("parameters['remap_unknown_tokens_to_unk']: {0}".format(
                parameters['remap_unknown_tokens_to_unk']))
            print("len(token_count['train'].keys()): {0}".format(
                len(token_count['train'].keys())))
        for token, _count in token_count['all'].items():
            # Never hand out the index reserved for the UNK token.
            if iteration_number == self.UNK_TOKEN_INDEX:
                iteration_number += 1

            # A token is remapped to UNK when remapping is enabled, it is
            # either absent from the training counts or only pretrained
            # embeddings are to be loaded, and neither the pretrained
            # embeddings nor the pretraining dataset know it.
            if parameters['remap_unknown_tokens_to_unk'] == 1 and \
                (token_count['train'][token] == 0 or \
                parameters['load_only_pretrained_token_embeddings']) and \
                not utils_nlp.is_token_in_pretrained_embeddings(token, token_to_vector, parameters) and \
                token not in pretraining_token_set:
                if self.verbose:
                    print("token: {0}".format(token))
                    print("token.lower(): {0}".format(token.lower()))
                    print("re.sub('\\d', '0', token.lower()): {0}".format(
                        re.sub(r'\d', '0', token.lower())))
                token_to_index[token] = self.UNK_TOKEN_INDEX
                number_of_unknown_tokens += 1
                self.tokens_mapped_to_unk.append(token)
            else:
                token_to_index[token] = iteration_number
                iteration_number += 1
        if self.verbose:
            print("number_of_unknown_tokens: {0}".format(
                number_of_unknown_tokens))

        # Indices of tokens seen in the training split at most
        # remap_to_unk_count_threshold times (and at least once).
        infrequent_token_indices = []
        for token, count in token_count['train'].items():
            if 0 < count <= remap_to_unk_count_threshold:
                infrequent_token_indices.append(token_to_index[token])
        if self.verbose:
            print("len(token_count['train']): {0}".format(
                len(token_count['train'])))
            print("len(infrequent_token_indices): {0}".format(
                len(infrequent_token_indices)))

        # Ensure that both B- and I- versions exist for each label
        # (plus E- and S- for the 'bioes' tagging format).
        labels_without_bio = set()
        for label in label_count['all'].keys():
            new_label = utils_nlp.remove_bio_from_label_name(label)
            labels_without_bio.add(new_label)
        for label in labels_without_bio:
            if label == 'O':
                continue
            if parameters['tagging_format'] == 'bioes':
                prefixes = ['B-', 'I-', 'E-', 'S-']
            else:
                prefixes = ['B-', 'I-']
            for prefix in prefixes:
                prefixed_label = prefix + label
                if prefixed_label not in label_count['all']:
                    label_count['all'][prefixed_label] = 0
        label_count['all'] = utils.order_dictionary(label_count['all'],
                                                    'key',
                                                    reverse=False)

        if parameters['use_pretrained_model']:
            self.unique_labels = sorted(
                list(pretraining_dataset.label_to_index.keys()))
            # Make sure labels are compatible with the pretraining dataset.
            for label in label_count['all']:
                if label not in pretraining_dataset.label_to_index:
                    raise AssertionError(
                        "The label {0} does not exist in the pretraining dataset. "
                        .format(label) +
                        "Please ensure that only the following labels exist in the dataset: {0}"
                        .format(', '.join(self.unique_labels)))
            label_to_index = pretraining_dataset.label_to_index.copy()
        else:
            label_to_index = {}
            iteration_number = 0
            for label, _count in label_count['all'].items():
                label_to_index[label] = iteration_number
                iteration_number += 1
                self.unique_labels.append(label)

        if self.verbose:
            print('self.unique_labels: {0}'.format(self.unique_labels))

        character_to_index = {}
        iteration_number = 0
        for character, _count in character_count['all'].items():
            # Never hand out the index reserved for character padding.
            if iteration_number == self.PADDING_CHARACTER_INDEX:
                iteration_number += 1
            character_to_index[character] = iteration_number
            iteration_number += 1

        if self.verbose:
            print("token_count['train'][0:10]: {0}".format(
                list(token_count['train'].items())[0:10]))
        token_to_index = utils.order_dictionary(token_to_index,
                                                'value',
                                                reverse=False)
        if self.verbose:
            print('token_to_index: {0}'.format(token_to_index))
        index_to_token = utils.reverse_dictionary(token_to_index)
        if parameters['remap_unknown_tokens_to_unk'] == 1:
            # Several tokens may share the UNK index; make sure the reverse
            # mapping of that index is the UNK token itself.
            index_to_token[self.UNK_TOKEN_INDEX] = self.UNK
        if self.verbose:
            print('index_to_token: {0}'.format(index_to_token))
            print("label_count['train']: {0}".format(label_count['train']))
        label_to_index = utils.order_dictionary(label_to_index,
                                                'value',
                                                reverse=False)
        if self.verbose:
            print('label_to_index: {0}'.format(label_to_index))
        index_to_label = utils.reverse_dictionary(label_to_index)
        if self.verbose:
            print('index_to_label: {0}'.format(index_to_label))

        character_to_index = utils.order_dictionary(character_to_index,
                                                    'value',
                                                    reverse=False)
        index_to_character = utils.reverse_dictionary(character_to_index)
        if self.verbose:
            print('character_to_index: {0}'.format(character_to_index))
            print('index_to_character: {0}'.format(index_to_character))
            print("labels['train'][0:10]: {0}".format(labels['train'][0:10]))
            print("tokens['train'][0:10]: {0}".format(tokens['train'][0:10]))
            # Print sequences of length 1 in train set
            for token_sequence, label_sequence in zip(tokens['train'],
                                                      labels['train']):
                if len(label_sequence) == 1 and label_sequence[0] != 'O':
                    print("{0}\t{1}".format(token_sequence[0],
                                            label_sequence[0]))

        self.token_to_index = token_to_index
        self.index_to_token = index_to_token
        self.index_to_character = index_to_character
        self.character_to_index = character_to_index
        self.index_to_label = index_to_label
        self.label_to_index = label_to_index
        if self.verbose:
            print("len(self.token_to_index): {0}".format(
                len(self.token_to_index)))
            print("len(self.index_to_token): {0}".format(
                len(self.index_to_token)))
        self.tokens = tokens
        self.labels = labels

        # _convert_to_indices is called only after the mappings, tokens and
        # labels above have been stored on self.
        token_indices, label_indices, character_indices_padded, character_indices, token_lengths, characters, label_vector_indices = self._convert_to_indices(
            dataset_filepaths.keys())

        self.token_indices = token_indices
        self.label_indices = label_indices
        self.character_indices_padded = character_indices_padded
        self.character_indices = character_indices
        self.token_lengths = token_lengths
        self.characters = characters
        self.label_vector_indices = label_vector_indices

        self.number_of_classes = max(self.index_to_label.keys()) + 1
        self.vocabulary_size = max(self.index_to_token.keys()) + 1
        self.alphabet_size = max(self.index_to_character.keys()) + 1
        if self.verbose:
            print("self.number_of_classes: {0}".format(self.number_of_classes))
            print("self.alphabet_size: {0}".format(self.alphabet_size))
            print("self.vocabulary_size: {0}".format(self.vocabulary_size))

        # unique_labels_of_interest is used to compute F1-scores.
        self.unique_labels_of_interest = list(self.unique_labels)
        # Guarded so that a label set without 'O' does not raise ValueError.
        if 'O' in self.unique_labels_of_interest:
            self.unique_labels_of_interest.remove('O')

        self.unique_label_indices_of_interest = []
        for lab in self.unique_labels_of_interest:
            self.unique_label_indices_of_interest.append(label_to_index[lab])

        self.infrequent_token_indices = infrequent_token_indices

        if self.verbose:
            print('self.unique_labels_of_interest: {0}'.format(
                self.unique_labels_of_interest))
            print('self.unique_label_indices_of_interest: {0}'.format(
                self.unique_label_indices_of_interest))

        elapsed_time = time.time() - start_time
        print('done ({0:.2f} seconds)'.format(elapsed_time))

        return token_to_vector
Esempio n. 4
0
 def s_other(scanner, token):
     """Tag a token as OTHER, lowercased with surrounding whitespace removed."""
     cleaned = token.lower().strip()
     return (Scanner.OTHER, cleaned)
Esempio n. 5
0
 def s_suiteintro(scanner, token):
     """Tag a token as SUITEINTRO after lowercasing it and trimming commas, then whitespace."""
     lowered = token.lower()
     without_commas = lowered.strip(',')
     return (Scanner.SUITEINTRO, without_commas.strip())
Esempio n. 6
0
 def s_phrase(scanner, token):
     """Tag a comma-delimited word phrase as PHRASE.

     The text is lowercased, then outer commas and finally outer whitespace
     are removed.
     """
     lowered = token.lower()
     without_commas = lowered.strip(',')
     return (Scanner.PHRASE, without_commas.strip())
Esempio n. 7
0
 def s_word(scanner, token):
     """Tag a token as WORD, lowercased with leading and trailing periods removed."""
     lowered = token.lower()
     return (Scanner.WORD, lowered.strip('.'))
Esempio n. 8
0
    def _parse_dataset(self, dataset_filepath, parameters):
        token_count = collections.defaultdict(lambda: 0)
        label_count = collections.defaultdict(lambda: 0)
        character_count = collections.defaultdict(lambda: 0)
        if parameters['use_pos']:
            pos_tag_count = collections.defaultdict(lambda: 0)
        if parameters['use_gaz']:
            gaz_count = collections.defaultdict(lambda: 0)
            #self._parse_gaz(parameters['gaz_filepath'])
        if parameters['use_aff']:
            aff_count = collections.defaultdict(lambda: 0)

        line_count = -1
        tokens = []
        labels = []
        pos_tags = []
        new_token_sequence = []
        new_label_sequence = []
        if parameters['use_pos']:
            new_pos_tag_sequence = []
        if parameters['use_gaz']:
            new_gaz_sequence = []
            gazs = []
        if parameters['use_aff']:
            new_aff_sequence = []
            affs = []
        if dataset_filepath:
            f = codecs.open(dataset_filepath, 'r', 'UTF-8')
            for line in f:
                line_count += 1
                line = line.strip().split(' ')
                if len(line) == 0 or len(
                        line[0]) == 0 or '-DOCSTART-' in line[0]:
                    if len(new_token_sequence) > 0:
                        labels.append(new_label_sequence)
                        tokens.append(new_token_sequence)
                        if parameters['use_pos']:
                            pos_tags.append(new_pos_tag_sequence)
                        if parameters['use_gaz']:
                            gazs.append(new_gaz_sequence)
                        if parameters['use_aff']:
                            affs.append(new_aff_sequence)
                            new_aff_sequence = []
                        new_token_sequence = []
                        new_label_sequence = []
                        new_pos_tag_sequence = []
                        new_gaz_sequence = []
                    continue
                token = str(line[0])
                label = str(line[-1])
                # beware: in both cases we are assuming bioes
                if parameters['use_pos']:
                    '''
                    if parameters['tokenizer'] == 'pos':
                        pos_tag = str(line[-2])
                    else:
                        pos_tag = str(line[-3])
                    '''
                    if parameters['tokenizer'] == 'pos':
                        pos_tag = str(line[-3])
                    else:
                        pos_tag = str(line[-4])
                    #print(pos_tag)
                if parameters['use_gaz']:
                    gaz = token.lower() in self.gaz_set
                    if gaz:
                        gaz = 1
                    else:
                        gaz = 0
                if parameters['use_aff']:
                    aff = 0
                    # Check for prefix
                    for pref in self.aff_set['prefix']:
                        pattern = '^' + re.escape(pref.lower())
                        result = re.match(pattern, token.lower())
                        if result:
                            aff = 1

                    for suf in self.aff_set['suffix']:
                        pattern = re.escape(suf.lower()) + '$'
                        result = re.match(pattern, token.lower())
                        if result:
                            aff = 1

                    for rot in self.aff_set['root']:
                        result = token.lower().find(rot)
                        if result > 1:
                            aff = 1

                token_count[token] += 1
                label_count[label] += 1
                if parameters['use_pos']:
                    pos_tag_count[pos_tag] += 1

                if parameters['use_gaz']:
                    gaz_count[gaz] += 1

                if parameters['use_aff']:
                    aff_count[aff] += 1

                new_token_sequence.append(token)
                new_label_sequence.append(label)
                if parameters['use_pos']:
                    new_pos_tag_sequence.append(pos_tag)
                if parameters['use_gaz']:
                    new_gaz_sequence.append(gaz)
                if parameters['use_aff']:
                    new_aff_sequence.append(aff)

                for character in token:
                    character_count[character] += 1

                if self.debug and line_count > 200:
                    break  # for debugging purposes

            if len(new_token_sequence) > 0:
                labels.append(new_label_sequence)
                tokens.append(new_token_sequence)
                if parameters['use_pos']:
                    pos_tags.append(new_pos_tag_sequence)
                if parameters['use_gaz']:
                    gazs.append(new_gaz_sequence)
                if parameters['use_aff']:
                    affs.append(new_aff_sequence)
            f.close()
        if not parameters['use_pos']:
            pos_tags = None
            pos_tag_count = None
        if not parameters['use_gaz']:
            gazs = None
            gaz_count = None
        if not parameters['use_aff']:
            affs = None
            aff_count = None
        return labels, tokens, token_count, label_count, character_count, pos_tags, pos_tag_count, gazs, gaz_count, affs, aff_count
Esempio n. 9
0
    def load_dataset(self,
                     dataset_filepaths,
                     parameters,
                     token_to_vector=None):
        '''
        dataset_filepaths : dictionary with keys 'train', 'valid', 'test', 'deploy'
        Load word vectors từ file đã chuẩn bị sẵn
        '''
        start_time = time.time()
        print('Load dataset... ', end='', flush=True)
        if parameters['token_pretrained_embedding_filepath'] != '':
            if token_to_vector == None:
                token_to_vector = utils_nlp.load_pretrained_token_embeddings(
                    parameters)
        else:
            token_to_vector = {}
        if self.verbose:
            print("len(token_to_vector): {0}".format(len(token_to_vector)))

        # Load pretraining dataset to ensure that index to label is compatible to the pretrained model,
        #   and that token embeddings that are learned in the pretrained model are loaded properly.
        all_tokens_in_pretraining_dataset = []
        all_characters_in_pretraining_dataset = []
        if parameters['use_pretrained_model']:
            pretraining_dataset = pickle.load(
                open(
                    os.path.join(parameters['pretrained_model_folder'],
                                 'dataset.pickle'), 'rb'))
            all_tokens_in_pretraining_dataset = pretraining_dataset.index_to_token.values(
            )  # Những token lưu ở đợt train trước
            all_characters_in_pretraining_dataset = pretraining_dataset.index_to_character.values(
            )  # Những character lưu ở đợt train trước

        remap_to_unk_count_threshold = 1
        self.UNK_TOKEN_INDEX = 0  # Index của những unknow token
        self.PADDING_CHARACTER_INDEX = 0
        self.tokens_mapped_to_unk = []  # những unknown token
        self.UNK = 'UNK'
        self.unique_labels = []  # Các nhãn tồn tại trong dataset
        labels = {}  # nhãn {all: ...., train: ..., test: ...}
        tokens = {}  # token {all: ...., train: ..., test: ...}
        label_count = {}  # Đếm số nhãn {all: ...., train: ..., test: ...}
        token_count = {}  # Đếm số token {all: ...., train: ..., test: ...}
        character_count = {}  # Đếm số ký tự {all: ...., train: ..., test: ...}
        for dataset_type in ['train', 'valid', 'test', 'deploy']:
            labels[dataset_type], tokens[dataset_type], token_count[dataset_type], label_count[dataset_type], character_count[dataset_type] \
                = self._parse_dataset(dataset_filepaths.get(dataset_type, None))

            if self.verbose: print("dataset_type: {0}".format(dataset_type))
            if self.verbose:
                print("len(token_count[dataset_type]): {0}".format(
                    len(token_count[dataset_type])))

        # Tính tổng hợp lại cho tất cả các dataset
        token_count['all'] = {}
        for token in list(token_count['train'].keys()) + list(
                token_count['valid'].keys()) + list(
                    token_count['test'].keys()) + list(
                        token_count['deploy'].keys()):
            token_count['all'][token] = token_count['train'][
                token] + token_count['valid'][token] + token_count['test'][
                    token] + token_count['deploy'][token]

        # Thêm những token ở pretrained trước với giá trị -1
        if parameters['load_all_pretrained_token_embeddings']:
            for token in token_to_vector:
                if token not in token_count['all']:
                    token_count['all'][token] = -1
                    token_count['train'][token] = -1
            for token in all_tokens_in_pretraining_dataset:
                if token not in token_count['all']:
                    token_count['all'][token] = -1
                    token_count['train'][token] = -1

        # Tính tổng hợp lại cho tất cả các dataset
        character_count['all'] = {}
        for character in list(character_count['train'].keys()) + list(
                character_count['valid'].keys()) + list(
                    character_count['test'].keys()) + list(
                        character_count['deploy'].keys()):
            character_count['all'][character] = character_count['train'][
                character] + character_count['valid'][
                    character] + character_count['test'][
                        character] + character_count['deploy'][character]

        # Thêm những token ở pretrained trước với giá trị -1
        for character in all_characters_in_pretraining_dataset:
            if character not in character_count['all']:
                character_count['all'][character] = -1
                character_count['train'][character] = -1

        for dataset_type in dataset_filepaths.keys():
            if self.verbose: print("dataset_type: {0}".format(dataset_type))
            if self.verbose:
                print("len(token_count[dataset_type]): {0}".format(
                    len(token_count[dataset_type])))

        # Tính tổng hợp lại các nhãn ở đợt train trước
        label_count['all'] = {}
        for character in list(label_count['train'].keys()) + list(
                label_count['valid'].keys()) + list(
                    label_count['test'].keys()) + list(
                        label_count['deploy'].keys()):
            label_count['all'][character] = label_count['train'][
                character] + label_count['valid'][character] + label_count[
                    'test'][character] + label_count['deploy'][character]

        token_count['all'] = utils.order_dictionary(
            token_count['all'], 'value_key', reverse=True
        )  # Sort token count theo các token có freq cao đến thấp, token desc
        label_count['all'] = utils.order_dictionary(
            label_count['all'], 'key',
            reverse=False)  # Sort label count theo label asc
        character_count['all'] = utils.order_dictionary(
            character_count['all'], 'value', reverse=True
        )  # Sort character count theo các character có freq cao đến thấp
        if self.verbose:
            print('character_count[\'all\']: {0}'.format(
                character_count['all']))

        token_to_index = {}
        token_to_index[self.UNK] = self.UNK_TOKEN_INDEX
        iteration_number = 0
        number_of_unknown_tokens = 0
        if self.verbose:
            print("parameters['remap_unknown_tokens_to_unk']: {0}".format(
                parameters['remap_unknown_tokens_to_unk']))
        if self.verbose:
            print("len(token_count['train'].keys()): {0}".format(
                len(token_count['train'].keys())))
        for token, count in token_count['all'].items():
            if iteration_number == self.UNK_TOKEN_INDEX: iteration_number += 1
            '''
            UNK_TOKEN: token không xuất hiện trong pretraining_dataset và trong word vectors
            '''
            if parameters['remap_unknown_tokens_to_unk'] == 1 and \
                (token_count['train'][token] == 0 or \
                parameters['load_only_pretrained_token_embeddings']) and \
                not utils_nlp.is_token_in_pretrained_embeddings(token, token_to_vector, parameters) and \
                token not in all_tokens_in_pretraining_dataset:
                if self.verbose: print("token: {0}".format(token))
                if self.verbose:
                    print("token.lower(): {0}".format(token.lower()))
                if self.verbose:
                    print("re.sub('\d', '0', token.lower()): {0}".format(
                        re.sub('\d', '0', token.lower())))
                token_to_index[token] = self.UNK_TOKEN_INDEX
                number_of_unknown_tokens += 1
                self.tokens_mapped_to_unk.append(token)
            else:
                token_to_index[token] = iteration_number
                iteration_number += 1
        if self.verbose:
            print("number_of_unknown_tokens: {0}".format(
                number_of_unknown_tokens))

        infrequent_token_indices = [
        ]  # Các token xuất hiện thấp trong train dataset
        for token, count in token_count['train'].items():
            if 0 < count <= remap_to_unk_count_threshold:
                infrequent_token_indices.append(token_to_index[token])
        if self.verbose:
            print("len(token_count['train']): {0}".format(
                len(token_count['train'])))
        if self.verbose:
            print("len(infrequent_token_indices): {0}".format(
                len(infrequent_token_indices)))

        # Ensure that both B- and I- versions exist for each label
        # Bỏ các tiền tố B-, O-, I-...
        labels_without_bio = set()
        for label in label_count['all'].keys():
            new_label = utils_nlp.remove_bio_from_label_name(label)
            labels_without_bio.add(new_label)

        # Kết hợp các ENTITY vs các tiền tố B-, I-,... và thêm vào label count
        for label in labels_without_bio:
            if label == 'O':
                continue
            if parameters['tagging_format'] == 'bioes':
                prefixes = ['B-', 'I-', 'E-', 'S-']
            else:
                prefixes = ['B-', 'I-']
            for prefix in prefixes:
                l = prefix + label
                if l not in label_count['all']:
                    label_count['all'][l] = 0
        # Sắp xếp label_count theo label asc
        label_count['all'] = utils.order_dictionary(label_count['all'],
                                                    'key',
                                                    reverse=False)

        if parameters['use_pretrained_model']:
            self.unique_labels = sorted(
                list(pretraining_dataset.label_to_index.keys()))
            # Make sure labels are compatible with the pretraining dataset.
            for label in label_count['all']:
                if label not in pretraining_dataset.label_to_index:
                    raise AssertionError(
                        "The label {0} does not exist in the pretraining dataset. "
                        .format(label) +
                        "Please ensure that only the following labels exist in the dataset: {0}"
                        .format(', '.join(self.unique_labels)))
            label_to_index = pretraining_dataset.label_to_index.copy()
        else:
            label_to_index = {}
            iteration_number = 0
            for label, count in label_count['all'].items():
                label_to_index[label] = iteration_number
                iteration_number += 1
                self.unique_labels.append(label)

        if self.verbose:
            print('self.unique_labels: {0}'.format(self.unique_labels))

        character_to_index = {}
        iteration_number = 0
        for character, count in character_count['all'].items():
            if iteration_number == self.PADDING_CHARACTER_INDEX:
                iteration_number += 1
            character_to_index[character] = iteration_number
            iteration_number += 1

        if self.verbose:
            print('token_count[\'train\'][0:10]: {0}'.format(
                list(token_count['train'].items())[0:10]))
        token_to_index = utils.order_dictionary(token_to_index,
                                                'value',
                                                reverse=False)
        if self.verbose: print('token_to_index: {0}'.format(token_to_index))
        index_to_token = utils.reverse_dictionary(token_to_index)
        if parameters['remap_unknown_tokens_to_unk'] == 1:
            index_to_token[self.UNK_TOKEN_INDEX] = self.UNK
        if self.verbose: print('index_to_token: {0}'.format(index_to_token))

        if self.verbose:
            print('label_count[\'train\']: {0}'.format(label_count['train']))
        label_to_index = utils.order_dictionary(label_to_index,
                                                'value',
                                                reverse=False)
        if self.verbose: print('label_to_index: {0}'.format(label_to_index))
        index_to_label = utils.reverse_dictionary(label_to_index)
        if self.verbose: print('index_to_label: {0}'.format(index_to_label))

        character_to_index = utils.order_dictionary(character_to_index,
                                                    'value',
                                                    reverse=False)
        index_to_character = utils.reverse_dictionary(character_to_index)
        if self.verbose:
            print('character_to_index: {0}'.format(character_to_index))
        if self.verbose:
            print('index_to_character: {0}'.format(index_to_character))

        if self.verbose:
            print('labels[\'train\'][0:10]: {0}'.format(labels['train'][0:10]))
        if self.verbose:
            print('tokens[\'train\'][0:10]: {0}'.format(tokens['train'][0:10]))

        if self.verbose:
            # Print sequences of length 1 in train set
            for token_sequence, label_sequence in zip(tokens['train'],
                                                      labels['train']):
                if len(label_sequence) == 1 and label_sequence[0] != 'O':
                    print("{0}\t{1}".format(token_sequence[0],
                                            label_sequence[0]))

        self.token_to_index = token_to_index  # {token: index sau khi sắp xếp theo freq từ cao đến thấp, 0 nếu là unk token}
        self.index_to_token = index_to_token  # Ngược token_to_index

        self.index_to_character = index_to_character  # Ngược character_to_index
        self.character_to_index = character_to_index  # { character: index sau khi sắp xếp freq từ cao đến thấp}

        self.index_to_label = index_to_label  # Ngược label_to_index
        self.label_to_index = label_to_index  # {label: index sau khi sắp xếp asc}

        if self.verbose:
            print("len(self.token_to_index): {0}".format(
                len(self.token_to_index)))
        if self.verbose:
            print("len(self.index_to_token): {0}".format(
                len(self.index_to_token)))
        self.tokens = tokens
        self.labels = labels

        token_indices, label_indices, character_indices_padded, character_indices, token_lengths, characters, label_vector_indices = self._convert_to_indices(
            dataset_filepaths.keys())

        self.token_indices = token_indices
        self.label_indices = label_indices
        self.character_indices_padded = character_indices_padded
        self.character_indices = character_indices
        self.token_lengths = token_lengths
        self.characters = characters
        self.label_vector_indices = label_vector_indices

        self.number_of_classes = max(self.index_to_label.keys()) + 1
        self.vocabulary_size = max(self.index_to_token.keys()) + 1
        self.alphabet_size = max(self.index_to_character.keys()) + 1
        if self.verbose:
            print("self.number_of_classes: {0}".format(self.number_of_classes))
        if self.verbose:
            print("self.alphabet_size: {0}".format(self.alphabet_size))
        if self.verbose:
            print("self.vocabulary_size: {0}".format(self.vocabulary_size))

        # unique_labels_of_interest is used to compute F1-scores.
        self.unique_labels_of_interest = list(self.unique_labels)
        self.unique_labels_of_interest.remove('O')

        self.unique_label_indices_of_interest = []
        for lab in self.unique_labels_of_interest:
            self.unique_label_indices_of_interest.append(label_to_index[lab])

        self.infrequent_token_indices = infrequent_token_indices

        if self.verbose:
            print('self.unique_labels_of_interest: {0}'.format(
                self.unique_labels_of_interest))
        if self.verbose:
            print('self.unique_label_indices_of_interest: {0}'.format(
                self.unique_label_indices_of_interest))

        elapsed_time = time.time() - start_time
        print('done ({0:.2f} seconds)'.format(elapsed_time))

        return token_to_vector
Esempio n. 10
0
def get_inputs(dataset, token2idx, char2idx, label2idx, config):
    """Read one split of a CoNLL-style file and convert it to index sequences.

    Each non-blank line is split on ``config.separator``; the first field is
    the token and the last field is its label.  Blank lines and lines whose
    first field contains ``-DOCSTART-`` end the current sentence.

    Args:
        dataset: 'train', 'eval' or 'test'; selects ``config.path_train``,
            ``config.path_eval`` or ``config.path_test`` respectively.
        token2idx: mapping word -> index.  The '$UNK$' entry is used for
            words that are not in the mapping.
        char2idx: mapping character -> index.  Characters that are not in
            the mapping are reported on stdout and left out of the token's
            character list.
        label2idx: mapping label -> index.
        config: object exposing ``path_train`` / ``path_eval`` /
            ``path_test``, ``separator`` and ``lowercase``.

    Returns:
        A ``(tokens, labels)`` pair.  ``tokens`` is a list of sentences, each
        a list of ``(char_idxs, word_idx)`` tuples; ``labels`` is a list of
        sentences, each a list of label indices aligned with ``tokens``.

    Raises:
        ValueError: if ``dataset`` is not 'train', 'eval' or 'test'.
    """
    path_attributes = {
        'train': 'path_train',
        'eval': 'path_eval',
        'test': 'path_test',
    }
    if dataset not in path_attributes:
        # Fail with a clear message instead of letting codecs.open(None, ...)
        # raise an unrelated TypeError further down.
        raise ValueError("unknown dataset: {0}".format(dataset))
    # Only the attribute of the requested split is read from config.
    dataset_filepath = getattr(config, path_attributes[dataset])

    separator = config.separator
    lowercase = config.lowercase

    # Accumulators for the sentence currently being read.
    # format [([char_idxs], word_idx), ...]
    sentence_token = []
    # format [label_idx, ...]
    sentence_label = []

    # format [[sentence1_token], [sentence2_token], ...]
    tokens = []
    # format [[sentence1_label], [sentence2_label], ...]
    labels = []

    # Go through the whole CoNLL file; the context manager closes the file
    # even if a lookup below raises.
    with codecs.open(dataset_filepath, 'r', 'UTF-8') as f:
        for line in f:
            line = line.strip().split(separator)
            # Sentence boundary: flush the sentence collected so far.
            if len(line) == 0 or len(line[0]) == 0 or '-DOCSTART-' in line[0]:
                if sentence_token:
                    labels.append(sentence_label)
                    tokens.append(sentence_token)
                    sentence_label = []
                    sentence_token = []
                continue

            token = line[0]
            label = line[-1]
            word = token.lower() if lowercase else token

            # Character indices; unknown characters are reported and dropped.
            char_idxs = []
            for char in word:
                if char in char2idx:
                    char_idxs.append(char2idx[char])
                else:
                    print("encounter UNK char:", char)

            # Word index, falling back to the '$UNK$' entry.
            if word in token2idx:
                word_idx = token2idx[word]
            else:
                word_idx = token2idx['$UNK$']

            # Label index.  A token whose label is unknown is reported and
            # skipped as a whole: previously the index of the preceding token
            # was silently reused (or an UnboundLocalError was raised when
            # the very first token had an unknown label).
            if label not in label2idx:
                print("encounter UNK label:", label)
                continue

            sentence_token.append((char_idxs, word_idx))
            sentence_label.append(label2idx[label])

    # The file may not end with a blank line: flush the last sentence.
    if sentence_token:
        tokens.append(sentence_token)
        labels.append(sentence_label)

    return tokens, labels
Esempio n. 11
0
    def load_dataset(self, dataset_filepaths, parameters):
        '''
        Parse the dataset splits and build the vocabularies and index arrays.

        dataset_filepaths : dictionary that may contain the keys 'train',
            'valid', 'test' and 'deploy'. All four splits are parsed (a
            missing key is handed to self._parse_dataset as None), but index
            arrays are only built for the keys actually present.
        parameters : dictionary. This method reads
            'token_pretrained_embedding_filepath', 'use_pretrained_model',
            'pretrained_model_checkpoint_filepath' (only when a pretrained
            model is used) and 'remap_unknown_tokens_to_unk'.

        The method returns nothing; it stores its results on self:
        token_to_index / index_to_token, character_to_index /
        index_to_character, label_to_index / index_to_label, tokens, labels,
        characters, token_lengths, token_indices, character_indices,
        character_indices_padded, label_indices, label_vector_indices,
        number_of_classes, vocabulary_size, alphabet_size, unique_labels,
        unique_labels_of_interest, unique_label_indices_of_interest,
        infrequent_token_indices and tokens_mapped_to_unk.

        Raises AssertionError when a pretrained model is used and the dataset
        contains a label that the pretraining dataset does not know.
        '''
        start_time = time.time()
        print('Load dataset... ', end='', flush=True)
        all_pretrained_tokens = []
        if parameters['token_pretrained_embedding_filepath'] != '':
            all_pretrained_tokens = utils_nlp.load_tokens_from_pretrained_token_embeddings(
                parameters)
        if self.verbose:
            print("len(all_pretrained_tokens): {0}".format(
                len(all_pretrained_tokens)))
        # Membership is tested up to three times per vocabulary token below;
        # use a hashed container so each test is O(1) rather than a scan.
        if not isinstance(all_pretrained_tokens, (set, frozenset, dict)):
            all_pretrained_tokens = set(all_pretrained_tokens)

        # Load pretraining dataset to ensure that index to label is compatible to the pretrained model,
        #   and that token embeddings that are learned in the pretrained model are loaded properly.
        all_tokens_in_pretraining_dataset = set()
        if parameters['use_pretrained_model']:
            pretrained_model_folder = os.path.dirname(
                parameters['pretrained_model_checkpoint_filepath'])
            pickle_filepath = os.path.join(pretrained_model_folder,
                                           'dataset.pickle')
            # NOTE(review): pickle.load can execute arbitrary code; only
            # point this at checkpoints from a trusted source.
            with open(pickle_filepath, 'rb') as pickle_file:
                pretraining_dataset = pickle.load(pickle_file)
            all_tokens_in_pretraining_dataset = set(
                pretraining_dataset.index_to_token.values())

        remap_to_unk_count_threshold = 1
        self.UNK_TOKEN_INDEX = 0
        self.PADDING_CHARACTER_INDEX = 0
        self.tokens_mapped_to_unk = []
        self.UNK = 'UNK'
        self.unique_labels = []
        labels = {}
        tokens = {}
        characters = {}
        token_lengths = {}
        label_count = {}
        token_count = {}
        character_count = {}
        dataset_types = ['train', 'valid', 'test', 'deploy']
        for dataset_type in dataset_types:
            (labels[dataset_type], tokens[dataset_type],
             token_count[dataset_type], label_count[dataset_type],
             character_count[dataset_type]) = self._parse_dataset(
                 dataset_filepaths.get(dataset_type, None))

            if self.verbose:
                print("dataset_type: {0}".format(dataset_type))
                print("len(token_count[dataset_type]): {0}".format(
                    len(token_count[dataset_type])))

        def merge_counts(counts_by_type):
            # Sum the per-split counters into one {key: total count} dict.
            # The per-split counters are indexed with keys they may not hold,
            # so they are presumably defaultdict(int) -- TODO confirm against
            # _parse_dataset. Such indexing inserts the missing key, hence the
            # key list is fully materialized before the summing loop starts.
            merged = {}
            all_keys = [
                key for dataset_type in dataset_types
                for key in list(counts_by_type[dataset_type].keys())
            ]
            for key in all_keys:
                merged[key] = sum(counts_by_type[dataset_type][key]
                                  for dataset_type in dataset_types)
            return merged

        token_count['all'] = merge_counts(token_count)

        for dataset_type in dataset_filepaths.keys():
            if self.verbose:
                print("dataset_type: {0}".format(dataset_type))
                print("len(token_count[dataset_type]): {0}".format(
                    len(token_count[dataset_type])))

        character_count['all'] = merge_counts(character_count)
        label_count['all'] = merge_counts(label_count)

        token_count['all'] = utils.order_dictionary(token_count['all'],
                                                    'value_key',
                                                    reverse=True)
        label_count['all'] = utils.order_dictionary(label_count['all'],
                                                    'key',
                                                    reverse=False)
        character_count['all'] = utils.order_dictionary(character_count['all'],
                                                        'value',
                                                        reverse=True)
        if self.verbose:
            print("character_count['all']: {0}".format(
                character_count['all']))

        # Assign token indices in the order of token_count['all'], keeping
        # index UNK_TOKEN_INDEX reserved for the UNK token.
        token_to_index = {}
        token_to_index[self.UNK] = self.UNK_TOKEN_INDEX
        iteration_number = 0
        number_of_unknown_tokens = 0
        if self.verbose:
            print("parameters['remap_unknown_tokens_to_unk']: {0}".format(
                parameters['remap_unknown_tokens_to_unk']))
            print("len(token_count['train'].keys()): {0}".format(
                len(token_count['train'].keys())))
        remap_unknown_tokens = parameters['remap_unknown_tokens_to_unk'] == 1
        digit_pattern = re.compile(r'\d')
        for token in token_count['all']:
            if iteration_number == self.UNK_TOKEN_INDEX:
                iteration_number += 1

            # A token is mapped to UNK when remapping is enabled, the token
            # never occurs in the train split, and neither the pretrained
            # embeddings (as-is, lowercased, or lowercased with digits
            # replaced by '0') nor the pretraining dataset contain it.
            is_unknown = (
                remap_unknown_tokens
                and token_count['train'][token] == 0
                and token not in all_pretrained_tokens
                and token.lower() not in all_pretrained_tokens
                and digit_pattern.sub('0', token.lower()) not in all_pretrained_tokens
                and token not in all_tokens_in_pretraining_dataset)

            if is_unknown:
                if self.verbose:
                    print("token: {0}".format(token))
                    print("token.lower(): {0}".format(token.lower()))
                    print("re.sub('\\d', '0', token.lower()): {0}".format(
                        digit_pattern.sub('0', token.lower())))
                token_to_index[token] = self.UNK_TOKEN_INDEX
                number_of_unknown_tokens += 1
                self.tokens_mapped_to_unk.append(token)
            else:
                token_to_index[token] = iteration_number
                iteration_number += 1
        if self.verbose:
            print("number_of_unknown_tokens: {0}".format(
                number_of_unknown_tokens))

        # Indices of tokens seen in the train split at most
        # remap_to_unk_count_threshold times (zero counts are excluded).
        infrequent_token_indices = [
            token_to_index[token]
            for token, count in token_count['train'].items()
            if 0 < count <= remap_to_unk_count_threshold
        ]
        if self.verbose:
            print("len(token_count['train']): {0}".format(
                len(token_count['train'])))
            print("len(infrequent_token_indices): {0}".format(
                len(infrequent_token_indices)))

        # Ensure that both B- and I- versions exist for each label
        labels_without_bio = set()
        for label in label_count['all'].keys():
            labels_without_bio.add(
                utils_nlp.remove_bio_from_label_name(label))
        for label in labels_without_bio:
            if label == 'O':
                continue
            for prefixed_label in ('B-' + label, 'I-' + label):
                if prefixed_label not in label_count['all']:
                    label_count['all'][prefixed_label] = 0
        label_count['all'] = utils.order_dictionary(label_count['all'],
                                                    'key',
                                                    reverse=False)

        if parameters['use_pretrained_model']:
            self.unique_labels = sorted(
                list(pretraining_dataset.label_to_index.keys()))
            # Make sure labels are compatible with the pretraining dataset.
            for label in label_count['all']:
                if label not in pretraining_dataset.label_to_index:
                    raise AssertionError(
                        "The label {0} does not exist in the pretraining dataset. "
                        .format(label) +
                        "Please ensure that only the following labels exist in the dataset: {0}"
                        .format(', '.join(self.unique_labels)))
            label_to_index = pretraining_dataset.label_to_index.copy()
        else:
            label_to_index = {}
            for label_index, label in enumerate(label_count['all']):
                label_to_index[label] = label_index
                self.unique_labels.append(label)

        if self.verbose:
            print('self.unique_labels: {0}'.format(self.unique_labels))

        # Assign character indices in the order of character_count['all'],
        # keeping index PADDING_CHARACTER_INDEX free for padding.
        character_to_index = {}
        iteration_number = 0
        for character in character_count['all']:
            if iteration_number == self.PADDING_CHARACTER_INDEX:
                iteration_number += 1
            character_to_index[character] = iteration_number
            iteration_number += 1

        if self.verbose:
            print("token_count['train'][0:10]: {0}".format(
                list(token_count['train'].items())[0:10]))
        token_to_index = utils.order_dictionary(token_to_index,
                                                'value',
                                                reverse=False)
        if self.verbose:
            print('token_to_index: {0}'.format(token_to_index))
        index_to_token = utils.reverse_dictionary(token_to_index)
        if remap_unknown_tokens:
            # Several tokens may share UNK_TOKEN_INDEX; make sure the reverse
            # mapping of that index is the UNK token itself.
            index_to_token[self.UNK_TOKEN_INDEX] = self.UNK
        if self.verbose:
            print('index_to_token: {0}'.format(index_to_token))

        if self.verbose:
            print("label_count['train']: {0}".format(label_count['train']))
        label_to_index = utils.order_dictionary(label_to_index,
                                                'value',
                                                reverse=False)
        if self.verbose:
            print('label_to_index: {0}'.format(label_to_index))
        index_to_label = utils.reverse_dictionary(label_to_index)
        if self.verbose:
            print('index_to_label: {0}'.format(index_to_label))

        character_to_index = utils.order_dictionary(character_to_index,
                                                    'value',
                                                    reverse=False)
        index_to_character = utils.reverse_dictionary(character_to_index)
        if self.verbose:
            print('character_to_index: {0}'.format(character_to_index))
            print('index_to_character: {0}'.format(index_to_character))

            print("labels['train'][0:10]: {0}".format(labels['train'][0:10]))
            print("tokens['train'][0:10]: {0}".format(tokens['train'][0:10]))

            # Print sequences of length 1 in train set
            for token_sequence, label_sequence in zip(tokens['train'],
                                                      labels['train']):
                if len(label_sequence) == 1 and label_sequence[0] != 'O':
                    print("{0}\t{1}".format(token_sequence[0],
                                            label_sequence[0]))

        # Map tokens and labels to their indices
        token_indices = {}
        label_indices = {}
        character_indices = {}
        character_indices_padded = {}
        for dataset_type in dataset_filepaths.keys():
            token_indices[dataset_type] = []
            characters[dataset_type] = []
            character_indices[dataset_type] = []
            token_lengths[dataset_type] = []
            character_indices_padded[dataset_type] = []
            for token_sequence in tokens[dataset_type]:
                token_indices[dataset_type].append(
                    [token_to_index[token] for token in token_sequence])
                characters[dataset_type].append(
                    [list(token) for token in token_sequence])
                sequence_character_indices = [
                    [character_to_index[character] for character in token]
                    for token in token_sequence
                ]
                character_indices[dataset_type].append(
                    sequence_character_indices)
                sequence_token_lengths = [
                    len(token) for token in token_sequence
                ]
                token_lengths[dataset_type].append(sequence_token_lengths)

                # Pad every token of the sequence to the length of the
                # longest token in that same sequence.
                longest_token_length_in_sequence = max(sequence_token_lengths)
                character_indices_padded[dataset_type].append([
                    utils.pad_list(temp_token_indices,
                                   longest_token_length_in_sequence,
                                   self.PADDING_CHARACTER_INDEX)
                    for temp_token_indices in sequence_character_indices
                ])

            label_indices[dataset_type] = [
                [label_to_index[label] for label in label_sequence]
                for label_sequence in labels[dataset_type]
            ]

        # These debug prints look at the first train sequence; skip them when
        # no (non-empty) train split was indexed instead of crashing.
        if self.verbose and token_lengths.get('train'):
            print("token_lengths['train'][0][0:10]: {0}".format(
                token_lengths['train'][0][0:10]))
            print("characters['train'][0][0:10]: {0}".format(
                characters['train'][0][0:10]))
            print("token_indices['train'][0:10]: {0}".format(
                token_indices['train'][0:10]))
            print("label_indices['train'][0:10]: {0}".format(
                label_indices['train'][0:10]))
            print("character_indices['train'][0][0:10]: {0}".format(
                character_indices['train'][0][0:10]))
            print("character_indices_padded['train'][0][0:10]: {0}".format(
                character_indices_padded['train'][0][0:10]))

        # Vectorize the labels
        # [Numpy 1-hot array](http://stackoverflow.com/a/42263603/395857)
        label_binarizer = sklearn.preprocessing.LabelBinarizer()
        label_binarizer.fit(range(max(index_to_label.keys()) + 1))
        label_vector_indices = {}
        for dataset_type in dataset_filepaths.keys():
            label_vector_indices[dataset_type] = [
                label_binarizer.transform(label_indices_sequence)
                for label_indices_sequence in label_indices[dataset_type]
            ]

        if self.verbose and 'train' in label_vector_indices:
            print("label_vector_indices['train'][0:2]: {0}".format(
                label_vector_indices['train'][0:2]))
            print("len(label_vector_indices['train']): {0}".format(
                len(label_vector_indices['train'])))

        self.token_to_index = token_to_index
        self.index_to_token = index_to_token
        self.token_indices = token_indices
        self.label_indices = label_indices
        self.character_indices_padded = character_indices_padded
        self.index_to_character = index_to_character
        self.character_to_index = character_to_index
        self.character_indices = character_indices
        self.token_lengths = token_lengths
        self.characters = characters
        self.tokens = tokens
        self.labels = labels
        self.label_vector_indices = label_vector_indices
        self.index_to_label = index_to_label
        self.label_to_index = label_to_index
        if self.verbose:
            print("len(self.token_to_index): {0}".format(
                len(self.token_to_index)))
            print("len(self.index_to_token): {0}".format(
                len(self.index_to_token)))

        self.number_of_classes = max(self.index_to_label.keys()) + 1
        self.vocabulary_size = max(self.index_to_token.keys()) + 1
        self.alphabet_size = max(self.index_to_character.keys()) + 1
        if self.verbose:
            print("self.number_of_classes: {0}".format(self.number_of_classes))
            print("self.alphabet_size: {0}".format(self.alphabet_size))
            print("self.vocabulary_size: {0}".format(self.vocabulary_size))

        # unique_labels_of_interest is used to compute F1-scores.
        # Filtering (instead of list.remove) also works when the label set
        # contains no 'O' label at all.
        self.unique_labels_of_interest = [
            label for label in self.unique_labels if label != 'O'
        ]
        self.unique_label_indices_of_interest = [
            label_to_index[lab] for lab in self.unique_labels_of_interest
        ]

        self.infrequent_token_indices = infrequent_token_indices

        if self.verbose:
            print('self.unique_labels_of_interest: {0}'.format(
                self.unique_labels_of_interest))
            print('self.unique_label_indices_of_interest: {0}'.format(
                self.unique_label_indices_of_interest))

        elapsed_time = time.time() - start_time
        print('done ({0:.2f} seconds)'.format(elapsed_time))
Esempio n. 12
0
    def load_dataset(self, dataset_filepaths, parameters):
        '''
        args:
        dataset_filepaths : dictionary with keys 'train', 'valid', 'test'
        http://stackoverflow.com/questions/27416164/what-is-conll-data-format
        '''
        start_time = time.time()
        print('Load dataset... ', end='', flush=True)
        all_pretrained_tokens = None
        if parameters['token_pretrained_embedding_filepath'] != '':
            all_pretrained_tokens = utils_nlp.load_tokens_from_pretrained_token_embeddings(
                parameters)
        if self.verbose:
            print("len(all_pretrained_tokens): {0}".format(
                len(all_pretrained_tokens)))

        remap_to_unk_count_threshold = 1
        #if ['train'] not in dataset_filepaths.keys(): raise ValueError('')
        self.UNK_TOKEN_INDEX = 0
        self.PADDING_CHARACTER_INDEX = 0
        self.tokens_mapped_to_unk = []
        self.UNK = 'UNK'
        self.unique_labels = []
        labels = {}
        tokens = {}
        characters = {}
        token_lengths = {}
        label_count = {}
        token_count = {}
        character_count = {}
        for dataset_type in ['train', 'valid', 'test']:
            labels[dataset_type], tokens[dataset_type], token_count[dataset_type], label_count[dataset_type], \
                character_count[dataset_type] = self._parse_dataset(dataset_filepaths[dataset_type],dataset_type)#,all_pretrained_tokens,token_count)
            if self.verbose: print("dataset_type: {0}".format(dataset_type))
            if self.verbose:
                print("len(token_count[dataset_type]): {0}".format(
                    len(token_count[dataset_type])))

        token_count['all'] = {}  # utils.merge_dictionaries()
        for token in list(token_count['train'].keys()) + list(
                token_count['valid'].keys()) + list(
                    token_count['test'].keys()):
            token_count['all'][
                token] = token_count['train'][token] + token_count['valid'][
                    token] + token_count['test'][token]

        for dataset_type in ['train', 'valid', 'test']:
            if self.verbose: print("dataset_type: {0}".format(dataset_type))
            if self.verbose:
                print("len(token_count[dataset_type]): {0}".format(
                    len(token_count[dataset_type])))

        character_count['all'] = {}  # utils.merge_dictionaries()
        for character in list(character_count['train'].keys()) + list(
                character_count['valid'].keys()) + list(
                    character_count['test'].keys()):
            character_count['all'][character] = character_count['train'][
                character] + character_count['valid'][
                    character] + character_count['test'][character]

        label_count['all'] = {}  # utils.merge_dictionaries()
        for character in list(label_count['train'].keys()) + list(
                label_count['valid'].keys()) + list(
                    label_count['test'].keys()):
            label_count['all'][
                character] = label_count['train'][character] + label_count[
                    'valid'][character] + label_count['test'][character]

        token_count['all'] = utils.order_dictionary(token_count['all'],
                                                    'value',
                                                    reverse=True)
        #label_count['train'] = utils.order_dictionary(label_count['train'], 'key', reverse = False)
        label_count['all'] = utils.order_dictionary(label_count['all'],
                                                    'key',
                                                    reverse=False)
        label_count['train'] = utils.order_dictionary(label_count['train'],
                                                      'key',
                                                      reverse=False)
        character_count['all'] = utils.order_dictionary(character_count['all'],
                                                        'value',
                                                        reverse=True)
        if self.verbose:
            print('character_count[\'all\']: {0}'.format(
                character_count['all']))

        token_to_index = {}
        token_to_index[self.UNK] = self.UNK_TOKEN_INDEX
        iteration_number = 0
        number_of_unknown_tokens = 0
        for token, count in token_count['all'].items():
            if iteration_number == self.UNK_TOKEN_INDEX: iteration_number += 1

            if parameters['remove_unknown_tokens'] == 1 and \
                token_count['train'][token] == 0 and \
                (all_pretrained_tokens == None or \
                token not in all_pretrained_tokens and \
                token.lower() not in all_pretrained_tokens and \
                re.sub('\d', '0', token.lower()) not in all_pretrained_tokens):

                token_to_index[token] = self.UNK_TOKEN_INDEX
                number_of_unknown_tokens += 1
                self.tokens_mapped_to_unk.append(token)
            else:
                token_to_index[token] = iteration_number
                iteration_number += 1
        if self.verbose:
            print("number_of_unknown_tokens: {0}".format(
                number_of_unknown_tokens))
        #         0/0

        infrequent_token_indices = []
        for token, count in token_count['train'].items():
            if 0 < count <= remap_to_unk_count_threshold:
                infrequent_token_indices.append(token_to_index[token])
        if self.verbose:
            print("len(token_count['train']): {0}".format(
                len(token_count['train'])))
        if self.verbose:
            print("len(infrequent_token_indices): {0}".format(
                len(infrequent_token_indices)))

        label_to_index = {}
        iteration_number = 0
        #for label, count in label_count['train'].items():
        for label, count in label_count['all'].items():
            label_to_index[label] = iteration_number
            iteration_number += 1
            self.unique_labels.append(label)

        #for label, count in label_count['train'].items():
        #    self.unique_labels.append(label)

        if self.verbose:
            print('self.unique_labels: {0}'.format(self.unique_labels))

        character_to_index = {}
        iteration_number = 0
        for character, count in character_count['all'].items():
            if iteration_number == self.PADDING_CHARACTER_INDEX:
                iteration_number += 1
            character_to_index[character] = iteration_number
            iteration_number += 1

        if self.verbose:
            print('token_count[\'train\'][0:10]: {0}'.format(
                list(token_count['train'].items())[0:10]))
        token_to_index = utils.order_dictionary(token_to_index,
                                                'value',
                                                reverse=False)
        index_to_token = utils.reverse_dictionary(token_to_index)
        if parameters['remove_unknown_tokens'] == 1:
            index_to_token[self.UNK_TOKEN_INDEX] = self.UNK
        label_to_index = utils.order_dictionary(label_to_index,
                                                'value',
                                                reverse=False)
        if self.verbose: print('label_to_index: {0}'.format(label_to_index))
        index_to_label = utils.reverse_dictionary(label_to_index)
        if self.verbose: print('index_to_label: {0}'.format(index_to_label))

        index_to_character = utils.reverse_dictionary(character_to_index)
        if self.verbose:
            print('character_to_index: {0}'.format(character_to_index))
        if self.verbose:
            print('index_to_character: {0}'.format(index_to_character))

        if self.verbose:
            print('labels[\'train\'][0:10]: {0}'.format(labels['train'][0:10]))
        if self.verbose:
            print('tokens[\'train\'][0:10]: {0}'.format(tokens['train'][0:10]))

        # Map tokens and labels to their indices
        token_indices = {}
        label_indices = {}
        character_indices = {}
        character_indices_padded = {}
        for dataset_type in ['train', 'valid', 'test']:
            token_indices[dataset_type] = []
            characters[dataset_type] = []
            character_indices[dataset_type] = []
            token_lengths[dataset_type] = []
            character_indices_padded[dataset_type] = []
            for token_sequence in tokens[dataset_type]:
                token_indices[dataset_type].append(
                    [token_to_index[token] for token in token_sequence])
                characters[dataset_type].append(
                    [list(token) for token in token_sequence])
                character_indices[dataset_type].append(
                    [[character_to_index[character] for character in token]
                     for token in token_sequence])
                token_lengths[dataset_type].append(
                    [len(token) for token in token_sequence])

                longest_token_length_in_sequence = max(
                    token_lengths[dataset_type][-1])
                character_indices_padded[dataset_type].append([
                    utils.pad_list(temp_token_indices,
                                   longest_token_length_in_sequence,
                                   self.PADDING_CHARACTER_INDEX) for
                    temp_token_indices in character_indices[dataset_type][-1]
                ])

            label_indices[dataset_type] = []
            for label_sequence in labels[dataset_type]:
                label_indices[dataset_type].append(
                    [label_to_index[label] for label in label_sequence])

        if self.verbose:
            print('token_lengths[\'train\'][0][0:10]: {0}'.format(
                token_lengths['train'][0][0:10]))
        if self.verbose:
            print('characters[\'train\'][0][0:10]: {0}'.format(
                characters['train'][0][0:10]))
        if self.verbose:
            print('token_indices[\'train\'][0:10]: {0}'.format(
                token_indices['train'][0:10]))
        if self.verbose:
            print('label_indices[\'train\'][0:10]: {0}'.format(
                label_indices['train'][0:10]))
        if self.verbose:
            print('character_indices[\'train\'][0][0:10]: {0}'.format(
                character_indices['train'][0][0:10]))
        if self.verbose:
            print('character_indices_padded[\'train\'][0][0:10]: {0}'.format(
                character_indices_padded['train'][0][0:10]))

        #  Vectorize the labels
        # [Numpy 1-hot array](http://stackoverflow.com/a/42263603/395857)
        label_binarizer = sklearn.preprocessing.LabelBinarizer()
        label_binarizer.fit(range(max(index_to_label.keys()) + 1))
        label_vector_indices = {}
        for dataset_type in ['train', 'valid', 'test']:
            label_vector_indices[dataset_type] = []
            for label_indices_sequence in label_indices[dataset_type]:
                label_vector_indices[dataset_type].append(
                    label_binarizer.transform(label_indices_sequence))

        if self.verbose:
            print('label_vector_indices[\'train\'][0:2]: {0}'.format(
                label_vector_indices['train'][0:2]))

        if self.verbose:
            print('len(label_vector_indices[\'train\']): {0}'.format(
                len(label_vector_indices['train'])))
        self.token_to_index = token_to_index
        self.index_to_token = index_to_token
        self.token_indices = token_indices
        self.label_indices = label_indices
        self.character_indices_padded = character_indices_padded
        self.index_to_character = index_to_character
        self.character_to_index = character_to_index
        self.character_indices = character_indices
        self.token_lengths = token_lengths
        self.characters = characters
        self.tokens = tokens
        self.labels = labels
        self.label_vector_indices = label_vector_indices
        self.index_to_label = index_to_label
        self.label_to_index = label_to_index
        if self.verbose:
            print("len(self.token_to_index): {0}".format(
                len(self.token_to_index)))
        if self.verbose:
            print("len(self.index_to_token): {0}".format(
                len(self.index_to_token)))

        self.number_of_classes = max(self.index_to_label.keys()) + 1
        self.vocabulary_size = max(self.index_to_token.keys()) + 1
        self.alphabet_size = max(self.index_to_character.keys()) + 1
        if self.verbose:
            print("self.number_of_classes: {0}".format(self.number_of_classes))
        if self.verbose:
            print("self.alphabet_size: {0}".format(self.alphabet_size))
        if self.verbose:
            print("self.vocabulary_size: {0}".format(self.vocabulary_size))

        # unique_labels_of_interest is used to compute F1-scores.
        self.unique_labels_of_interest = list(self.unique_labels)
        self.unique_labels_of_interest.remove('O')

        self.unique_label_indices_of_interest = []
        for lab in self.unique_labels_of_interest:
            self.unique_label_indices_of_interest.append(label_to_index[lab])

        self.infrequent_token_indices = infrequent_token_indices

        if self.verbose:
            print('self.unique_labels_of_interest: {0}'.format(
                self.unique_labels_of_interest))
        if self.verbose:
            print('self.unique_label_indices_of_interest: {0}'.format(
                self.unique_label_indices_of_interest))

        elapsed_time = time.time() - start_time
        print('done ({0:.2f} seconds)'.format(elapsed_time))