Example #1
0
    def count_data_and_build_dict(self, data_list, gene_dicts=True):
        """Gather length statistics from ``data_list`` and optionally build vocabularies.

        Args:
            data_list: iterable of samples. Each sample is an iterable of tree
                nodes, and each node is indexed with ``'token_seq'`` (a list of
                tokens) and ``'char_seq'`` (a list of per-token character
                lists).
            gene_dicts: when true, build the token / char / glove vocabulary
                lists; when false, an empty dict is returned in their place.

        Returns:
            A ``(dicts, max_lens)`` tuple. ``dicts`` is either ``{}`` or a dict
            with keys ``'token'``, ``'char'`` and ``'glove'``; the token and
            char lists start with ``'@@@empty'`` and ``'@@@unk'``. ``max_lens``
            is ``{'sent': ..., 'token': ...}`` taken from the first element of
            what ``dynamic_length`` returns.
        """
        def add_ept_and_unk(a_list):
            # Reserve indices 0 and 1 for the padding and unknown symbols.
            a_list.insert(0, '@@@empty')
            a_list.insert(1, '@@@unk')
            return a_list

        # NOTE(review): add() is called twice but done() only once below;
        # kept as in the original -- confirm against the logger's semantics.
        _logger.add()
        _logger.add('counting and build dictionaries')

        token_collection = []
        char_collection = []

        sent_len_collection = []
        token_len_collection = []

        for sample in data_list:
            for tree_node in sample:
                token_seq = tree_node['token_seq']
                token_collection += token_seq
                sent_len_collection.append(len(token_seq))
                for char_seq in tree_node['char_seq']:
                    char_collection += char_seq
                    token_len_collection.append(len(char_seq))

        max_sent_len = dynamic_length(sent_len_collection, 1, security=False)[0]
        max_token_len = dynamic_length(token_len_collection, 0.99, security=False)[0]

        if gene_dicts:
            # token & char
            tokenSet = dynamic_keep(token_collection, 1)
            charSet = dynamic_keep(char_collection, 1)

            # Lower-casing the corpus tokens is needed in both branches below.
            if cfg.lower_word:
                tokenSet = list({token.lower() for token in tokenSet})

            if cfg.use_glove_unk_token:
                gloveData = load_glove(cfg.word_embedding_length)
                gloveTokenSet = list(gloveData.keys())
                if cfg.lower_word:
                    gloveTokenSet = list({token.lower() for token in gloveTokenSet})

                # Drop every glove token that already appears in tokenSet.
                # A single pass with set membership replaces the former
                # per-token list.remove(), which rescanned the whole glove
                # list for each corpus token. The glove list holds unique
                # entries here, so the resulting list and its order are the
                # same.
                corpus_tokens = set(tokenSet)
                gloveTokenSet = [token for token in gloveTokenSet
                                 if token not in corpus_tokens]
            else:
                gloveTokenSet = []

            tokenSet = add_ept_and_unk(tokenSet)
            charSet = add_ept_and_unk(charSet)
            dicts = {'token': tokenSet, 'char': charSet, 'glove': gloveTokenSet}
        else:
            dicts = {}

        _logger.done()
        return dicts, {'sent': max_sent_len, 'token': max_token_len}
Example #2
0
    def count_data_and_build_dict(dataset, sent_len_rate, gene_dicts=True):
        """Gather length statistics from ``dataset`` and optionally build vocabularies.

        Args:
            dataset: iterable of topics. Each topic is indexed with
                ``'paragraphs'``; each paragraph with ``'context_token'`` (a
                list of sentences, each a list of tokens) and ``'qas'`` (a
                list of items indexed with ``'question_token'``).
            sent_len_rate: rate passed to ``dynamic_length`` when computing the
                sentence-length limit.
            gene_dicts: when true, build the token / glove vocabulary lists;
                when false, an empty dict is returned in their place.

        Returns:
            A ``(dicts, max_lens)`` tuple. ``dicts`` is either ``{}`` or a dict
            with keys ``'token'`` and ``'glove'``; the token list starts with
            ``'@@@empty'`` and ``'@@@unk'``. ``max_lens`` has the keys
            ``'sent_num'``, ``'sent_len'`` and ``'question'``.
        """
        def add_ept_and_unk(a_list):
            # Reserve indices 0 and 1 for the padding and unknown symbols.
            a_list.insert(0, '@@@empty')
            a_list.insert(1, '@@@unk')
            return a_list

        _logger.add()
        _logger.add('counting and build dictionaries')

        token_collection = []
        sent_num_collection = []
        sent_len_collection = []
        question_len_collection = []

        for topic in dataset:
            for paragraph in topic['paragraphs']:
                context_sents = paragraph['context_token']
                sent_num_collection.append(len(context_sents))
                for sent_token in context_sents:
                    sent_len_collection.append(len(sent_token))
                    token_collection += sent_token
                for qa in paragraph['qas']:
                    question_token = qa['question_token']
                    question_len_collection.append(len(question_token))
                    token_collection += question_token

        _logger.done()

        max_sent_num, _ = dynamic_length(sent_num_collection, 1.)
        max_sent_len, _ = dynamic_length(sent_len_collection, sent_len_rate)
        max_question_len, _ = dynamic_length(question_len_collection, 0.995)

        if gene_dicts:
            tokenSet = dynamic_keep(token_collection, 0.995)

            # Lower-casing the corpus tokens is needed in both branches below.
            if cfg.lower_word:
                tokenSet = list({token.lower() for token in tokenSet})

            if cfg.use_glove_unk_token:
                gloveData = load_glove(cfg.word_embedding_length)
                gloveTokenSet = list(gloveData.keys())
                if cfg.lower_word:
                    gloveTokenSet = list({token.lower() for token in gloveTokenSet})

                # Drop every glove token that already appears in tokenSet.
                # A single pass with set membership replaces the former
                # per-token list.remove(), which rescanned the whole glove
                # list for each corpus token. The glove list holds unique
                # entries here, so the resulting list and its order are the
                # same.
                corpus_tokens = set(tokenSet)
                gloveTokenSet = [token for token in gloveTokenSet
                                 if token not in corpus_tokens]
            else:
                gloveTokenSet = []

            tokenSet = add_ept_and_unk(tokenSet)
            dicts = {'token': tokenSet, 'glove': gloveTokenSet}
        else:
            dicts = {}
        _logger.done()
        return dicts, {
            'sent_num': max_sent_num,
            'sent_len': max_sent_len,
            'question': max_question_len
        }