Example #1
    def show_training_data(self):
        train = self.train
        dev = self.dev
        self.log('### Loaded data')
        self.log('# train: {} ... {}\n'.format(train.inputs[0][0], train.inputs[0][-1]))
        self.log('# train_gold: {} ... {}\n'.format(train.outputs[0][0], train.outputs[0][-1]))
        t2i_tmp = list(self.dic.tables[constants.UNIGRAM].str2id.items())
        self.log('# token2id: {} ... {}\n'.format(t2i_tmp[:10], t2i_tmp[len(t2i_tmp)-10:]))
        if self.dic.has_table(constants.BIGRAM):
            b2i_tmp = list(self.dic.tables[constants.BIGRAM].str2id.items())
            self.log('# bigram2id: {} ... {}\n'.format(b2i_tmp[:10], b2i_tmp[len(b2i_tmp)-10:]))
        if self.dic.has_trie(constants.CHUNK):
            id2chunk = self.dic.tries[constants.CHUNK].id2chunk
            n_chunks = len(self.dic.tries[constants.CHUNK])
            c2i_head = [(id2chunk[i], i) for i in range(0, min(10, n_chunks))]
            c2i_tail = [(id2chunk[i], i) for i in range(max(0, n_chunks-10), n_chunks)]
            self.log('# chunk2id: {} ... {}\n'.format(c2i_head, c2i_tail))
        if self.dic.has_table(constants.SEG_LABEL):
            id2seg = {v:k for k,v in self.dic.tables[constants.SEG_LABEL].str2id.items()}
            self.log('# label_set: {}\n'.format(id2seg))

        attr_indexes = common.get_attribute_values(self.args.attr_indexes)
        for i in range(len(attr_indexes)):
            if self.dic.has_table(constants.ATTR_LABEL(i)):
                id2attr = {v:k for k,v in self.dic.tables[constants.ATTR_LABEL(i)].str2id.items()}
                self.log('# {}-th attribute labels: {}\n'.format(i, id2attr))
        
        self.report('[INFO] vocab: {}'.format(len(self.dic.tables[constants.UNIGRAM])))
        self.report('[INFO] data length: train={} devel={}'.format(
            len(train.inputs[0]), len(dev.inputs[0]) if dev else 0))
Example #2
    def show_training_data(self):
        train = self.train
        dev = self.dev
        self.log('### Loaded data')
        self.log('# train: {} ... {}\n'.format(train.inputs[0][0],
                                               train.inputs[0][-1]))
        self.log('# train_gold_attr: {} ... {}\n'.format(
            train.outputs[0][0], train.outputs[0][-1]))
        t2i_tmp = list(self.dic.tables[constants.UNIGRAM].str2id.items())
        self.log('# token2id: {} ... {}\n'.format(t2i_tmp[:10],
                                                  t2i_tmp[len(t2i_tmp) - 10:]))

        attr_indexes = common.get_attribute_values(self.args.attr_indexes)
        for i in range(len(attr_indexes)):
            if self.dic.has_table(constants.ATTR_LABEL(i)):
                id2attr = {
                    v: k
                    for k, v in self.dic.tables[constants.ATTR_LABEL(
                        i)].str2id.items()
                }
                self.log('# {}-th attribute labels: {}\n'.format(i, id2attr))

        self.report('[INFO] vocab: {}'.format(
            len(self.dic.tables[constants.UNIGRAM])))
        self.report('[INFO] data length: train={} devel={}'.format(
            len(train.inputs[0]),
            len(dev.inputs[0]) if dev else 0))
Example #3
    def parse_commandline_input(self, line, dic):
        attr_delim = self.attr_delim if self.attr_delim else constants.SL_ATTR_DELIM
        get_unigram_id = dic.tables[constants.UNIGRAM].get_id
        if constants.ATTR_LABEL(1) in dic.tables:
            use_attr1 = True
            get_attr1_id = dic.tables[constants.ATTR_LABEL(1)].get_id
        else:
            use_attr1 = False
            get_attr1_id = None

        org_arr = line.split(' ')
        if use_attr1:
            org_attr1_seq = [
                self.preprocess_attribute(
                    elem.split(attr_delim)[1] if attr_delim in elem else constants.UNK_SYMBOL,
                    0, #self.attr_depths[0], 
                    None, #self.attr_target_labelsets[0]
                ) for elem in org_arr]
            org_attr1_seqs = [org_attr1_seq]
            attr1_seq = [get_attr1_id(attr) for attr in org_attr1_seq]
            attr1_seqs = [attr1_seq]
        else:
            org_attr1_seqs = []
            attr1_seqs = []

        org_token_seq = [elem.split(attr_delim)[0] for elem in org_arr]
        org_token_seqs = [org_token_seq]
        ptoken_seq = [self.preprocess_token(word) for word in org_token_seq]
        uni_seq = [get_unigram_id(word) for word in ptoken_seq]
        uni_seqs = [uni_seq]

        inputs = [uni_seqs, None, attr1_seqs] # TODO fix
        outputs = []
        orgdata = [org_token_seqs, org_attr1_seqs]
        return RestorableData(inputs, outputs, orgdata=orgdata)
Example #4
    def load_dic(self, dic_path):
        with open(dic_path, 'rb') as f:
            self.dic = pickle.load(f)
        self.log('Load dic: {}'.format(dic_path))
        self.log('Num of tokens: {}'.format(
            len(self.dic.tables[constants.UNIGRAM])))
        if self.dic.has_table(constants.BIGRAM):
            self.log('Num of bigrams: {}'.format(
                len(self.dic.tables[constants.BIGRAM])))
        if self.dic.has_trie(constants.CHUNK):
            self.log('Num of chunks: {}'.format(
                len(self.dic.tries[constants.CHUNK])))
        if self.dic.has_table(constants.SEG_LABEL):
            self.log('Num of segmentation labels: {}'.format(
                len(self.dic.tables[constants.SEG_LABEL])))
        for i in range(3):  # tmp
            if self.dic.has_table(constants.ATTR_LABEL(i)):
                self.log('Num of {}-th attribute labels: {}'.format(
                    i, len(self.dic.tables[constants.ATTR_LABEL(i)])))
        if self.dic.has_table(constants.ARC_LABEL):
            self.log('Num of arc labels: {}'.format(
                len(self.dic.tables[constants.ARC_LABEL])))
        if self.dic.has_table(constants_sematt.SEM_LABEL):
            self.log('Num of sem labels: {}'.format(
                len(self.dic.tables[constants_sematt.SEM_LABEL])))
        self.log('')
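load_dic expects a pickled Dictionary object on disk. A minimal counterpart for producing such a file might look like the sketch below; save_dic and the path are assumptions, only the pickle format and the '.s2i' extension (see load_model further down) come from the source.

import pickle

def save_dic(dic, dic_path):
    # Serialize a Dictionary object so that load_dic can restore it later.
    with open(dic_path, 'wb') as f:
        pickle.dump(dic, f)

# save_dic(dic, 'models/main/example.s2i')  # illustrative path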
Example #5
    def load_decode_data_SL(self, path, dic):
        attr_delim = self.attr_delim if self.attr_delim else constants.SL_ATTR_DELIM
        get_unigram_id = dic.tables[constants.UNIGRAM].get_id
        if constants.ATTR_LABEL(1) in dic.tables:
            use_attr1 = True
            get_attr1_id = dic.tables[constants.ATTR_LABEL(1)].get_id
        else:
            use_attr1 = False
            get_attr1_id = None

        org_token_seqs = []
        org_attr1_seqs = []
        token_seqs = []
        attr1_seqs = []

        ins_cnt = 0
        with open(path) as f:
            for line in f:
                line = self.normalize_input_line(line)
                if len(line) <= 1:
                    continue
     
                elif line[0] == constants.COMMENT_SYM:
                    continue
                
                org_arr = line.split(constants.SL_TOKEN_DELIM)
                if use_attr1:
                    org_attr1_seq = [
                        self.preprocess_attribute(
                            elem.split(attr_delim)[1],
                            self.attr_depths[0], 
                            self.attr_target_labelsets[0])
                        for elem in org_arr]
                    org_attr1_seqs.append(org_attr1_seq)
                    attr1_seq = [get_attr1_id(attr) for attr in org_attr1_seq]
                    attr1_seqs.append(attr1_seq)

                org_token_seq = [elem.split(attr_delim)[0] for elem in org_arr]
                org_token_seqs.append(org_token_seq)
                ptoken_seq = [self.preprocess_token(token) for token in org_token_seq]
                token_seq = [get_unigram_id(ptoken, update=ptoken in self.unigram_vocab) for ptoken in ptoken_seq]
                token_seqs.append(token_seq)
                    
                ins_cnt += 1
                if ins_cnt % constants.NUM_FOR_REPORTING == 0:
                    print('Read', ins_cnt, 'sentences', file=sys.stderr)
                    
        inputs = [token_seqs]
        inputs.append(None) # bigram
        inputs.append(attr1_seqs if attr1_seqs else None)
        outputs = []
        orgdata = [org_token_seqs, org_attr1_seqs]

        return RestorableData(inputs, outputs, orgdata=orgdata)
Example #6
    def setup_classifier(self):
        dic = self.dic
        hparams = self.hparams

        n_vocab = len(dic.tables['unigram'])
        unigram_embed_dim = hparams['unigram_embed_dim']

        if 'pretrained_unigram_embed_dim' in hparams and hparams['pretrained_unigram_embed_dim'] > 0:
            pretrained_unigram_embed_dim = hparams['pretrained_unigram_embed_dim']
        else:
            pretrained_unigram_embed_dim = 0

        if 'pretrained_embed_usage' in hparams:
            pretrained_embed_usage = models.util.ModelUsage.get_instance(hparams['pretrained_embed_usage'])
        else:
            pretrained_embed_usage = models.util.ModelUsage.NONE

        n_attr1 = len(dic.tables[constants.ATTR_LABEL(0)]) if (
            hparams['attr1_embed_dim'] > 0 and constants.ATTR_LABEL(0) in dic.tables) else 0
        n_labels = len(dic.tables[constants.ARC_LABEL]) if common.is_typed_parsing_task(self.task) else 0
        attr1_embed_dim = hparams['attr1_embed_dim'] if n_attr1 > 0 else 0

        if (pretrained_embed_usage == models.util.ModelUsage.ADD or
            pretrained_embed_usage == models.util.ModelUsage.INIT):
            if pretrained_unigram_embed_dim > 0 and pretrained_unigram_embed_dim != unigram_embed_dim:
                print('Error: pre-trained and randomly initialized unigram embedding vectors '
                      + 'must have the same dimension for {} operation'.format(hparams['pretrained_embed_usage'])
                      + ': d1={}, d2={}'.format(pretrained_unigram_embed_dim, unigram_embed_dim),
                      file=sys.stderr)
                sys.exit()

        predictor = models.parser.RNNBiaffineParser(
            n_vocab, unigram_embed_dim, n_attr1, attr1_embed_dim,
            hparams['rnn_unit_type'], hparams['rnn_bidirection'], hparams['rnn_n_layers'], 
            hparams['rnn_n_units'], 
            hparams['mlp4arcrep_n_layers'], hparams['mlp4arcrep_n_units'],
            hparams['mlp4labelrep_n_layers'], hparams['mlp4labelrep_n_units'],
            mlp4labelpred_n_layers=hparams['mlp4labelpred_n_layers'], 
            mlp4labelpred_n_units=hparams['mlp4labelpred_n_units'],
            n_labels=n_labels, rnn_dropout=hparams['rnn_dropout'], 
            hidden_mlp_dropout=hparams['hidden_mlp_dropout'], 
            pred_layers_dropout=hparams['pred_layers_dropout'],
            pretrained_unigram_embed_dim=pretrained_unigram_embed_dim,
            pretrained_embed_usage=pretrained_embed_usage)

        self.classifier = classifiers.dependency_parser.DependencyParser(predictor)
Example #7
    def load_decode_data_SL(self, path, dic):
        attr_delim = self.attr_delim if self.attr_delim else constants.SL_ATTR_DELIM
        num_attrs = len(self.attr_indexes)
        word_clm = self.token_index

        get_unigram_id = dic.tables[constants.UNIGRAM].get_id
        get_attr_id = dic.tables[constants.ATTR_LABEL(0)].get_id if num_attrs > 0 else None
        root_token = constants.ROOT_SYMBOL

        org_token_seqs = []
        org_attr_seqs = []      # second or later attribute is ignored
        token_seqs = []
        attr_seqs = []

        ins_cnt = 0
        with open(path) as f:
            for line in f:
                line = self.normalize_input_line(line)
                if len(line) <= 1:
                    continue
     
                elif line[0] == constants.COMMENT_SYM:
                    continue
                
                org_arr = line.split(constants.SL_TOKEN_DELIM)

                org_token_seq = [elem.split(attr_delim)[word_clm] for elem in org_arr]
                org_token_seq.insert(0, root_token)
                org_token_seqs.append(org_token_seq)
                ptoken_seq = [self.preprocess_token(token) for token in org_token_seq]
                token_seq = [get_unigram_id(ptoken, update=ptoken in self.unigram_vocab) for ptoken in ptoken_seq]
                token_seqs.append(token_seq)
 
                if num_attrs > 0:
                    org_token_seq = [elem.split(attr_delim)[0] for elem in org_arr]
                    org_attr_seq = [
                        self.preprocess_attribute(
                            elem.split(attr_delim)[self.attr_indexes[0]], 
                            self.attr_depths[0], self.attr_target_labelsets[0])
                        for elem in org_arr]
                    org_attr_seq.insert(0, root_token)
                    org_attr_seqs.append(org_attr_seq)
                    attr_seq = [get_attr_id(attr) for attr in org_attr_seq]
                    attr_seqs.append(attr_seq)

                ins_cnt += 1
                if ins_cnt % constants.NUM_FOR_REPORTING == 0:
                    print('Read', ins_cnt, 'sentences', file=sys.stderr)
     
        inputs = [token_seqs, None]
        outputs = []
        outputs.append(attr_seqs if num_attrs > 0 else None)
        orgdata = [org_token_seqs]
        orgdata.append(org_attr_seqs if num_attrs > 0 else None)

        return RestorableData(inputs, outputs, orgdata=orgdata)
Example #8
    def run_epoch(self, data, train=True):
        classifier = self.classifier
        evaluator = self.evaluator

        inputs = self.gen_inputs(data)
        xs = inputs[0]
        n_sen = len(xs)

        golds = inputs[self.label_begin_index]
        if train:
            self.classifier.train(*inputs)
        ret = self.classifier.decode(*inputs[:self.label_begin_index])
        counts = self.evaluator.calculate(*[xs], *[golds], *[ret])

        if train:
            self.log('\n<training result>')
            res = evaluator.report_results(n_sen, counts, file=self.logger)
            self.report('train\t%s' % res)

            if self.args.devel_data:
                self.log('\n<development result>')
                v_res = self.run_epoch(self.dev, train=False)
                self.report('devel\t%s' % v_res)

            # save model
            if not self.args.quiet:
                mdl_path = '{}/{}.pkl'.format(constants.MODEL_DIR,
                                              self.start_time)
                with open(mdl_path, 'wb') as f:
                    pickle.dump(self.classifier, f)

                mdl_path_txt = '{}/{}.txt'.format(constants.MODEL_DIR,
                                                  self.start_time)
                self.classifier.predictor.dump_model_as_txt(
                    mdl_path_txt,
                    self.dic.tables[constants_sematt.SEM_LABEL].id2str,
                    self.dic.tables[constants.UNIGRAM].id2str,
                    self.dic.tables[constants.ATTR_LABEL(0)].id2str)

                self.log('Save the model (binary): %s' % mdl_path)
                self.log('Save the model (text): %s' % mdl_path_txt)
                self.report('[INFO] Save the model (binary): %s\n' % mdl_path)
                self.report('[INFO] Save the model (text1): %s\n' %
                            mdl_path_txt)

            if not self.args.quiet:
                self.reporter.close()
                self.reporter = open(
                    '{}/{}.log'.format(constants.LOG_DIR, self.start_time),
                    'a')

        res = None if train else evaluator.report_results(
            n_sen, counts, file=self.logger)
        return res
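run_epoch already takes care of development-set evaluation and model checkpointing when train=True, so an outer driver only needs to feed it the training data once per epoch. The loop below is a hypothetical sketch; trainer and n_epochs are assumed names.

for epoch in range(1, n_epochs + 1):  # n_epochs is an assumed setting
    trainer.log('=== Epoch {} ==='.format(epoch))
    trainer.run_epoch(trainer.train, train=True)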
Example #9
    def grow_embedding_layers(self, dic_grown, external_model=None, train=True):
        id2unigram_grown = dic_grown.tables[constants.UNIGRAM].id2str
        n_vocab_org = self.predictor.unigram_embed.W.shape[0]
        n_vocab_grown = len(id2unigram_grown)
        if (self.predictor.pretrained_embed_usage == models.util.ModelUsage.ADD or
            self.predictor.pretrained_embed_usage == models.util.ModelUsage.CONCAT):
            pretrained_unigram_embed = self.predictor.pretrained_unigram_embed
        else:
            pretrained_unigram_embed = None
        models.util.grow_embedding_layers(
            n_vocab_org, n_vocab_grown, self.predictor.unigram_embed, 
            pretrained_unigram_embed, external_model, id2unigram_grown,
            self.predictor.pretrained_embed_usage, train=train)

        if constants.ATTR_LABEL(0) in dic_grown.tables: # POS
            id2pos_grown = dic_grown.tables[constants.ATTR_LABEL(0)].id2str
            n_pos_org = self.predictor.pos_embed.W.shape[0]
            n_pos_grown = len(id2pos_grown)
            models.util.grow_embedding_layers(
                n_pos_org, n_pos_grown, self.predictor.pos_embed, train=train)
Example #10
    def load_model(self):
        model_path = self.args.model_path
        if model_path.endswith('.pkl'):
            model_format = 'pkl'
            array = model_path.split('.pkl')
        elif model_path.endswith('.txt'):
            model_format = 'txt'
            array = model_path.split('.txt')
        else:
            print(
                'Error: invalid model format. The file name must end with \'.pkl\' or \'.txt\'.',
                file=sys.stderr)
            sys.exit()

        dic_path = '{}.s2i'.format(array[0])
        hparam_path = '{}.hyp'.format(array[0])
        param_path = model_path

        # dictionary
        self.load_dic(dic_path)

        # hyper parameters
        self.load_hyperparameters(hparam_path)
        self.log('Load hyperparameters: {}\n'.format(hparam_path))
        self.show_hyperparameters()

        # model
        if model_format == 'pkl':
            with open(model_path, 'rb') as f:
                self.classifier = pickle.load(f)

        elif model_format == 'txt':
            predictor = models.attribute_annotator.load_model_from_txt(
                model_path, self.dic.tables[constants_sematt.SEM_LABEL].str2id,
                self.dic.tables[constants.UNIGRAM].str2id,
                (self.dic.tables[constants.ATTR_LABEL(0)].str2id
                 if self.dic.has_table(constants.ATTR_LABEL(0)) else None))
            self.classifier = classifiers.pattern_matcher.PatternMatcher(
                predictor)

        self.log('Load model: {}\n'.format(model_path))
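The method relies on a simple naming convention: the dictionary and hyperparameter files live next to the model file and share its prefix. A small sketch of that convention (the concrete path is illustrative):

model_path = 'models/main/20190101_1200.pkl'   # illustrative
prefix = model_path[:-len('.pkl')]
dic_path = '{}.s2i'.format(prefix)             # dictionary (see load_dic)
hparam_path = '{}.hyp'.format(prefix)          # hyperparameters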
Example #11
    def decode_batch(self, *inputs, org_tokens=None, org_attrs=None, file=sys.stdout):
        ys = self.classifier.decode(*inputs)
        id2label = (self.dic.tables[constants.SEG_LABEL if common.is_segmentation_task(self.task)
                                    else constants.ATTR_LABEL(0)].id2str)

        # for i in range(len(inputs[0])):
        #     print(len(inputs[0][i]), inputs[0][i])
        #     print(len(org_tokens[i]), org_tokens[i])
        #     print(len(ys[i]), ys[i])
        #     print()

        if not org_attrs:
            org_attrs = [None] * len(org_tokens)


        for x_str, a_str, y in zip(org_tokens, org_attrs, ys):
            y_str = [id2label[int(yi)] for yi in y]
            y_str = self.convert_to_valid_BIES_seq(y_str)

            if self.task == constants.TASK_TAG:
                if a_str:
                    res = ['{}{}{}{}{}'.format(xi_str, self.args.output_attr_delim,
                                                 ai_str, self.args.output_attr_delim,
                                                 yi_str)
                           for xi_str, ai_str, yi_str in zip(x_str, a_str, y_str)]
                else:
                    res = ['{}{}{}'.format(xi_str, self.args.output_attr_delim, yi_str)
                           for xi_str, yi_str in zip(x_str, y_str)]
                
                if self.args.output_data_format == 'wl':
                    res.append('')
                res = self.args.output_token_delim.join(res)


            elif self.task == constants.TASK_SEG:
                res = ['{}{}'.format(xi_str, self.args.output_token_delim 
                                     if (yi_str.startswith('E') or yi_str.startswith('S')) 
                                     else '') for xi_str, yi_str in zip(x_str, y_str)]
                res = ''.join(res).rstrip(' ')

            elif self.task == constants.TASK_SEGTAG:
                res = ['{}{}'.format(
                    xi_str, 
                    (self.args.output_attr_delim+yi_str[2:]+self.args.output_token_delim) 
                    if (yi_str.startswith('E-') or yi_str.startswith('S-')) else ''
                ) for xi_str, yi_str in zip(x_str, y_str)]
                res = ''.join(res).rstrip(' ')

            else:
                print('Error: Invalid decode type', file=self.logger)
                sys.exit()

            print(res, file=file)
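For the segmentation task, the loop above inserts a token delimiter after every character whose predicted label starts with 'E' or 'S'. A standalone sketch of that conversion, with illustrative characters and labels:

def bies_to_segments(chars, labels, delim=' '):
    out = []
    for c, y in zip(chars, labels):
        out.append(c)
        if y.startswith('E') or y.startswith('S'):
            out.append(delim)  # close the current token
    return ''.join(out).rstrip(delim)

# bies_to_segments(list('abcd'), ['B', 'E', 'S', 'S']) -> 'ab c d'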
Example #12
def init_dictionary(num_attrs=0):
    dic = dictionary.Dictionary()

    # unigram
    dic.create_table(constants.UNIGRAM)
    dic.tables[constants.UNIGRAM].set_unk(constants.UNK_SYMBOL)

    # attributes
    for i in range(num_attrs):
        dic.create_table(constants.ATTR_LABEL(i))
        # dic.tables[constants.ATTR_LABEL(i)].set_unk(constants.UNK_SYMBOL)

    return dic
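A minimal usage sketch for init_dictionary above; the example tokens are illustrative. Tokens are registered by calling get_id with update=True, while the UNK symbol set on the unigram table is intended to cover anything left unregistered at decode time.

dic = init_dictionary(num_attrs=1)
get_unigram_id = dic.tables[constants.UNIGRAM].get_id
uni_seq = [get_unigram_id(w, update=True) for w in ['a', 'short', 'sentence']]
print(len(dic.tables[constants.UNIGRAM]))  # current vocabulary size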
Example #13
    def grow_inference_layers(self, dic_grown):
        n_labels_org = self.predictor.mlp.layers[-1].W.shape[0]
        if common.is_segmentation_task(self.task):
            n_labels_grown = len(dic_grown.tables[constants.SEG_LABEL].id2str)
        else:
            n_labels_grown = len(
                dic_grown.tables[constants.ATTR_LABEL(0)].id2str)

        models.util.grow_MLP(n_labels_org, n_labels_grown,
                             self.predictor.mlp.layers[-1])
        if self.predictor.use_crf:
            models.util.grow_crf_layer(n_labels_org, n_labels_grown,
                                       self.predictor.crf)
Example #14
    def parse_commandline_input(self, line, dic):
        attr_delim = self.attr_delim if self.attr_delim else constants.SL_ATTR_DELIM
        num_attrs = len(self.attr_indexes)

        get_unigram_id = dic.tables[constants.UNIGRAM].get_id
        if constants.ATTR_LABEL(0) in dic.tables:
            use_attr0 = True
            get_attr0_id = dic.tables[constants.ATTR_LABEL(0)].get_id
        else:
            use_attr0 = False
            get_attr0_id = None

        org_arr = line.split(' ')
        if use_attr0:
            attr0_seq = [
                elem.split(attr_delim)[self.attr_indexes[0]] 
                if attr_delim in elem else ''
                for elem in org_arr]
            org_attr0_seq = [
                self.preprocess_attribute(attr, self.attr_depths[0], self.attr_target_labelsets[0]) 
                for attr in attr0_seq]
            org_attr0_seqs = [org_attr0_seq]
            attr0_seq = [get_attr0_id(attr) for attr in org_attr0_seq]
            attr0_seqs = [attr0_seq]
        else:
            org_attr0_seqs = []
            attr0_seqs = []

        org_token_seq = [elem.split(attr_delim)[0] for elem in org_arr]
        org_token_seqs = [org_token_seq]
        ptoken_seq = [self.preprocess_token(word) for word in org_token_seq]
        uni_seq = [get_unigram_id(word) for word in ptoken_seq]
        uni_seqs = [uni_seq]

        inputs = [uni_seqs]
        outputs = [attr0_seqs]
        orgdata = [org_token_seqs, org_attr0_seqs]

        return RestorableData(inputs, outputs, orgdata=orgdata)
Example #15
    def setup_evaluator(self, evaluator=None):
        if self.task == constants.TASK_TAG and self.args.ignored_labels: # TODO fix
            ignored_labels = set()
            for label in self.args.ignored_labels.split(','):
                label_id = self.dic.tables[constants.ATTR_LABEL(0)].get_id(label)
                if label_id >= 0:
                    ignored_labels.add(label_id)

            self.log('Setup evaluator: labels to be ignored={}\n'.format(ignored_labels))

        else:
            ignored_labels = set()

        # TODO reflect ignored_labels
        evaluator1 = None
        if self.task == constants.TASK_SEG:
            if self.args.evaluation_method == 'normal':
                evaluator1 = FMeasureEvaluator(self.dic.tables[constants.SEG_LABEL].id2str)
            elif self.args.evaluation_method == 'each_length':
                evaluator1 = FMeasureEvaluatorForEachLength(self.dic.tables[constants.SEG_LABEL].id2str)
            elif self.args.evaluation_method == 'each_vocab':
                vocabs = self.gen_vocabs()
                evaluator1 = FMeasureEvaluatorForEachVocab(self.dic.tables[constants.SEG_LABEL].id2str, vocabs)

        elif self.task == constants.TASK_SEGTAG:
            evaluator1 = DoubleFMeasureEvaluator(self.dic.tables[constants.SEG_LABEL].id2str)

        elif self.task == constants.TASK_TAG:
            if common.use_fmeasure(self.dic.tables[constants.ATTR_LABEL(0)].str2id):
                evaluator1 = FMeasureEvaluator(self.dic.tables[constants.ATTR_LABEL(0)].id2str)
            else:
                evaluator1 = AccuracyEvaluator(self.dic.tables[constants.ATTR_LABEL(0)].id2str)
                evaluator1.calculator.id2token = self.dic.tables[constants.UNIGRAM].id2str # tmp

        if not evaluator:
            self.evaluator = evaluator1
        else:
            evaluator = evaluator1
Example #16
    def setup_evaluator(self, evaluator=None):
        ignored_labels = set()
        if self.args.ignored_labels:
            for label in self.args.ignored_labels.split(','):
                label_id = self.dic.tables[constants.ATTR_LABEL(0)].get_id(
                    label)
                if label_id >= 0:
                    ignored_labels.add(label_id)
            self.args.ignored_labels = ignored_labels

        self.log('Setup evaluator: labels to be ignored={}\n'.format(
            ignored_labels))
        self.evaluator = AccuracyEvaluator(ignore_head=False,
                                           ignored_labels=ignored_labels)
Example #17
def init_dictionary(
        num_attrs=0,
        use_arc_label=False):

    dic = dictionary.Dictionary()

    # unigram
    dic.create_table(constants.UNIGRAM)
    dic.tables[constants.UNIGRAM].set_unk(constants.UNK_SYMBOL)
    dic.tables[constants.UNIGRAM].get_id(constants.ROOT_SYMBOL, update=True)

    # attributes
    for i in range(num_attrs):
        dic.create_table(constants.ATTR_LABEL(i))
        # dic.tables[constants.ATTR_LABEL(i)].set_unk(constants.UNK_SYMBOL)

    # arc label
    if use_arc_label:
        dic.create_table(constants.ARC_LABEL)

    return dic
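A minimal usage sketch for the parsing-oriented variant above; the attribute count and arc-label strings are illustrative. Unlike the simpler variant, this one pre-registers ROOT_SYMBOL in the unigram table and optionally creates an arc-label table.

dic = init_dictionary(num_attrs=1, use_arc_label=True)
get_arc_id = dic.tables[constants.ARC_LABEL].get_id
arc_ids = [get_arc_id(arc, update=True) for arc in ['nsubj', 'obj']]  # illustrative labels
print(len(dic.tables[constants.UNIGRAM]))  # ROOT_SYMBOL is already registered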
Example #18
    def load_gold_data_WL(self, path, dic, train=True):
        attr_delim = self.attr_delim if self.attr_delim else constants.WL_ATTR_DELIM
        num_attrs = len(self.attr_indexes)
        word_clm = self.token_index

        if not dic:
            dic = init_dictionary(num_attrs=num_attrs)

        get_unigram_id = dic.tables[constants.UNIGRAM].get_id
        get_ith_attr_id = []
        for i in range(num_attrs): 
            get_ith_attr_id.append(dic.tables[constants.ATTR_LABEL(i)].get_id)

        token_seqs = []
        attr_seqs_list = [[] for i in range(num_attrs)]

        ins_cnt = 0
        with open(path) as f:
            uni_seq = [] 
            attr_seq_list = [[] for i in range(num_attrs)]
     
            for line in f:
                line = self.normalize_input_line(line)
                if len(line) == 0:
                    if len(uni_seq) > 0:
                        token_seqs.append(uni_seq)
                        uni_seq = []
                        for i, attr_seq in enumerate(attr_seq_list):
                            if self.attr_chunking_flags[i]:
                                attr_seq = [get_ith_attr_id[i](attr, update=train) for attr in 
                                            data_loader.get_labelseq_BIOES(attr_seq)]
                            attr_seqs_list[i].append(attr_seq)
                            attr_seq_list = [[] for i in range(num_attrs)]

                        ins_cnt += 1
                        if ins_cnt % constants.NUM_FOR_REPORTING == 0:
                            print('Read', ins_cnt, 'sentences', file=sys.stderr)

                    continue

                elif line[0] == constants.COMMENT_SYM:
                    continue

                array = line.split(attr_delim)
                token = self.preprocess_token(array[word_clm])
                tlen = len(token)
                attrs = [None] * max(num_attrs, 1)
     
                for i in range(num_attrs):
                    attr = array[self.attr_indexes[i]] if len(array) > self.attr_indexes[i] else ''
                    attrs[i] = self.preprocess_attribute(
                        attr, self.attr_depths[i], self.attr_target_labelsets[i])

                update_token = self.to_be_registered(token, train, self.freq_tokens, self.unigram_vocab)
                uni_seq.append(get_unigram_id(token, update=update_token))

                for i in range(num_attrs):
                    attr = attrs[i]
                    attr_tmp = attr if self.attr_chunking_flags[i] else get_ith_attr_id[i](attr, update=train)
                    attr_seq_list[i].append(attr_tmp)

            # register last sentence
            if len(uni_seq) > 0:
                token_seqs.append(uni_seq)
                for i, attr_seq in enumerate(attr_seq_list):
                    if self.attr_chunking_flags[i]:
                        attr_seq = [get_ith_attr_id[i](attr, update=train) for attr in 
                                    data_loader.get_labelseq_BIOES(attr_seq)]
                    attr_seqs_list[i].append(attr_seq)

        inputs = [token_seqs]
        inputs.append(None) # bigram
        inputs.append(attr_seqs_list[1] if len(attr_seqs_list) > 1 else None)
     
        outputs = []
        if len(attr_seqs_list) > 0:
            outputs.append(attr_seqs_list[0])
            
        return Data(inputs, outputs), dic
Example #19
    def load_decode_data_WL(self, path, dic):
        attr_delim = self.attr_delim if self.attr_delim else constants.WL_ATTR_DELIM
        num_attrs = len(self.attr_indexes)
        word_clm = self.token_index

        get_unigram_id = dic.tables[constants.UNIGRAM].get_id
        get_attr_id = dic.tables[constants.ATTR_LABEL(0)].get_id if num_attrs > 0 else None
        root_token = constants.ROOT_SYMBOL

        org_token_seqs = []
        org_attr_seqs = []
        token_seqs = []
        attr_seqs = []

        ins_cnt = 0

        with open(path) as f:
            org_token_seq = [root_token]
            org_attr_seq = [root_token]
            token_seq = [get_unigram_id(root_token)]
            attr_seq_list = []

            for line in f:
                line = self.normalize_input_line(line)
                if len(line) == 0:
                    if len(token_seq) > 0:
                        org_token_seqs.append(org_token_seq)
                        org_token_seq = [root_token]

                        token_seqs.append(token_seq)
                        token_seq = [get_unigram_id(root_token)]

                        if num_attrs > 0:
                            if self.attr_chunking_flags[0]:
                                org_attr_seq = [attr for attr in data_loader.get_labelseq_BIOES(attr_seq)]
                            org_attr_seqs.append(org_attr_seq)

                            attr_seq = [get_attr_id(attr) for attr in org_attr_seq]
                            attr_seqs.append(attr_seq)
                            
                            org_attr_seq = [root_token]
                            attr_seq = [get_attr_id(root_token)]

                        ins_cnt += 1
                        if ins_cnt % constants.NUM_FOR_REPORTING == 0:
                            print('Read', ins_cnt, 'sentences', file=sys.stderr)

                    continue

                elif line[0] == constants.COMMENT_SYM:
                    continue

                array = line.split(attr_delim)
                org_token = array[word_clm]
                org_token_seq.append(org_token)

                ptoken = self.preprocess_token(org_token)
                token_seq.append(get_unigram_id(ptoken, update=ptoken in self.unigram_vocab))

                attrs = [None] * max(num_attrs, 1)
                if num_attrs > 0:
                    attr = self.preprocess_attribute(
                        array[self.attr_indexes[0]], self.attr_depths[0], self.attr_target_labelsets[0])
                    org_attr_seq.append(attr)

            # register last sentence
            if len(token_seq) > 1: # initialized sequence contains ROOT 
                org_token_seqs.append(org_token_seq)
                token_seqs.append(token_seq)

                if num_attrs > 0:
                    if self.attr_chunking_flags[0]:
                        org_attr_seq = [attr for attr in data_loader.get_labelseq_BIOES(attr_seq)]
                    org_attr_seqs.append(org_attr_seq)

                    attr_seq = [get_attr_id(attr) for attr in org_attr_seq]
                    attr_seqs.append(attr_seq)

        inputs = [token_seqs, None]
        outputs = []
        outputs.append(attr_seqs if num_attrs > 0 else None)
        orgdata = [org_token_seqs]
        orgdata.append(org_attr_seqs if num_attrs > 0 else None)

        return RestorableData(inputs, outputs, orgdata=orgdata)
Example #20
    def load_gold_data_WL(self, path, dic, train=True):
        attr_delim = self.attr_delim if self.attr_delim else constants.WL_ATTR_DELIM
        num_attrs = len(self.attr_indexes)
        word_clm = self.token_index
        head_clm = self.head_index
        arc_clm = self.arc_index

        if not dic:
            dic = init_dictionary(
                num_attrs=num_attrs,
                use_arc_label=self.use_arc_label)

        get_unigram_id = dic.tables[constants.UNIGRAM].get_id
        get_arc_id = dic.tables[constants.ARC_LABEL].get_id if self.use_arc_label else None
        get_ith_attr_id = []
        for i in range(num_attrs): 
            get_ith_attr_id.append(dic.tables[constants.ATTR_LABEL(i)].get_id)

        token_seqs = []
        head_seqs = []          # list of head id sequences
        arc_seqs = []           # list of arc label sequences
        attr_seqs_list = [[] for i in range(num_attrs)]

        ins_cnt = 0
        sen_len_th = 3          # ROOT + more than two tokens

        with open(path) as f:
            uni_seq = [get_unigram_id(constants.ROOT_SYMBOL)] 
            head_seq = [constants.NO_PARENTS_ID]
            arc_seq = [constants.NO_PARENTS_ID] if self.use_arc_label else None
            attr_seq_list = [[get_ith_attr_id[i](constants.ROOT_SYMBOL, update=train)] for i in range(num_attrs)]
     
            for line in f:
                line = self.normalize_input_line(line)
                if len(line) == 0:
                    if len(uni_seq) >= sen_len_th: 
                        token_seqs.append(uni_seq)
                        uni_seq = [get_unigram_id(constants.ROOT_SYMBOL)]

                        head_seqs.append(head_seq)
                        head_seq = [constants.NO_PARENTS_ID]
                        
                        if self.use_arc_label:
                            arc_seqs.append(arc_seq)
                            arc_seq = [constants.NO_PARENTS_ID]

                        for i, attr_seq in enumerate(attr_seq_list):
                            if self.attr_chunking_flags[i]:
                                # TODO fix code for ROOT
                                attr_seq = [get_ith_attr_id[i](attr, update=train) for attr in 
                                            data_loader.get_labelseq_BIOES(attr_seq)]
                            attr_seqs_list[i].append(attr_seq)
                        attr_seq_list = [[get_ith_attr_id[i](constants.ROOT_SYMBOL)] for i in range(num_attrs)]

                        ins_cnt += 1
                        if ins_cnt % constants.NUM_FOR_REPORTING == 0:
                            print('Read', ins_cnt, 'sentences', file=sys.stderr)

                    continue

                elif line[0] == constants.COMMENT_SYM:
                    continue

                array = line.split(attr_delim)
                token = self.preprocess_token(array[word_clm])
                attrs = [None] * max(num_attrs, 1)
     
                for i in range(num_attrs):
                    org_attr = array[self.attr_indexes[i]] if self.attr_indexes[i] < len(array) else constants.UNK_SYMBOL
                    attrs[i] = self.preprocess_attribute(
                        org_attr, self.attr_depths[i], self.attr_target_labelsets[i])
                    attr_tmp = attrs[i] if self.attr_chunking_flags[i] else get_ith_attr_id[i](
                        attrs[i], update=train)
                    attr_seq_list[i].append(attr_tmp)

                update_token = self.to_be_registered(token, train, self.freq_tokens, self.unigram_vocab)
                uni_seq.append(get_unigram_id(token, update=update_token))

                head = int(array[head_clm])
                if head < 0:
                    head = 0
                head_seq.append(head)
     
                if self.use_arc_label:
                    arc = array[arc_clm]
                    arc_seq.append(get_arc_id(arc, update=train))

            # register last sentence
            if len(uni_seq) >= sen_len_th: 
                # org_token_seqs.append(org_token_seq)
                token_seqs.append(uni_seq)
                head_seqs.append(head_seq)
                arc_seqs.append(arc_seq)
                for i, attr_seq in enumerate(attr_seq_list):
                    if self.attr_chunking_flags[i]:
                        attr_seq = [get_ith_attr_id[i](attr, update=train) for attr in 
                                    data_loader.get_labelseq_BIOES(attr_seq)]
                    attr_seqs_list[i].append(attr_seq)

        inputs = [token_seqs]
        inputs.append(attr_seqs_list[1] if len(attr_seqs_list) > 1 else None)
        print(len(inputs[0]))
     
        outputs = []
        outputs.append(attr_seqs_list[0] if len(attr_seqs_list) > 0 else None)
        outputs.append(head_seqs)
        if self.use_arc_label:
            outputs.append(arc_seqs)

        return Data(inputs, outputs), dic
Example #21
    def setup_classifier(self):
        dic = self.dic
        hparams = self.hparams

        n_vocab = len(dic.tables['unigram'])
        unigram_embed_dim = hparams['unigram_embed_dim']
        
        if 'bigram_embed_dim' in hparams and hparams['bigram_embed_dim'] > 0:
            bigram_embed_dim = hparams['bigram_embed_dim']
            n_bigrams = len(dic.tables[constants.BIGRAM])
        else:
            bigram_embed_dim = n_bigrams = 0

        if 'pretrained_unigram_embed_dim' in hparams and hparams['pretrained_unigram_embed_dim'] > 0:
            pretrained_unigram_embed_dim = hparams['pretrained_unigram_embed_dim']
        else:
            pretrained_unigram_embed_dim = 0

        if 'pretrained_bigram_embed_dim' in hparams and hparams['pretrained_bigram_embed_dim'] > 0:
            pretrained_bigram_embed_dim = hparams['pretrained_bigram_embed_dim']
        else:
            pretrained_bigram_embed_dim = 0

        if 'pretrained_embed_usage' in hparams:
            pretrained_embed_usage = models.util.ModelUsage.get_instance(hparams['pretrained_embed_usage'])
        else:
            pretrained_embed_usage = models.util.ModelUsage.NONE

        if common.is_segmentation_task(self.task):
            n_label = len(dic.tables[constants.SEG_LABEL])
            n_labels = [n_label]
            attr1_embed_dim = n_attr1 = 0

        else:
            n_labels = []
            for i in range(3): # tmp
                if constants.ATTR_LABEL(i) in dic.tables:
                    n_label = len(dic.tables[constants.ATTR_LABEL(i)])
                    n_labels.append(n_label)
                
            if 'attr1_embed_dim' in hparams and hparams['attr1_embed_dim'] > 0:
                attr1_embed_dim = hparams['attr1_embed_dim']
                n_attr1 = n_labels[1] if len(n_labels) > 1 else 0
            else:
                attr1_embed_dim = n_attr1 = 0

        if (pretrained_embed_usage == models.util.ModelUsage.ADD or
            pretrained_embed_usage == models.util.ModelUsage.INIT):
            if pretrained_unigram_embed_dim > 0 and pretrained_unigram_embed_dim != unigram_embed_dim:
                print('Error: pre-trained and randomly initialized unigram embedding vectors '
                      + 'must have the same dimension for {} operation'.format(hparams['pretrained_embed_usage'])
                      + ': d1={}, d2={}'.format(pretrained_unigram_embed_dim, unigram_embed_dim),
                      file=sys.stderr)
                sys.exit()

            if pretrained_bigram_embed_dim > 0 and pretrained_bigram_embed_dim != bigram_embed_dim:
                print('Error: pre-trained and randomly initialized bigram embedding vectors '
                      + 'must have the same dimension for {} operation'.format(hparams['pretrained_embed_usage'])
                      + ': d1={}, d2={}'.format(pretrained_bigram_embed_dim, bigram_embed_dim),
                      file=sys.stderr)
                sys.exit()

        predictor = models.tagger.construct_RNNTagger(
            n_vocab, unigram_embed_dim, n_bigrams, bigram_embed_dim,
            n_attr1, attr1_embed_dim, 0, 0,
            hparams['rnn_unit_type'], hparams['rnn_bidirection'], 
            hparams['rnn_n_layers'], hparams['rnn_n_units'], 
            hparams['rnn_n_layers2'] if 'rnn_n_layers2' in hparams else 0,
            hparams['rnn_n_units2'] if 'rnn_n_units2' in hparams else 0,
            hparams['mlp_n_layers'], hparams['mlp_n_units'], n_labels[0], 
            use_crf=hparams['inference_layer'] == 'crf',
            feat_dim=hparams['additional_feat_dim'], mlp_n_additional_units=0,
            rnn_dropout=hparams['rnn_dropout'],
            embed_dropout=hparams['embed_dropout'] if 'embed_dropout' in hparams else 0.0,
            mlp_dropout=hparams['mlp_dropout'],
            pretrained_unigram_embed_dim=pretrained_unigram_embed_dim,
            pretrained_bigram_embed_dim=pretrained_bigram_embed_dim,
            pretrained_embed_usage=pretrained_embed_usage)

        self.classifier = classifiers.sequence_tagger.SequenceTagger(predictor, task=self.task)
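A hedged sketch of the hyperparameter dictionary this method reads for a segmentation model; only the key names come from the accesses above, every concrete value is illustrative.

hparams = {
    'unigram_embed_dim': 128,
    'bigram_embed_dim': 0,          # 0 disables bigram embeddings
    'attr1_embed_dim': 0,
    'rnn_unit_type': 'lstm',        # illustrative value
    'rnn_bidirection': True,
    'rnn_n_layers': 2,
    'rnn_n_units': 256,
    'mlp_n_layers': 1,
    'mlp_n_units': 200,
    'inference_layer': 'crf',       # anything other than 'crf' disables the CRF layer
    'additional_feat_dim': 0,
    'rnn_dropout': 0.2,
    'mlp_dropout': 0.2,
}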
Example #22
    def load_gold_data_WL(self, path, dic, train=True):
        attr_delim = self.attr_delim if self.attr_delim else constants.WL_ATTR_DELIM
        num_attrs = len(self.attr_indexes)

        if not dic:
            dic = init_dictionary(num_attrs=num_attrs)

        get_unigram_id = dic.tables[constants.UNIGRAM].get_id
        get_label_id = dic.tables[constants_sematt.SEM_LABEL].get_id
        get_ith_attr_id = []
        for i in range(num_attrs): 
            get_ith_attr_id.append(dic.tables[constants.ATTR_LABEL(i)].get_id)

        token_seqs = []
        label_seqs = []          # list of semantic attribute sequences
        attr_seqs_list = [[] for i in range(num_attrs)]

        ins_cnt = 0
        word_clm = self.token_index
        label_clm = self.label_index

        with open(path) as f:
            uni_seq = [] 
            label_seq = []
            attr_seq_list = [[] for i in range(num_attrs)]
     
            for line in f:
                line = self.normalize_input_line(line)
                if len(line) == 0:
                    if len(uni_seq) > 0:
                        token_seqs.append(uni_seq)
                        uni_seq = []
                        label_seqs.append(label_seq)
                        label_seq = []
                        
                        for i, attr_seq in enumerate(attr_seq_list):
                            if self.attr_chunking_flags[i]:
                                attr_seq = [get_ith_attr_id[i](attr, update=train) for attr in 
                                            data_loader.get_labelseq_BIOES(attr_seq)]
                            attr_seqs_list[i].append(attr_seq)
                            attr_seq_list = [[] for i in range(num_attrs)]

                        ins_cnt += 1
                        if ins_cnt % constants.NUM_FOR_REPORTING == 0:
                            print('Read', ins_cnt, 'sentences', file=sys.stderr)

                    continue

                elif line[0] == constants.COMMENT_SYM:
                    continue

                array = line.split(attr_delim)
                token = self.preprocess_token(array[word_clm])
                tlen = len(token)
                attrs = [None] * max(num_attrs, 1)
     
                if len(array) < 2 + num_attrs:
                    continue

                for i in range(num_attrs):
                    attrs[i] = self.preprocess_attribute(
                        array[self.attr_indexes[i]], self.attr_depths[i], self.attr_target_labelsets[i])
                    attr_tmp = attrs[i] if self.attr_chunking_flags[i] else get_ith_attr_id[i](
                        attrs[i], update=train)
                    attr_seq_list[i].append(attr_tmp)

                update_token = self.to_be_registered(token, train)
                uni_seq.append(get_unigram_id(token, update=update_token))

                label = array[label_clm] if len(array) > label_clm else constants.NONE_SYMBOL
                if label == '':
                    label = constants.NONE_SYMBOL
                if DELIM in label:
                    labels = label.split(DELIM)
                    label = labels[0]
                if COLON in label: # ':' is used as a special character when reading/writing a txt-format model
                    label = label.replace(COLON, COLON_ALT)

                label_seq.append(get_label_id(label, update=train))


            # register last sentence
            if len(uni_seq) > 0:
                token_seqs.append(uni_seq)
                label_seqs.append(label_seq)
                for i, attr_seq in enumerate(attr_seq_list):
                    if self.attr_chunking_flags[i]:
                        attr_seq = [get_ith_attr_id[i](attr, update=train) for attr in 
                                    data_loader.get_labelseq_BIOES(attr_seq)]
                    attr_seqs_list[i].append(attr_seq)

        inputs = [token_seqs]
        inputs.append(attr_seqs_list[0] if len(attr_seqs_list) > 0 else None)
        outputs = [label_seqs]

        return Data(inputs, outputs), dic