Example 1
    def __init__(self,
                 data,
                 opt,
                 mode='train',
                 image_features=None,
                 fixed_answers_entry=None):
        self.opt = opt
        self.data = []
        self.mode = mode
        assert mode in ['train', 'dev', 'test']
        error_samples = []
        for datum in data:
            if len(datum['annotated_question']['word']) == 0:
                error_samples.append(datum['question_id'])
                continue
            if mode != 'test' and len(datum['orign_answers']) == 0:
                error_samples.append(datum['question_id'])
                continue
            self.data.append(datum)
        log.info('Removed {} samples with empty questions or answers: {}'.format(
            len(error_samples), error_samples))
        self.set_dataset()
        # Debug mode is enabled by the presence of a 'DEBUG' option.
        self.debug = 'DEBUG' in self.opt
        self.debug_dataset()
        self.img_features_cache = {}
        self.image_features = image_features
        self.fixed_answers_entry = fixed_answers_entry

        # OCR/OD source names are read from the options, as in Example 3;
        # without these two lines the attributes below would be undefined.
        self.ocr_name_list = self.opt['ocr_name_list'].split(',')
        self.od_name_list = self.opt['od_name_list'].split(',')
        if 'ES_ocr' in self.opt:
            self.ocr_name_list = [self.opt['ES_ocr']] + self.ocr_name_list
            self.es_ocr_len = int(self.opt['ES_ocr_len'])
            self.es_sort_way = self.opt['ES_sort_way']
        log.info('Using OCR from: {}'.format(self.ocr_name_list))
        log.info('Using OD from: {}'.format(self.od_name_list))

        if 'BERT' in self.opt:
            # Choose the tokenizer vocabulary file, then load it once.
            if 'BERT_LARGE' in self.opt:
                log.debug('Using BERT Large model')
                tokenizer_key = 'BERT_large_tokenizer_file'
            else:
                log.debug('Using BERT base model')
                tokenizer_key = 'BERT_tokenizer_file'
            tokenizer_file = os.path.join(self.opt['datadir'],
                                          self.opt[tokenizer_key])
            log.debug('Loading tokenizer from {}'.format(tokenizer_file))
            self.bert_tokenizer = BertTokenizer.from_pretrained(tokenizer_file)
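
A minimal usage sketch for this constructor. The class name, option keys, and file path below are assumptions inferred from the code above, not a real config:

import json
import logging

log = logging.getLogger(__name__)

opt = {
    'datadir': 'data/',
    'ocr_name_list': 'ocr_a,ocr_b',  # hypothetical OCR source names
    'od_name_list': 'od_a',          # hypothetical OD source name
}
with open('data/train_preprocessed.json') as f:  # hypothetical path
    samples = json.load(f)

dataset = TextVQADataset(samples, opt, mode='train')  # hypothetical class name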
Example 2
    def __init__(self, opt, data, use_cuda, vocab, char_vocab, evaluation=False):

        self.data = data
        self.use_cuda = use_cuda 
        self.vocab = vocab
        self.char_vocab = char_vocab
        self.evaluation = evaluation
        self.opt = opt
        # Number of previous answers/questions prepended as dialog context.
        self.prev_ans = self.opt.get('PREV_ANS', 2)
        self.prev_ques = self.opt.get('PREV_QUES', 0)

        self.use_char_cnn = 'CHAR_CNN' in self.opt

        self.bert_tokenizer = None
        if 'BERT' in self.opt:
            # Choose the tokenizer vocabulary file, then load it once.
            if 'BERT_LARGE' in opt:
                print('Using BERT Large model')
                tokenizer_key = 'BERT_large_tokenizer_file'
            else:
                print('Using BERT base model')
                tokenizer_key = 'BERT_tokenizer_file'
            tokenizer_file = os.path.join(opt['datadir'], opt[tokenizer_key])
            print('Loading tokenizer from', tokenizer_file)
            self.bert_tokenizer = BertTokenizer.from_pretrained(tokenizer_file)

        self.answer_span_in_context = 'ANSWER_SPAN_IN_CONTEXT_FEATURE' in self.opt

        # Token budget: 30 (+1) per previous answer and 25 (+1) per question,
        # counting the current question plus any previous ones.
        self.ques_max_len = (30 + 1) * self.prev_ans + (25 + 1) * (self.prev_ques + 1)
        self.char_max_len = 30

        print('*****************')
        print('prev_ques   :', self.prev_ques)
        print('prev_ans    :', self.prev_ans)
        print('ques_max_len:', self.ques_max_len)
        print('*****************')

        # Character-to-index lookup; stored on the instance so later
        # batch-building code can reach it.
        self.c2id = {c: i for i, c in enumerate(char_vocab)}
        
        # Shuffle training data in place; evaluation keeps the original order.
        if not evaluation:
            random.shuffle(self.data)
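
A minimal instantiation sketch; the class name and the sample layout are assumptions based on the constructor above. With the defaults PREV_ANS=2 and PREV_QUES=0, ques_max_len comes out to 31*2 + 26*1 = 88:

opt = {'datadir': 'data/', 'PREV_ANS': 2, 'PREV_QUES': 0}
vocab = ['<pad>', '<unk>', 'the', 'a']         # hypothetical word vocabulary
char_vocab = ['<pad>'] + list('abcdefghijklmnopqrstuvwxyz')
data = [{'question': 'who?', 'answer': 'me'}]  # hypothetical sample layout

batches = CoQABatchGen(opt, data, use_cuda=False,  # hypothetical class name
                       vocab=vocab, char_vocab=char_vocab)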
Example 3
    def __init__(self, opt, data, use_cuda, vocab, char_vocab,
                 train_img_id2idx, train_img_features, train_img_spatials,
                 val_img_id2idx, val_img_features, val_img_spatials,
                 mod='train'):

        self.data = data
        self.use_cuda = use_cuda 
        self.vocab = vocab
        self.char_vocab = char_vocab
        self.train_img_features = train_img_features
        self.train_img_id2idx = train_img_id2idx
        self.train_img_spatials = train_img_spatials
        self.val_img_features = val_img_features
        self.val_img_spatials = val_img_spatials
        self.val_img_id2idx = val_img_id2idx
        self.img_feature_dim = opt['img_fea_dim']
        self.img_spatial_dim = opt['img_spa_dim']
        self.img_fea_num = opt['img_fea_num']
        self.use_img_feature = 'img_feature' in opt
        if 'ModelParallel' in opt:
            # With model parallelism, BERT runs on the last listed device
            # and the main model on the first.
            self.bert_cuda = 'cuda:{}'.format(opt['ModelParallel'][-1])
            self.main_cuda = 'cuda:{}'.format(opt['ModelParallel'][0])

        self.ocr_name_list = opt['ocr_name_list'].split(',')
        if 'ES_ocr' in opt:
            self.ocr_name_list = [opt['ES_ocr']] + self.ocr_name_list
            self.es_ocr_len = int(opt['ES_ocr_len'])
            self.es_sort_way = opt['ES_sort_way']
        else:
            self.es_ocr_len = None
        # Every OCR source must appear as a key in the preprocessed samples.
        error_ocr_name = [name for name in self.ocr_name_list
                          if name not in self.data[0]]
        if error_ocr_name:
            log.error('OCR name ERROR: ' + str(error_ocr_name))
            raise ValueError('Unknown OCR sources: {}'.format(error_ocr_name))
        log.info('Using OCR from: ' + str(self.ocr_name_list))
        self.mod = mod
        self.opt = opt
        # Number of previous answers/questions prepended as dialog context.
        self.prev_ans = self.opt.get('PREV_ANS', 2)
        self.prev_ques = self.opt.get('PREV_QUES', 0)

        self.use_char_cnn = 'CHAR_CNN' in self.opt

        self.bert_tokenizer = None
        if 'BERT' in self.opt:
            # Choose the tokenizer vocabulary file, then load it once.
            if 'BERT_LARGE' in opt:
                print('Using BERT Large model')
                tokenizer_key = 'BERT_large_tokenizer_file'
            else:
                print('Using BERT base model')
                tokenizer_key = 'BERT_tokenizer_file'
            tokenizer_file = os.path.join(opt['datadir'], opt[tokenizer_key])
            print('Loading tokenizer from', tokenizer_file)
            self.bert_tokenizer = BertTokenizer.from_pretrained(tokenizer_file)

        self.answer_span_in_context = 'ANSWER_SPAN_IN_CONTEXT_FEATURE' in self.opt

        # Token budget: 30 (+1) per previous answer and 25 (+1) per question,
        # counting the current question plus any previous ones.
        self.ques_max_len = (30 + 1) * self.prev_ans + (25 + 1) * (self.prev_ques + 1)
        self.char_max_len = 30

        # Character-to-index lookup; stored on the instance so later
        # batch-building code can reach it.
        self.c2id = {c: i for i, c in enumerate(char_vocab)}
        self.od_name_list = opt['od_name_list'].split(',')
        # Every OD source must likewise appear as a key in the samples.
        error_od_name = [name for name in self.od_name_list
                         if name not in self.data[0]]
        if error_od_name:
            log.error('OD name ERROR: ' + str(error_od_name))
            raise ValueError('Unknown OD sources: {}'.format(error_od_name))
        log.info('Using OD from: ' + str(self.od_name_list))
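
A minimal instantiation sketch for this image-augmented variant. The class name, option values, and per-sample keys are assumptions inferred from the constructor; the feature shapes follow the img_fea_num and img_fea_dim options:

import numpy as np

opt = {
    'datadir': 'data/',
    'img_fea_dim': 2048,       # region feature size (assumed)
    'img_spa_dim': 6,          # spatial feature size (assumed)
    'img_fea_num': 36,         # regions per image (assumed)
    'ocr_name_list': 'ocr_a',  # must match keys present in each sample
    'od_name_list': 'od_a',
}
data = [{'question_id': 0, 'ocr_a': [], 'od_a': []}]  # hypothetical layout

feats = np.zeros((1, 36, 2048), dtype=np.float32)
spatials = np.zeros((1, 36, 6), dtype=np.float32)

gen = TextVQAImageBatchGen(                           # hypothetical class name
    opt, data, use_cuda=False, vocab=['<pad>'], char_vocab=['<pad>'],
    train_img_id2idx={}, train_img_features=feats, train_img_spatials=spatials,
    val_img_id2idx={}, val_img_features=feats, val_img_spatials=spatials,
    mod='train')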