    def __init__(self,
                 vocab_path,
                 label_map_config=None,
                 max_seq_len=512,
                 do_lower_case=True,
                 in_tokens=False,
                 is_inference=False,
                 learning_strategy='pointwise',
                 random_seed=None,
                 tokenizer="FullTokenizer",
                 phase='train',
                 is_classify=True,
                 is_regression=False,
                 for_cn=True,
                 task_id=0,
                 is_tsv=True):
        assert phase in ['train',
                         'predict'], "supported phase: train, predict."
        self.max_seq_len = max_seq_len
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_path, do_lower_case=do_lower_case)
        self.vocab = self.tokenizer.vocab
        self.pad_id = self.vocab["[PAD]"]
        self.cls_id = self.vocab["[CLS]"]
        self.sep_id = self.vocab["[SEP]"]
        self.mask_id = self.vocab["[MASK]"]
        self.in_tokens = in_tokens
        self.phase = phase
        self.is_inference = is_inference
        self.learning_strategy = learning_strategy
        self.for_cn = for_cn
        self.task_id = task_id
        self.is_tsv = is_tsv

        # Seed numpy's global RNG; pass a fixed random_seed for reproducible shuffling.
        np.random.seed(random_seed)

        self.is_classify = is_classify
        self.is_regression = is_regression
        self.current_example = 0
        self.current_epoch = 0
        self.num_examples = 0
        self.examples = {}

        if label_map_config:
            # Optionally load a JSON file mapping label names to integer ids.
            # io.open accepts the encoding argument on both Python 2 and 3,
            # whereas the builtin open() only does so on Python 3.
            with io.open(label_map_config, encoding='utf8') as f:
                self.label_map = json.load(f)
                if six.PY2:
                    self.label_map = unicode_convert(self.label_map)
        else:
            self.label_map = None
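# A minimal usage sketch for the classification-style reader above. The class
# name (ClassifyReader) and the file paths are assumptions, not taken from the
# source; substitute the real names from your project:
#
#     reader = ClassifyReader(
#         vocab_path="./vocab.txt",
#         label_map_config="./label_map.json",  # optional {"label_name": id} JSON
#         max_seq_len=128,
#         do_lower_case=True,
#         random_seed=42,
#         phase="train")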
Example 2
    def __init__(self,
                 vocab_path,
                 label_map_config=None,
                 max_seq_len=512,
                 do_lower_case=True,
                 in_tokens=False,
                 random_seed=None,
                 tokenizer="FullTokenizer",
                 is_classify=True,
                 is_regression=False,
                 for_cn=True,
                 task_id=0,
                 doc_stride=128,
                 max_query_length=64,
                 remove_noanswer=True):
        self.max_seq_len = max_seq_len
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_path, do_lower_case=do_lower_case)
        self.vocab = self.tokenizer.vocab
        self.pad_id = self.vocab["[PAD]"]
        self.cls_id = self.vocab["[CLS]"]
        self.sep_id = self.vocab["[SEP]"]
        self.in_tokens = in_tokens
        self.for_cn = for_cn
        self.task_id = task_id
        self.doc_stride = doc_stride              # sliding-window stride over long documents
        self.max_query_length = max_query_length  # maximum question length, in tokens
        self.examples = {}
        self.features = {}
        self.remove_noanswer = remove_noanswer

        if random_seed is not None:
            np.random.seed(random_seed)

        self.current_example = 0
        self.current_epoch = 0
        self.num_examples = 0

        # Lightweight record types for the reading-comprehension pipeline:
        # Example is one raw question/document pair, Feature is one tokenized
        # document window fed to the model, DocSpan is a window over doc tokens.
        self.Example = namedtuple('Example', [
            'qas_id', 'question_text', 'doc_tokens', 'orig_answer_text',
            'start_position', 'end_position'
        ])
        self.Feature = namedtuple("Feature", [
            "unique_id", "example_index", "doc_span_index", "tokens",
            "token_to_orig_map", "token_is_max_context", "token_ids",
            "position_ids", "text_type_ids", "start_position", "end_position"
        ])
        self.DocSpan = namedtuple("DocSpan", ["start", "length"])
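# A minimal usage sketch for the reading-comprehension reader above. The class
# name (MRCReader) and the vocab path are assumptions, not taken from the
# source; adapt them to your project:
#
#     reader = MRCReader(
#         vocab_path="./vocab.txt",
#         max_seq_len=512,
#         doc_stride=128,         # sliding-window stride over long documents
#         max_query_length=64,
#         random_seed=42)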