Example #1
 def __init__(self, args):
     self.args = args
     if (args.bert_model == 'bert-base-multilingual-cased'):
         self.tokenizer = BertTokenizer.from_pretrained(
             'bert-base-multilingual-cased', do_lower_case=False)
     else:
         self.tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                                        do_lower_case=True)
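         # the vocab is missing the [unused*] placeholder tokens (base size 31748):
         # append them to vocab.txt and reload the tokenizer so they receive ids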
         if (len(self.tokenizer.vocab) == 31748):
             f = open(args.bert_model + "/vocab.txt", "a")
             f.write(
                 "\n[unused1]\n[unused2]\n[unused3]\n[unused4]\n[unused5]\n[unused6]\n[unused7]"
             )
             f.close()
             self.tokenizer = BertTokenizer.from_pretrained(
                 args.bert_model, do_lower_case=True)
     self.sep_token = '[SEP]'
     self.cls_token = '[CLS]'
     self.pad_token = '[PAD]'
     self.tgt_bos = '[unused1]'
     self.tgt_eos = '[unused2]'
     self.tgt_sent_split = '[unused3]'
     self.sep_vid = self.tokenizer.vocab[self.sep_token]
     self.cls_vid = self.tokenizer.vocab[self.cls_token]
     self.pad_vid = self.tokenizer.vocab[self.pad_token]
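A minimal usage sketch for the constructor above (not from the original snippet: the class name BertData and the Namespace arguments are assumptions). It shows how the stored special tokens and ids are typically used to mark sentence boundaries before feeding text to BERT:

from argparse import Namespace

args = Namespace(bert_model='bert-base-multilingual-cased')  # hypothetical args
data = BertData(args)  # assumed name of the class defining this __init__

sents = [['first', 'sentence', '.'], ['second', 'sentence', '.']]
text = ' {} {} '.format(data.sep_token, data.cls_token).join(' '.join(s) for s in sents)
tokens = [data.cls_token] + data.tokenizer.tokenize(text) + [data.sep_token]
ids = data.tokenizer.convert_tokens_to_ids(tokens)
cls_positions = [i for i, t in enumerate(ids) if t == data.cls_vid]  # one per sentence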
Example #2
    def __init__(self, args):
        self.args = args
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

        self.sep_token = '[SEP]'
        self.cls_token = '[CLS]'
        self.pad_token = '[PAD]'
        self.unk_token = '[UNK]'
        self.mask_token = '[MASK]'
        self.tgt_bos = '[unused0]'
        self.tgt_eos = '[unused1]'

        with open(args.src_dict_path) as f:
            line = f.read().strip()
            self.src_dict = json.loads(line)

        with open(args.tgt_dict_path) as f:
            line = f.read().strip()
            self.tgt_dict = json.loads(line)

        with open(args.relation_path) as f:
            line = f.read().strip()
            self.relation_dict = json.loads(line)

        self.sep_vid = self.src_dict[self.sep_token]
        self.cls_vid = self.src_dict[self.cls_token]
        self.pad_vid = self.src_dict[self.pad_token]
        self.unk_vid = self.src_dict[self.unk_token]
Example #3
 def __init__(self, args):
     self.args = args
     if args.cased:
         # pass do_lower_case=False explicitly so the cased BETO model does not get lowercased input
         self.tokenizer = BertTokenizer.from_pretrained('BETO/', do_lower_case=False)
     else:
         self.tokenizer = BertTokenizer.from_pretrained('BETO/',
                                                        do_lower_case=True)
     print(self.tokenizer)
     self.sep_token = '[SEP]'
     self.cls_token = '[CLS]'
     self.pad_token = '[PAD]'
     self.tgt_bos = '[unused0]'
     self.tgt_eos = '[unused1]'
     self.tgt_sent_split = '[unused2]'
     self.sep_vid = self.tokenizer.vocab[self.sep_token]
     self.cls_vid = self.tokenizer.vocab[self.cls_token]
     self.pad_vid = self.tokenizer.vocab[self.pad_token]
Example #4
def load_one_text_web(source, device):
    from others.tokenization import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True)
    sep_vid = tokenizer.vocab['[SEP]']
    cls_vid = tokenizer.vocab['[CLS]']
    max_pos = 512

    def _process_src(raw):
        raw = raw.strip().lower()
        raw = raw.replace('[cls]', '[CLS]').replace('[sep]', '[SEP]')
        src_subtokens = tokenizer.tokenize(raw)
        src_subtokens = ['[CLS]'] + src_subtokens + ['[SEP]']
        src_subtoken_idxs = tokenizer.convert_tokens_to_ids(src_subtokens)
        src_subtoken_idxs = src_subtoken_idxs[:-1][:max_pos]
        src_subtoken_idxs[-1] = sep_vid
        _segs = [-1] + [
            i for i, t in enumerate(src_subtoken_idxs) if t == sep_vid
        ]
        segs = [_segs[i] - _segs[i - 1] for i in range(1, len(_segs))]
        segments_ids = []
        segs = segs[:max_pos]
        for i, s in enumerate(segs):
            if (i % 2 == 0):
                segments_ids += s * [0]
            else:
                segments_ids += s * [1]

        src = torch.tensor(src_subtoken_idxs)[None, :].to(device)
        mask_src = (1 - (src == 0).float()).to(device)
        cls_ids = [[
            i for i, t in enumerate(src_subtoken_idxs) if t == cls_vid
        ]]
        clss = torch.tensor(cls_ids).to(device)
        mask_cls = 1 - (clss == -1).float()
        clss[clss == -1] = 0

        return src, mask_src, segments_ids, clss, mask_cls

    x = source
    src, mask_src, segments_ids, clss, mask_cls = _process_src(x)
    segs = torch.tensor(segments_ids)[None, :].to(device)
    batch = Batch()
    batch.src = src
    batch.tgt = None
    batch.mask_src = mask_src
    batch.mask_tgt = None
    batch.segs = segs
    batch.src_str = [[
        sent.replace('[SEP]', '').strip() for sent in x.split('[CLS]')
    ]]
    batch.tgt_str = ['']
    batch.clss = clss
    batch.mask_cls = mask_cls

    batch.batch_size = 1
    yield batch
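To make the segment-id logic in _process_src concrete, here is a small self-contained sketch (toy token ids, not from the original code) reproducing it: the gaps between consecutive [SEP] positions give per-sentence lengths, and alternating sentences receive segment id 0 or 1.

sep_vid = 102                          # assumed id of [SEP]
ids = [101, 7, 8, 102, 101, 9, 102]    # [CLS] w w [SEP] [CLS] w [SEP]
_segs = [-1] + [i for i, t in enumerate(ids) if t == sep_vid]    # [-1, 3, 6]
segs = [_segs[i] - _segs[i - 1] for i in range(1, len(_segs))]   # [4, 3]
segments_ids = []
for i, s in enumerate(segs):
    segments_ids += s * [0] if i % 2 == 0 else s * [1]
print(segments_ids)                    # [0, 0, 0, 0, 1, 1, 1]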
Example #5
    def __init__(self, args):
        self.CHUNK_LIMIT = 512
        self.args = args

        if args.model_name == 'scibert':
            self.tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased', do_lower_case=True)

        elif 'bert-base' in args.model_name or 'bert-large' in args.model_name:
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
        else:
            # fail early instead of silently leaving self.tokenizer unset for unsupported names
            raise ValueError('unsupported model_name: %s' % args.model_name)

        self.sep_token = '[SEP]'
        self.cls_token = '[CLS]'
        self.pad_token = '[PAD]'
        self.tgt_bos = '[unused0]'
        self.tgt_eos = '[unused1]'
        self.tgt_sent_split = '[unused2]'

        self.sep_vid = self.tokenizer.vocab[self.sep_token]
        self.cls_vid = self.tokenizer.vocab[self.cls_token]
        self.pad_vid = self.tokenizer.vocab[self.pad_token]
Example #6
    def __init__(self, args):
        self.args = args
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

        self.sep_token = '[SEP]'
        self.cls_token = '[CLS]'
        self.pad_token = '[PAD]'
        self.tgt_bos = '[unused0]'
        self.tgt_eos = '[unused1]'
        self.tgt_sent_split = '[unused2]'
        self.sep_vid = self.tokenizer.vocab[self.sep_token]
        self.cls_vid = self.tokenizer.vocab[self.cls_token]
        self.pad_vid = self.tokenizer.vocab[self.pad_token]
Example #7
def validate(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    valid_iter = data_loader.Dataloader(args,
                                        load_dataset(args,
                                                     'valid',
                                                     shuffle=False),
                                        args.batch_size,
                                        device,
                                        shuffle=False,
                                        is_test=False)

    #tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False, cache_dir=args.temp_dir)
    #tokenizer = BertTokenizer.from_pretrained('hubert-wiki', do_lower_case=False, cache_dir=None)
    #tokenizer = BertTokenizer.from_pretrained('hubert-web', do_lower_case=False, cache_dir=None)
    tokenizer = BertTokenizer.from_pretrained('libert-large',
                                              do_lower_case=False,
                                              cache_dir=None)

    symbols = {
        'BOS': tokenizer.vocab['[unused5]'],
        'EOS': tokenizer.vocab['[unused1]'],
        'PAD': tokenizer.vocab['[PAD]'],
        'EOQ': tokenizer.vocab['[unused2]']
    }

    valid_loss = abs_loss(model.generator,
                          symbols,
                          model.vocab_size,
                          train=False,
                          device=device)

    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
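A sketch of how a validate() like this one is commonly driven (the checkpoint directory, file-name pattern, and device id below are assumptions, not part of the original code): score every saved checkpoint and keep the one with the lowest validation cross-entropy.

import glob
import os

def pick_best_checkpoint(args, device_id=-1):
    # assumes checkpoints are written as model_step_<N>.pt under args.model_path
    cp_files = sorted(glob.glob(os.path.join(args.model_path, 'model_step_*.pt')),
                      key=os.path.getmtime)
    scores = []
    for cp in cp_files:
        step = int(cp.split('.')[-2].split('_')[-1])
        scores.append((validate(args, device_id, cp, step), cp))
    return min(scores)[1] if scores else None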
Example #8
    def __init__(self, args):
        self.args = args
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased",
                                                       do_lower_case=True)

        self.sep_token = "[SEP]"
        self.cls_token = "[CLS]"
        self.pad_token = "[PAD]"
        self.tgt_bos = "[unused0]"
        self.tgt_eos = "[unused1]"
        self.tgt_sent_split = "[unused2]"
        self.sep_vid = self.tokenizer.vocab[self.sep_token]
        self.cls_vid = self.tokenizer.vocab[self.cls_token]
        self.pad_vid = self.tokenizer.vocab[self.pad_token]
Example #9
    def __init__(self, args):
        self.args = args
        self.tokenizer = BertTokenizer.from_pretrained(BERT_PATH,
                                                       do_lower_case=False)

        self.sep_token = '[SEP]'
        self.cls_token = '[CLS]'
        self.pad_token = '[PAD]'
        self.tgt_bos = '[unused1]'
        self.tgt_eos = '[unused2]'
        self.tgt_sent_split = '[unused3]'
        self.sep_vid = self.tokenizer.vocab[self.sep_token]
        self.cls_vid = self.tokenizer.vocab[self.cls_token]
        self.pad_vid = self.tokenizer.vocab[self.pad_token]
Example #10
    def __init__(self, args):
        self.args = args
        self.tokenizer = BertTokenizer.from_pretrained(
            '/home/ffajri/Data/Bert/indobert/indobert-vocab-presum.txt',
            do_lower_case=True)

        self.sep_token = '[SEP]'
        self.cls_token = '[CLS]'
        self.pad_token = '[PAD]'
        self.tgt_bos = '[BOS]'
        self.tgt_eos = '[EOS]'
        self.tgt_sent_split = '[QOS]'
        self.sep_vid = self.tokenizer.vocab[self.sep_token]
        self.cls_vid = self.tokenizer.vocab[self.cls_token]
        self.pad_vid = self.tokenizer.vocab[self.pad_token]
Example #11
    def __init__(self, args):
        self.args = args
        #self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=True)
        #self.tokenizer = BertTokenizer.from_pretrained('hubert',do_lower_case=False)
        self.tokenizer = BertTokenizer.from_pretrained('libert-large',
                                                       do_lower_case=False)

        self.sep_token = '[SEP]'
        self.cls_token = '[CLS]'
        self.pad_token = '[PAD]'
        self.tgt_bos = '[unused5]'
        self.tgt_eos = '[unused1]'
        self.tgt_sent_split = '[unused2]'
        self.sep_vid = self.tokenizer.vocab[self.sep_token]
        self.cls_vid = self.tokenizer.vocab[self.cls_token]
        self.pad_vid = self.tokenizer.vocab[self.pad_token]
Example #12
    def __init__(self, args):
        self.args = args
        self.tokenizer = BertTokenizer.from_pretrained(args.bert_temp_dir)

        self.sep_token = '[SEP]'
        self.cls_token = '[CLS]'
        self.pad_token = '[PAD]'
        self.unk_token = '[UNK]'
        self.tgt_bos = '[unused1]'
        self.tgt_eos = '[unused2]'
        self.tgt_sent_split = '[unused3]'
        self.role_1 = '[unused4]'
        self.role_2 = '[unused5]'
        self.sep_vid = self.tokenizer.vocab[self.sep_token]
        self.cls_vid = self.tokenizer.vocab[self.cls_token]
        self.pad_vid = self.tokenizer.vocab[self.pad_token]
        self.unk_vid = self.tokenizer.vocab[self.unk_token]
Example #13
 def __init__(self, abs_model_file):
     self.args = self._build_abs_args()
     # load model
     step_abs = int(abs_model_file.split('.')[-2].split('_')[-1])
     checkpoint = torch.load(abs_model_file, map_location=lambda storage, loc: storage)
     self.model_abs = model_bld.AbsSummarizer(self.args, self.args.device, checkpoint)
     self.model_abs.eval()
     # prepare tokenizer and predictor
     self.tokenizer = BertTokenizer.from_pretrained(path.join(self.args.bert_model_path, self.model_abs.bert.model_name), do_lower_case=True)
     self.symbols = {'BOS': self.tokenizer.vocab['[unused0]'], 'EOS': self.tokenizer.vocab['[unused1]'],
                'PAD': self.tokenizer.vocab['[PAD]'], 'EOQ': self.tokenizer.vocab['[unused2]']}
     self.predictor = pred_abs.build_predictor(self.args, self.tokenizer, self.symbols, self.model_abs, logger)
     # special tokens
     self.sep_token = '[SEP]'
     self.cls_token = '[CLS]'
     self.pad_token = '[PAD]'
     self.sep_vid = self.tokenizer.vocab[self.sep_token]
     self.cls_vid = self.tokenizer.vocab[self.cls_token]
     self.pad_vid = self.tokenizer.vocab[self.pad_token]
Example #14
    def __init__(self, min_src_ntokens_per_sent=5,
                max_src_ntokens_per_sent=200,
                max_src_nsents=max_src_nsents,  # note: this default refers to a module-level max_src_nsents defined elsewhere
                min_src_nsents=1,
                max_tgt_ntokens=500,
                min_tgt_ntokens=5):
        self.min_src_ntokens_per_sent = min_src_ntokens_per_sent
        self.max_src_ntokens_per_sent = max_src_ntokens_per_sent
        self.max_src_nsents = max_src_nsents
        self.min_src_nsents = min_src_nsents
        self.max_tgt_ntokens = max_tgt_ntokens
        self.min_tgt_ntokens = min_tgt_ntokens
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

        self.sep_token = '[SEP]'
        self.cls_token = '[CLS]'
        self.pad_token = '[PAD]'
        self.tgt_bos = '[unused0]'
        self.tgt_eos = '[unused1]'
        self.tgt_sent_split = '[unused2]'
        self.sep_vid = self.tokenizer.vocab[self.sep_token]
        self.cls_vid = self.tokenizer.vocab[self.cls_token]
        self.pad_vid = self.tokenizer.vocab[self.pad_token]
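The constructor above only stores the length limits; the sketch below (method name and return convention are assumptions, loosely following PreSumm-style preprocessing) shows how such limits are typically applied to a tokenized source document.

    def filter_src(self, src):
        # src: list of sentences, each a list of word tokens
        idxs = [i for i, s in enumerate(src) if len(s) > self.min_src_ntokens_per_sent]
        src = [src[i][:self.max_src_ntokens_per_sent] for i in idxs]
        src = src[:self.max_src_nsents]
        if len(src) < self.min_src_nsents:
            return None  # document too short, caller should skip it
        return src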
Example #15
def load_text(args, source_fp, target_fp, device):
    from others.tokenization import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True)
    sep_vid = tokenizer.vocab['[SEP]']
    cls_vid = tokenizer.vocab['[CLS]']
    n_lines = len(open(source_fp, encoding='UTF-8').read().split('\n'))

    def _process_src(raw):
        raw = raw.strip().lower()
        src_subtokens = tokenizer.tokenize(raw)
        src_subtokens = ['[CLS]'] + src_subtokens + ['[SEP]']

        src_subtokens_temp = []
        j = 0
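        # restore literal [CLS]/[SEP] markers that the lowercased wordpiece tokenizer
        # split into pieces such as '[', '##cl', '##s', '##]'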
        for i in range(len(src_subtokens) - 4):
            if i != j:
                continue
            if ("".join(src_subtokens[i:i + 4])) == '[##cl##s##]':
                src_subtokens_temp.append('[CLS]')
                j = i + 4
            elif ("".join(src_subtokens[i:i + 4])) == '[##se##p##]':
                src_subtokens_temp.append('[SEP]')
                j = i + 4
            else:
                src_subtokens_temp.append(src_subtokens[i])
                j = i + 1
        src_subtokens = src_subtokens_temp + src_subtokens[-3:]
        # print(src_subtokens)

        src_subtoken_idxs = tokenizer.convert_tokens_to_ids(src_subtokens)
        src_subtoken_idxs = src_subtoken_idxs[:-1][:args.max_pos]
        src_subtoken_idxs[-1] = sep_vid
        _segs = [-1] + [
            i for i, t in enumerate(src_subtoken_idxs) if t == sep_vid
        ]
        segs = [_segs[i] - _segs[i - 1] for i in range(1, len(_segs))]
        segments_ids = []
        segs = segs[:args.max_pos]
        for i, s in enumerate(segs):
            if (i % 2 == 0):
                segments_ids += s * [0]
            else:
                segments_ids += s * [1]

        src = torch.tensor(src_subtoken_idxs)[None, :].to(device)
        mask_src = (1 - (src == 0).float()).to(device)
        cls_ids = [[
            i for i, t in enumerate(src_subtoken_idxs) if t == cls_vid
        ]]
        clss = torch.tensor(cls_ids).to(device)
        mask_cls = 1 - (clss == -1).float()
        clss[clss == -1] = 0

        return src, mask_src, segments_ids, clss, mask_cls

    if (target_fp == ''):
        with open(source_fp, encoding='UTF-8') as source:
            for x in tqdm(source, total=n_lines):
                src, mask_src, segments_ids, clss, mask_cls = _process_src(x)
                segs = torch.tensor(segments_ids)[None, :].to(device)
                batch = Batch()
                batch.src = src
                batch.tgt = None
                batch.mask_src = mask_src
                batch.mask_tgt = None
                batch.segs = segs
                batch.src_str = [[
                    sent.replace('[SEP]', '').strip()
                    for sent in x.split('[CLS]')
                ]]
                batch.tgt_str = ['']
                batch.clss = clss
                batch.mask_cls = mask_cls

                batch.batch_size = 1
                yield batch
    else:
        with open(source_fp, encoding='UTF-8') as source, open(
                target_fp, encoding='UTF-8') as target:
            for x, y in tqdm(zip(source, target), total=n_lines):
                x = x.strip()
                y = y.strip()
                y = ' '.join(y.split())
                src, mask_src, segments_ids, clss, mask_cls = _process_src(x)
                segs = torch.tensor(segments_ids)[None, :].to(device)
                batch = Batch()
                batch.src = src
                batch.tgt = None
                batch.mask_src = mask_src
                batch.mask_tgt = None
                batch.segs = segs
                batch.src_str = [[
                    sent.replace('[SEP]', '').strip()
                    for sent in x.split('[CLS]')
                ]]
                batch.tgt_str = [y]
                batch.clss = clss
                batch.mask_cls = mask_cls
                batch.batch_size = 1
                yield batch
Example #16
def train_abs_single(args, device_id):
    init_logger(args.log_file)
    logger.info(str(args))
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from,
                                map_location=lambda storage, loc: storage)
        opt = vars(checkpoint['opt'])
        for k in opt.keys():
            if (k in model_flags):
                setattr(args, k, opt[k])
    else:
        checkpoint = None

    if (args.load_from_extractive != ''):
        logger.info('Loading bert from extractive model %s' %
                    args.load_from_extractive)
        bert_from_extractive = torch.load(
            args.load_from_extractive,
            map_location=lambda storage, loc: storage)
        bert_from_extractive = bert_from_extractive['model']
    else:
        bert_from_extractive = None
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    def train_iter_fct():
        return data_loader.Dataloader(args,
                                      load_dataset(args, 'train',
                                                   shuffle=True),
                                      args.batch_size,
                                      device,
                                      shuffle=True,
                                      is_test=False)

    model = AbsSummarizer(args, device, checkpoint, bert_from_extractive)
    if (args.sep_optim):
        optim_bert = model_builder.build_optim_bert(args, model, checkpoint)
        optim_dec = model_builder.build_optim_dec(args, model, checkpoint)
        optim = [optim_bert, optim_dec]
    else:
        optim = [model_builder.build_optim(args, model, checkpoint)]

    logger.info(model)

    #tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False, cache_dir=args.temp_dir)
    #tokenizer = BertTokenizer.from_pretrained('hubert-wiki', do_lower_case=False, cache_dir=None)
    #tokenizer = BertTokenizer.from_pretrained('hubert-web', do_lower_case=False, cache_dir=None)
    tokenizer = BertTokenizer.from_pretrained('libert-large',
                                              do_lower_case=False,
                                              cache_dir=None)

    symbols = {
        'BOS': tokenizer.vocab['[unused5]'],
        'EOS': tokenizer.vocab['[unused1]'],
        'PAD': tokenizer.vocab['[PAD]'],
        'EOQ': tokenizer.vocab['[unused2]']
    }

    train_loss = abs_loss(model.generator,
                          symbols,
                          model.vocab_size,
                          device,
                          train=True,
                          label_smoothing=args.label_smoothing)

    trainer = build_trainer(args, device_id, model, optim, train_loss)

    trainer.train(train_iter_fct, args.train_steps)
Example #17
def load_text(args, source_fp, target_fp, device):
    from others.tokenization import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    sep_vid = tokenizer.vocab['[SEP]']
    cls_vid = tokenizer.vocab['[CLS]']
    n_lines = len(open(source_fp).read().split('\n'))
    # process the input as described in the paper
    def _process_src(raw):
        raw = raw.strip().lower()
        raw = raw.replace('[cls]', '[CLS]').replace('[sep]', '[SEP]')  # restore the uppercase special tokens
        src_subtokens = tokenizer.tokenize(raw)
        src_subtokens = ['[CLS]'] + src_subtokens + ['[SEP]']  # add the leading and trailing special tokens
        src_subtoken_idxs = tokenizer.convert_tokens_to_ids(src_subtokens)
        src_subtoken_idxs = src_subtoken_idxs[:-1][:args.max_pos]
        src_subtoken_idxs[-1] = sep_vid
        _segs = [-1] + [i for i, t in enumerate(src_subtoken_idxs) if t == sep_vid]
        segs = [_segs[i] - _segs[i - 1] for i in range(1, len(_segs))]
        segments_ids = []  # build the segment encoding: an alternating 0/1 vector per sentence
        segs = segs[:args.max_pos]
        for i, s in enumerate(segs):
            if (i % 2 == 0):
                segments_ids += s * [0]
            else:
                segments_ids += s * [1]

        src = torch.tensor(src_subtoken_idxs)[None, :].to(device)
        mask_src = (1 - (src == 0).float()).to(device)  # mask out the padding positions
        cls_ids = [[i for i, t in enumerate(src_subtoken_idxs) if t == cls_vid]]  # sentence ([CLS]) positions
        clss = torch.tensor(cls_ids).to(device)
        mask_cls = 1 - (clss == -1).float()
        clss[clss == -1] = 0

        return src, mask_src, segments_ids, clss, mask_cls

    if target_fp == '':
        with open(source_fp) as source:
            for x in tqdm(source, total=n_lines):
                src, mask_src, segments_ids, clss, mask_cls = _process_src(x)
                segs = torch.tensor(segments_ids)[None, :].to(device)
                batch = Batch()
                batch.src = src
                batch.tgt = None
                batch.mask_src = mask_src
                batch.mask_tgt = None
                batch.segs = segs
                batch.src_str = [[sent.replace('[SEP]', '').strip() for sent in x.split('[CLS]')]]
                batch.tgt_str = ['']
                batch.clss = clss
                batch.mask_cls = mask_cls

                batch.batch_size = 1
                yield batch
    else:
        with open(source_fp) as source, open(target_fp) as target:
            for x, y in tqdm(zip(source, target), total=n_lines):
                x = x.strip()
                y = y.strip()
                y = ' '.join(y.split())
                src, mask_src, segments_ids, clss, mask_cls = _process_src(x)
                segs = torch.tensor(segments_ids)[None, :].to(device)
                batch = Batch()
                batch.src = src
                batch.tgt = None
                batch.mask_src = mask_src
                batch.mask_tgt = None
                batch.segs = segs
                batch.src_str = [[sent.replace('[SEP]', '').strip() for sent in x.split('[CLS]')]]
                batch.tgt_str = [y]
                batch.clss = clss
                batch.mask_cls = mask_cls
                batch.batch_size = 1
                yield batch
Example #18
#coding=utf8

import sys
sys.path.append('../src/')
from others.tokenization import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
use_bert_basic_tokenizer=False

def build_dict(input_file, tag):
    tokens = {'[PAD]':0, '[SEP]':1, '[CLS]':2, '[UNK]':3, '[unused0]':4, '[unused1]':5, '[unused2]':6}
    for line in open(input_file):
        sentences = line.strip().split('\t')
        if tag == 'src':
            sentences = sentences[:-1]
        for sent in sentences:
            for tok in sent.split(' '):
                tok = tok.lower()
                if tok not in tokens:
                    tokens[tok] = len(tokens)
    return tokens

def build_dict_bert(input_file, tag):
    tokens = {'[PAD]':0, '[SEP]':1, '[CLS]':2, '[UNK]':3, '[unused0]':4, '[unused1]':5, '[unused2]':6}
    for line in open(input_file):
        sentences = line.strip().split('\t')
        if tag == 'src':
            sentences = sentences[:-1]
        for sent in sentences:
            sent = sent.lower()
            sub_tokens = tokenizer.tokenize(sent, use_bert_basic_tokenizer=use_bert_basic_tokenizer)
            # assumed completion (mirrors build_dict): register every new subtoken
            for tok in sub_tokens:
                if tok not in tokens:
                    tokens[tok] = len(tokens)
    return tokens
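These dictionaries pair naturally with Example #2, which reads src_dict/tgt_dict from single-line JSON files. A small sketch of how the output of build_dict could be serialized in that format (the file names and __main__ wiring are assumptions):

import json

def dump_dict(tokens, out_path):
    # one JSON object per file, matching the json.loads(f.read().strip()) loading in Example #2
    with open(out_path, 'w') as f:
        f.write(json.dumps(tokens))

if __name__ == '__main__':
    src_tokens = build_dict('train.src.txt', 'src')  # hypothetical input file
    dump_dict(src_tokens, 'src_dict.json')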