Example #1
def convert_multiple_choice_examples_to_features(examples: list,
                                                 tokenizer: BertTokenizer,
                                                 max_seq_length: int,
                                                 is_training: bool,
                                                 verbose: bool = False):
    """Converts multiple-choice examples into padded BERT input features."""
    features = []
    for idx, example in enumerate(examples):
        option_features = []
        for option in example.get_option_segments():
            context_tokens = tokenizer.tokenize(option['segment1'])
            if "segment2" in option:
                option_tokens = tokenizer.tokenize(option["segment2"])
                _truncate_seq_pair(context_tokens, option_tokens,
                                   max_seq_length - 3)
                tokens = ["[CLS]"] + context_tokens + [
                    "[SEP]"
                ] + option_tokens + ["[SEP]"]
                segment_ids = [0] * (len(context_tokens) +
                                     2) + [1] * (len(option_tokens) + 1)
            else:
                context_tokens = context_tokens[0:(max_seq_length - 2)]
                tokens = ["[CLS]"] + context_tokens + ["[SEP]"]
                segment_ids = [0] * len(tokens)

            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            input_mask = [1] * len(input_ids)

            padding = [0] * (max_seq_length - len(input_ids))
            input_ids += padding
            input_mask += padding
            segment_ids += padding

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length

            option_features.append(
                (tokens, input_ids, input_mask, segment_ids))

        label = example.label

        if idx < 5 and verbose:
            logger.info("*** Example ***")
            logger.info(f"example_id: {example.example_id}")
            for choice_idx, (tokens, input_ids, input_mask,
                             segment_ids) in enumerate(option_features):
                logger.info(f"choice: {choice_idx}")
                logger.info(f"tokens: {' '.join(tokens)}")
                logger.info(f"input_ids: {' '.join(map(str, input_ids))}")
                logger.info(f"input_mask: {' '.join(map(str, input_mask))}")
                logger.info(f"segment_ids: {' '.join(map(str, segment_ids))}")
            if is_training:
                logger.info(f"label: {label}")

        features.append(
            MultipleChoiceFeatures(example_id=example.example_id,
                                   option_features=option_features,
                                   label=label))

    return features
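A minimal driver for the converter above, assuming the surrounding module provides `_truncate_seq_pair`, `MultipleChoiceFeatures`, and `logger`; the toy example class and its fields are illustrative, not part of the original code:

from pytorch_pretrained_bert import BertTokenizer

class ToyMultipleChoiceExample:
    """Hypothetical example type exposing the interface the converter reads."""
    def __init__(self, example_id, context, options, label):
        self.example_id = example_id
        self.context = context
        self.options = options
        self.label = label

    def get_option_segments(self):
        # One (context, candidate-ending) pair per answer option.
        return [{"segment1": self.context, "segment2": option}
                for option in self.options]

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
examples = [ToyMultipleChoiceExample("ex-0", "The cat sat on the",
                                     ["mat .", "ceiling fan ."], label=0)]
features = convert_multiple_choice_examples_to_features(
    examples, tokenizer, max_seq_length=32, is_training=True)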
Example #2
class NemoBertTokenizer(TokenizerSpec):
    def __init__(self, pretrained_model=None,
                 vocab_file=None,
                 do_lower_case=True,
                 max_len=None,
                 do_basic_tokenize=True,
                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
        if pretrained_model:
            self.tokenizer = BertTokenizer.from_pretrained(pretrained_model)
            if "uncased" not in pretrained_model:
                self.tokenizer.basic_tokenizer.do_lower_case = False
        else:
            self.tokenizer = BertTokenizer(vocab_file,
                                           do_lower_case,
                                           max_len,
                                           do_basic_tokenize,
                                           never_split)
        self.vocab_size = len(self.tokenizer.vocab)
        self.never_split = never_split

    def text_to_tokens(self, text):
        tokens = self.tokenizer.tokenize(text)
        return tokens

    def tokens_to_text(self, tokens):
        text = self.tokenizer.convert_tokens_to_string(tokens)
        return remove_spaces(handle_quotes(text.strip()))

    def token_to_id(self, token):
        return self.tokens_to_ids([token])[0]

    def tokens_to_ids(self, tokens):
        ids = self.tokenizer.convert_tokens_to_ids(tokens)
        return ids

    def ids_to_tokens(self, ids):
        tokens = self.tokenizer.convert_ids_to_tokens(ids)
        return tokens

    def text_to_ids(self, text):
        tokens = self.text_to_tokens(text)
        ids = self.tokens_to_ids(tokens)
        return ids

    def ids_to_text(self, ids):
        tokens = self.ids_to_tokens(ids)
        tokens_clean = [t for t in tokens if t not in self.never_split]
        text = self.tokens_to_text(tokens_clean)
        return text

    def pad_id(self):
        return self.tokens_to_ids(["[PAD]"])[0]

    def bos_id(self):
        return self.tokens_to_ids(["[CLS]"])[0]

    def eos_id(self):
        return self.tokens_to_ids(["[SEP]"])[0]
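A short round-trip sketch; it assumes a standard `bert-base-uncased` checkpoint and that the helpers `remove_spaces` and `handle_quotes` used by `tokens_to_text` are defined in the surrounding NeMo module:

tokenizer = NemoBertTokenizer(pretrained_model="bert-base-uncased")

ids = tokenizer.text_to_ids("hello world")
print(ids)                          # token ids for "hello world"
print(tokenizer.ids_to_text(ids))   # back to "hello world"
print(tokenizer.pad_id(), tokenizer.bos_id(), tokenizer.eos_id())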
Example #3
def sentence_pair_processing(data: list, tokenizer: BertTokenizer, max_sequence_length=88):

    # First pass: tokenize, truncate, and record the longest BERT input.
    max_bert_input_length = 0
    for sentence_pair in data:
        sentence_1_tokenized = tokenizer.tokenize(sentence_pair['sentence_1'])
        sentence_2_tokenized = tokenizer.tokenize(sentence_pair['sentence_2'])
        truncate_seq_pair(sentence_1_tokenized, sentence_2_tokenized,
                          max_sequence_length - 3)

        # +3 accounts for [CLS] and the two [SEP] tokens.
        max_bert_input_length = max(
            max_bert_input_length,
            len(sentence_1_tokenized) + len(sentence_2_tokenized) + 3)
        sentence_pair['sentence_1_tokenized'] = sentence_1_tokenized
        sentence_pair['sentence_2_tokenized'] = sentence_2_tokenized

    # Allocate the output tensors once the maximum input length is known.
    dataset_input_ids = torch.empty((len(data), max_bert_input_length), dtype=torch.long)
    dataset_token_type_ids = torch.empty((len(data), max_bert_input_length), dtype=torch.long)
    dataset_attention_masks = torch.empty((len(data), max_bert_input_length), dtype=torch.long)
    dataset_scores = torch.empty((len(data), 1), dtype=torch.float)

    for idx, sentence_pair in enumerate(data):
        tokens = []
        input_type_ids = []

        tokens.append("[CLS]")
        input_type_ids.append(0)
        for token in sentence_pair['sentence_1_tokenized']:
            tokens.append(token)
            input_type_ids.append(0)
        tokens.append("[SEP]")
        input_type_ids.append(0)

        for token in sentence_pair['sentence_2_tokenized']:
            tokens.append(token)
            input_type_ids.append(1)
        tokens.append("[SEP]")
        input_type_ids.append(1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        attention_masks = [1] * len(input_ids)
        while len(input_ids) < max_bert_input_length:
            input_ids.append(0)
            attention_masks.append(0)
            input_type_ids.append(0)

        dataset_input_ids[idx] = torch.tensor(input_ids, dtype=torch.long)
        dataset_token_type_ids[idx] = torch.tensor(input_type_ids, dtype=torch.long)
        dataset_attention_masks[idx] = torch.tensor(attention_masks, dtype=torch.long)
        if 'similarity' not in sentence_pair or sentence_pair['similarity'] is None:
            dataset_scores[idx] = torch.tensor(float('nan'), dtype=torch.float)
        else:
            dataset_scores[idx] = torch.tensor(sentence_pair['similarity'], dtype=torch.float)

    return dataset_input_ids, dataset_token_type_ids, dataset_attention_masks, dataset_scores
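A hedged sketch of feeding the returned tensors into a DataLoader; the record keys (`sentence_1`, `sentence_2`, optional `similarity`) are the ones the function reads, and `truncate_seq_pair` is assumed to be in scope:

import torch
from torch.utils.data import TensorDataset, DataLoader
from pytorch_pretrained_bert import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
data = [
    {"sentence_1": "A man is playing a guitar.",
     "sentence_2": "Someone plays an instrument.",
     "similarity": 4.2},
    {"sentence_1": "The sky is blue.",
     "sentence_2": "It is raining.",
     "similarity": None},   # becomes NaN in dataset_scores
]
input_ids, token_type_ids, attention_masks, scores = sentence_pair_processing(data, tokenizer)
loader = DataLoader(TensorDataset(input_ids, token_type_ids, attention_masks, scores),
                    batch_size=2)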
Example #4
class BertProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def __init__(self, vocab_path, do_lower_case):
        self.tokenizer = BertTokenizer(vocab_path, do_lower_case)

    def get_train(self, data_file):
        """Gets a collection of `InputExample`s for the train set."""
        return self.read_data(data_file)

    def get_dev(self, data_file):
        """Gets a collection of `InputExample`s for the dev set."""
        return self.read_data(data_file)

    def get_test(self, lines):
        """Gets a collection of lines for the test set."""
        return lines

    def get_labels(self):
        """Gets the list of labels for this data set."""
        return ["0", "1"]

    @classmethod
    def read_data(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
        if 'pkl' in str(input_file):
            lines = load_pickle(input_file)
        else:
            lines = input_file
        return lines

    def truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        # This is a simple heuristic which will always truncate the longer sequence
        # one token at a time. This makes more sense than truncating an equal percent
        # of tokens from each, since if one sequence is very short then each token
        # that's truncated likely contains more information than a longer sequence.
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= max_length:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

    def create_examples(self, lines, example_type, cached_examples_file):
        '''
        Creates examples for data
        '''
        pbar = ProgressBar(n_total=len(lines))
        if cached_examples_file.exists():
            logger.info("Loading examples from cached file %s", cached_examples_file)
            examples = torch.load(cached_examples_file)
        else:
            examples = []
            for i, line in enumerate(lines):
                guid = '%s-%d' % (example_type, i)
                text_a = line[0]
                label = line[1]
                text_b = None
                example = InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
                examples.append(example)
                pbar.batch_step(step=i, info={}, bar_type='create examples')
            logger.info("Saving examples into cached file %s", cached_examples_file)
            torch.save(examples, cached_examples_file)
        return examples

    def create_features(self, examples, max_seq_len, cached_features_file):
        '''
        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0   0   0   0  0     0   0
        '''
        pbar = ProgressBar(n_total=len(examples))
        if cached_features_file.exists():
            logger.info("Loading features from cached file %s", cached_features_file)
            features = torch.load(cached_features_file)
        else:
            features = []
            for ex_id, example in enumerate(examples):
                tokens_a = self.tokenizer.tokenize(example.text_a)
                tokens_b = None
                label_id = int(example.label)

                if example.text_b:
                    tokens_b = self.tokenizer.tokenize(example.text_b)
                    # Modifies `tokens_a` and `tokens_b` in place so that the total
                    # length is less than the specified length.
                    # Account for [CLS], [SEP], [SEP] with "- 3"
                    self.truncate_seq_pair(tokens_a, tokens_b, max_length=max_seq_len - 3)
                else:
                    # Account for [CLS] and [SEP] with '-2'
                    if len(tokens_a) > max_seq_len - 2:
                        tokens_a = tokens_a[:max_seq_len - 2]
                tokens = ['[CLS]'] + tokens_a + ['[SEP]']
                segment_ids = [0] * len(tokens)
                if tokens_b:
                    tokens += tokens_b + ['[SEP]']
                    segment_ids += [1] * (len(tokens_b) + 1)

                input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
                input_mask = [1] * len(input_ids)
                padding = [0] * (max_seq_len - len(input_ids))
                input_len = len(input_ids)

                input_ids += padding
                input_mask += padding
                segment_ids += padding

                assert len(input_ids) == max_seq_len
                assert len(input_mask) == max_seq_len
                assert len(segment_ids) == max_seq_len

                if ex_id < 2:
                    logger.info("*** Example ***")
                    logger.info(f"guid: {example.guid}" % ())
                    logger.info(f"tokens: {' '.join([str(x) for x in tokens])}")
                    logger.info(f"input_ids: {' '.join([str(x) for x in input_ids])}")
                    logger.info(f"input_mask: {' '.join([str(x) for x in input_mask])}")
                    logger.info(f"segment_ids: {' '.join([str(x) for x in segment_ids])}")

                feature = InputFeature(input_ids=input_ids,
                                       input_mask=input_mask,
                                       segment_ids=segment_ids,
                                       label_id=label_id,
                                       input_len=input_len)
                features.append(feature)
                pbar.batch_step(step=ex_id, info={}, bar_type='create features')
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)
        return features

    def create_dataset(self, features, is_sorted=False):
        # Convert to Tensors and build dataset
        if is_sorted:
            logger.info("sorted data by th length of input")
            features = sorted(features, key=lambda x: x.input_len, reverse=True)
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        return dataset
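The intended call sequence, sketched under the assumption that the surrounding module supplies `load_pickle`, `ProgressBar`, `InputExample`, and `InputFeature`, and that the pickle holds `(text, label)` rows; file paths are placeholders:

from pathlib import Path
from torch.utils.data import DataLoader

processor = BertProcessor(vocab_path="bert-base-uncased/vocab.txt", do_lower_case=True)
lines = processor.get_train("train.pkl")   # [(text, label), ...]
examples = processor.create_examples(
    lines, example_type="train",
    cached_examples_file=Path("cache/train_examples.pt"))
features = processor.create_features(
    examples, max_seq_len=128,
    cached_features_file=Path("cache/train_features.pt"))
loader = DataLoader(processor.create_dataset(features), batch_size=32, shuffle=True)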
Example #5
def main(log_in_file, lm_path, lm_type, data_path, usegpu, n_fold, total_step,
         eval_every, early_stop, lr, weight_decay, lr_decay_in_layers,
         wd_decay_in_layers, max_length, max_title_rate, content_head_rate,
         batch_size, lr_scheduler_type, input_pattern, clean_method,
         warmup_rate, classifier_dropout, classifier_active, seed):
    arg_name_value_pairs = deepcopy(locals())
    prefix = time.strftime('%Y%m%d_%H%M')
    logger = logging.getLogger('default')
    formatter = logging.Formatter("%(asctime)s %(message)s")
    if log_in_file:
        handler1 = logging.FileHandler(prefix + '.log')
        handler1.setFormatter(formatter)
        handler1.setLevel(logging.DEBUG)
        logger.addHandler(handler1)
    handler2 = logging.StreamHandler()
    handler2.setFormatter(formatter)
    handler2.setLevel(logging.DEBUG)
    logger.addHandler(handler2)
    logger.setLevel(logging.DEBUG)
    for arg_name, arg_value in arg_name_value_pairs.items():
        logger.info(f'{arg_name}: {arg_value}')
    global tokenizer
    if lm_type == 'bert':
        tokenizer = BertTokenizer(os.path.join(lm_path, 'vocab.txt'))
    else:
        tokenizer = XLNetTokenizer(os.path.join(lm_path, 'spiece.model'))
        global PAD, PAD_t, CLS_t, SEP_t
        PAD_t = '<pad>'
        CLS_t = '<cls>'
        SEP_t = '<sep>'
        PAD = tokenizer.convert_tokens_to_ids([PAD_t])[0]
    logger.info(f'padding token is {PAD}')
    processed_train = preprocess(
        os.path.join(data_path, 'Train_DataSet.csv'),
        os.path.join(data_path,
                     'Train_DataSet_Label.csv'), tokenizer, max_length,
        input_pattern, clean_method, max_title_rate, content_head_rate, logger)
    processed_test = preprocess(os.path.join(data_path, 'Test_DataSet.csv'),
                                False, tokenizer, max_length, input_pattern,
                                clean_method, max_title_rate,
                                content_head_rate, logger)
    logger.info('seed everything and create model')
    seed_everything(seed)
    no_decay = ['.bias', 'LayerNorm.bias', 'LayerNorm.weight']
    if lm_type == 'xlnet':
        model = XLNetForSequenceClassification.from_pretrained(
            lm_path, num_labels=3, summary_last_dropout=classifier_dropout)
        if classifier_active == 'relu':
            model.sequence_summary.activation = nn.ReLU()
        if usegpu:
            model = model.cuda()
        model_layer_names = [
            'transformer.mask_emb', 'transformer.word_embedding.weight'
        ]
        model_layer_names += [
            f'transformer.layer.{i}.' for i in range(model.config.n_layer)
        ]
        model_layer_names += ['sequence_summary.summary', 'logits_proj']
    else:
        model = BertForSequenceClassification.from_pretrained(
            lm_path, num_labels=3, hidden_dropout_prob=classifier_dropout)
        if classifier_active == 'relu':
            model.bert.pooler.activation = nn.ReLU()
        if usegpu:
            model = model.cuda()
        model_layer_names = ['bert.embeddings']
        model_layer_names += [
            'bert.encoder.layer.{}.'.format(i)
            for i in range(model.config.num_hidden_layers)
        ]
        model_layer_names += ['bert.pooler', 'classifier']
    optimizer = AdamW(
        [{
            'params': [
                p for n, p in model.named_parameters()
                if layer_name in n and not any(nd in n for nd in no_decay)
            ],
            'lr': lr * (lr_decay_in_layers ** i),
            'weight_decay': weight_decay * (wd_decay_in_layers ** i)
        } for i, layer_name in enumerate(model_layer_names[::-1])] +
        [{
            'params': [
                p for n, p in model.named_parameters()
                if layer_name in n and any(nd in n for nd in no_decay)
            ],
            'lr': lr * (lr_decay_in_layers ** i),
            'weight_decay': 0.0
        } for i, layer_name in enumerate(model_layer_names[::-1])])
    if lr_scheduler_type == 'linear':
        lr_scheduler = WarmupLinearSchedule(optimizer,
                                            warmup_steps=warmup_rate,
                                            t_total=total_step)
    elif lr_scheduler_type == 'constant':
        lr_scheduler = WarmupConstantSchedule(optimizer,
                                              warmup_steps=warmup_rate)
    else:
        raise ValueError

    model_state_0 = deepcopy(model.state_dict())
    optimizer_state_0 = deepcopy(optimizer.state_dict())

    test_iter = get_data_iter(processed_test,
                              batch_size * 4,
                              collect_test_func,
                              shuffle=False)
    pred = np.zeros((len(processed_test), 3))
    val_scores = []

    for fold_idx, (train_idx, val_idx) in enumerate(
            KFold(n_splits=n_fold, shuffle=True,
                  random_state=seed).split(processed_train)):
        model.load_state_dict(model_state_0)
        optimizer.load_state_dict(optimizer_state_0)
        if lr_scheduler_type == 'linear':
            lr_scheduler = WarmupLinearSchedule(optimizer,
                                                warmup_steps=warmup_rate,
                                                t_total=total_step)
        elif lr_scheduler_type == 'constant':
            lr_scheduler = WarmupConstantSchedule(optimizer,
                                                  warmup_steps=warmup_rate)
        else:
            raise ValueError
        train_iter = get_data_iter([processed_train[i] for i in train_idx],
                                   batch_size, collect_func)
        val_iter = get_data_iter([processed_train[i] for i in val_idx],
                                 batch_size * 4,
                                 collect_func,
                                 shuffle=False)

        best_model, best_score = training(model=model,
                                          optimizer=optimizer,
                                          lr_scheduler=lr_scheduler,
                                          train_iter=train_iter,
                                          val_iter=val_iter,
                                          total_step=total_step,
                                          tokenizer=tokenizer,
                                          usegpu=usegpu,
                                          eval_every=eval_every,
                                          logger=logger,
                                          early_stop=early_stop,
                                          fold_idx=fold_idx)
        model.load_state_dict(best_model)
        val_scores.append(best_score)
        pred += predict(model, test_iter, usegpu)
    logger.info(f'average: {np.mean(val_scores):.6f}')
    pred = pred / n_fold
    prob_df = pd.DataFrame()
    submit = pd.DataFrame()
    submit['id'] = [i['id'] for i in processed_test]
    submit['label'] = pred.argmax(-1)
    prob_df['id'] = [i['id'] for i in processed_test]
    prob_df['0'] = pred[:, 0]
    prob_df['1'] = pred[:, 1]
    prob_df['2'] = pred[:, 2]
    submit.to_csv(f'submit_{prefix}.csv', index=False)
    prob_df.to_csv(f'probability_{prefix}.csv', index=False)
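The optimizer construction in this example implements layer-wise learning-rate decay: the layer closest to the output keeps the full rate, and every layer below it is scaled by another factor of `lr_decay_in_layers` (no-decay parameters get the same schedule with zero weight decay). A self-contained sketch of the same idea on a toy model; the model and numbers are illustrative only:

import torch.nn as nn
from torch.optim import AdamW

model = nn.Sequential(nn.Linear(8, 8), nn.Linear(8, 8), nn.Linear(8, 2))
base_lr, decay = 1e-3, 0.9
layer_prefixes = [f"{i}." for i in range(len(model))]

# Reverse the layer order so depth 0 (the top layer) keeps the undecayed rate.
groups = [{"params": [p for n, p in model.named_parameters() if n.startswith(prefix)],
           "lr": base_lr * decay ** depth}
          for depth, prefix in enumerate(layer_prefixes[::-1])]
optimizer = AdamW(groups)
print([g["lr"] for g in optimizer.param_groups])  # approximately [0.001, 0.0009, 0.00081]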
Example #6
class BertGeneration(object):
    def __init__(self, model_directory, vocab_file, lower=False):

        # Load pre-trained model (weights)

        self.model = BertForMaskedLM.from_pretrained(model_directory)
        self.model.eval()
        self.cuda = torch.cuda.is_available()
        if self.cuda:
            self.model = self.model.cuda()

        # Load pre-trained model tokenizer (vocabulary)
        self.tokenizer = BertTokenizer(vocab_file=vocab_file,
                                       do_lower_case=lower)

        self.CLS = '[CLS]'
        self.SEP = '[SEP]'
        self.MASK = '[MASK]'
        self.mask_id = self.tokenizer.convert_tokens_to_ids([self.MASK])[0]
        self.sep_id = self.tokenizer.convert_tokens_to_ids([self.SEP])[0]
        self.cls_id = self.tokenizer.convert_tokens_to_ids([self.CLS])[0]

    def tokenize_batch(self, batch):
        return [self.tokenizer.convert_tokens_to_ids(sent) for sent in batch]

    def untokenize_batch(self, batch):
        return [self.tokenizer.convert_ids_to_tokens(sent) for sent in batch]

    def detokenize(self, sent):
        """ Roughly detokenizes (mainly undoes wordpiece) """
        new_sent = []
        for tok in sent:
            if tok.startswith("##"):
                new_sent[-1] += tok[2:]  # merge wordpiece continuations
            else:
                new_sent.append(tok)
        return new_sent

    def generate_step(self,
                      out,
                      gen_idx,
                      temperature=None,
                      top_k=0,
                      sample=False,
                      return_list=True):
        """ Generate a word from from out[gen_idx]
        
        args:
            - out (torch.Tensor): tensor of logits of size batch_size x seq_len x vocab_size
            - gen_idx (int): location for which to generate for
            - top_k (int): if >0, only sample from the top k most probable words
            - sample (Bool): if True, sample from full distribution. Overridden by top_k 
        """
        logits = out[:, gen_idx]
        if temperature is not None:
            logits = logits / temperature
        if top_k > 0:
            kth_vals, kth_idx = logits.topk(top_k, dim=-1)
            dist = torch.distributions.categorical.Categorical(logits=kth_vals)
            idx = kth_idx.gather(dim=1,
                                 index=dist.sample().unsqueeze(-1)).squeeze(-1)
        elif sample:
            dist = torch.distributions.categorical.Categorical(logits=logits)
            idx = dist.sample().squeeze(-1)
        else:
            idx = torch.argmax(logits, dim=-1)
        return idx.tolist() if return_list else idx

    def get_init_text(self, seed_text, max_len, batch_size=1, rand_init=False):
        """ Get initial sentence by padding seed_text with either masks or random words to max_len """
        batch = [
            seed_text + [self.MASK] * max_len + [self.SEP]
            for _ in range(batch_size)
        ]
        #if rand_init:
        #    for ii in range(max_len):
        #        init_idx[seed_len+ii] = np.random.randint(0, len(tokenizer.vocab))

        return self.tokenize_batch(batch)

    def printer(self, sent, should_detokenize=True):
        if should_detokenize:
            sent = self.detokenize(sent)[1:-1]
        print(" ".join(sent))

    # This is the meat of the algorithm. The general idea is
    # 1. start from all masks
    # 2. repeatedly pick a location, mask the token at that location, and generate from the probability distribution given by BERT
    # 3. stop when converged or tired of waiting

    # We consider three "modes" of generating:
    # - generate a single token for a position chosen uniformly at random for a chosen number of time steps
    # - generate in sequential order (L->R), one token at a time
    # - generate for all positions at once for a chosen number of time steps

    # The `generate` function wraps and batches these three generation modes. In practice, we find that the first leads to the most fluent samples.

    # Generation modes as functions

    def parallel_sequential_generation(self,
                                       seed_text,
                                       batch_size=10,
                                       max_len=15,
                                       top_k=0,
                                       temperature=None,
                                       max_iter=300,
                                       burnin=200,
                                       cuda=False,
                                       print_every=10,
                                       verbose=True):
        """ Generate for one random position at a timestep
        
        args:
            - burnin: during burn-in period, sample from full distribution; afterwards take argmax
        """
        seed_len = len(seed_text)
        batch = self.get_init_text(seed_text, max_len, batch_size)

        for ii in range(max_iter):
            kk = np.random.randint(0, max_len)
            for jj in range(batch_size):
                batch[jj][seed_len + kk] = self.mask_id
            inp = torch.tensor(batch).cuda() if cuda else torch.tensor(batch)
            out = self.model(inp)[0]
            topk = top_k if (ii >= burnin) else 0
            idxs = self.generate_step(out,
                                      gen_idx=seed_len + kk,
                                      top_k=topk,
                                      temperature=temperature,
                                      sample=(ii < burnin))
            for jj in range(batch_size):
                batch[jj][seed_len + kk] = idxs[jj]

            if verbose and np.mod(ii + 1, print_every) == 0:
                for_print = self.tokenizer.convert_ids_to_tokens(batch[0])
                for_print = for_print[:seed_len + kk + 1] + [
                    '(*)'
                ] + for_print[seed_len + kk + 1:]
                print("iter", ii + 1, " ".join(for_print))

        return self.untokenize_batch(batch)

    def parallel_generation(self,
                            seed_text,
                            batch_size=10,
                            max_len=15,
                            top_k=0,
                            temperature=None,
                            max_iter=300,
                            sample=True,
                            cuda=False,
                            print_every=10,
                            verbose=True):
        """ Generate for all positions at each time step """
        seed_len = len(seed_text)
        batch = self.get_init_text(seed_text, max_len, batch_size)

        for ii in range(max_iter):
            inp = torch.tensor(batch).cuda() if cuda else torch.tensor(batch)
            out = self.model(inp)[0]
            for kk in range(max_len):
                idxs = self.generate_step(out,
                                          gen_idx=seed_len + kk,
                                          top_k=top_k,
                                          temperature=temperature,
                                          sample=sample)
                for jj in range(batch_size):
                    batch[jj][seed_len + kk] = idxs[jj]

            if verbose and np.mod(ii, print_every) == 0:
                print("iter", ii + 1,
                      " ".join(self.tokenizer.convert_ids_to_tokens(batch[0])))

        return self.untokenize_batch(batch)

    def sequential_generation(self,
                              seed_text,
                              batch_size=10,
                              max_len=15,
                              leed_out_len=15,
                              top_k=0,
                              temperature=None,
                              sample=True,
                              cuda=False):
        """ Generate one word at a time, in L->R order """
        seed_len = len(seed_text)
        batch = self.get_init_text(seed_text, max_len, batch_size)

        for ii in range(max_len):
            inp = [
                sent[:seed_len + ii + leed_out_len] + [self.sep_id]
                for sent in batch
            ]
            inp = torch.tensor(inp).cuda() if cuda else torch.tensor(inp)
            out = self.model(inp)[0]
            idxs = self.generate_step(out,
                                      gen_idx=seed_len + ii,
                                      top_k=top_k,
                                      temperature=temperature,
                                      sample=sample)
            for jj in range(batch_size):
                batch[jj][seed_len + ii] = idxs[jj]

        return self.untokenize_batch(batch)

    def generate(self,
                 n_samples,
                 seed_text="[CLS]",
                 batch_size=10,
                 max_len=25,
                 generation_mode="parallel-sequential",
                 sample=True,
                 top_k=100,
                 temperature=1.0,
                 burnin=200,
                 max_iter=500,
                 cuda=False,
                 print_every=1,
                 leed_out_len=15):
        # main generation function to call
        sentences = []
        n_batches = math.ceil(n_samples / batch_size)
        start_time = time.time()
        for batch_n in range(n_batches):
            if generation_mode == "parallel-sequential":
                batch = self.parallel_sequential_generation(
                    seed_text,
                    batch_size=batch_size,
                    max_len=max_len,
                    top_k=top_k,
                    temperature=temperature,
                    burnin=burnin,
                    max_iter=max_iter,
                    cuda=cuda,
                    verbose=False)
            elif generation_mode == "sequential":
                batch = self.sequential_generation(seed_text,
                                                   batch_size=batch_size,
                                                   max_len=max_len,
                                                   top_k=top_k,
                                                   temperature=temperature,
                                                   leed_out_len=leed_out_len,
                                                   sample=sample,
                                                   cuda=cuda)
            elif generation_mode == "parallel":
                batch = self.parallel_generation(seed_text,
                                                 batch_size=batch_size,
                                                 max_len=max_len,
                                                 top_k=top_k,
                                                 temperature=temperature,
                                                 sample=sample,
                                                 max_iter=max_iter,
                                                 cuda=cuda,
                                                 verbose=False)
            else:
                raise ValueError(f"Unknown generation_mode: {generation_mode}")

            if (batch_n + 1) % print_every == 0:
                print("Finished batch %d in %.3fs" %
                      (batch_n + 1, time.time() - start_time))
                start_time = time.time()

            sentences += batch
        return sentences
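A usage sketch; the checkpoint directory and vocab file are placeholders, and `seed_text` is passed as a list of tokens:

generator = BertGeneration(model_directory="bert-base-uncased",
                           vocab_file="bert-base-uncased/vocab.txt")
sents = generator.generate(n_samples=10,
                           seed_text=["[CLS]"],
                           generation_mode="parallel-sequential",
                           max_len=20, top_k=100, temperature=1.0,
                           burnin=150, max_iter=250,
                           cuda=generator.cuda)
for sent in sents[:3]:
    generator.printer(sent, should_detokenize=True)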
Example #7
class BertDetector(object):
    def __init__(self,
                 bert_model_dir=config.bert_model_dir,
                 bert_model_vocab=config.bert_model_vocab,
                 threshold=0.1):
        self.name = 'bert_detector'
        self.bert_model_dir = bert_model_dir
        self.bert_model_vocab = bert_model_vocab
        self.initialized_bert_detector = False
        self.threshold = threshold

    def check_bert_detector_initialized(self):
        if not self.initialized_bert_detector:
            self.initialize_bert_detector()

    def initialize_bert_detector(self):
        t1 = time.time()
        self.bert_tokenizer = BertTokenizer(vocab_file=self.bert_model_vocab)
        self.MASK_TOKEN = "[MASK]"
        self.MASK_ID = self.bert_tokenizer.convert_tokens_to_ids(
            [self.MASK_TOKEN])[0]
        # Prepare model
        self.model = BertForMaskedLM.from_pretrained(self.bert_model_dir)
        logger.debug("Loaded model ok, path: %s, spend: %.3f s." %
                     (self.bert_model_dir, time.time() - t1))
        self.initialized_bert_detector = True

    def _convert_sentence_to_detect_features(self, sentence):
        """Loads a sentence into a list of `InputBatch`s."""
        self.check_bert_detector_initialized()
        features = []
        tokens = self.bert_tokenizer.tokenize(sentence)
        token_ids = self.bert_tokenizer.convert_tokens_to_ids(tokens)
        for idx, token_id in enumerate(token_ids):
            masked_lm_labels = [-1] * len(token_ids)
            masked_lm_labels[idx] = token_id
            features.append(
                InputFeatures(input_ids=token_ids,
                              masked_lm_labels=masked_lm_labels,
                              input_tokens=tokens,
                              id=idx,
                              token=tokens[idx]))
        return features

    def predict_token_prob(self, sentence):
        self.check_bert_detector_initialized()
        result = []
        eval_features = self._convert_sentence_to_detect_features(sentence)

        for f in eval_features:
            input_ids = torch.tensor([f.input_ids])
            masked_lm_labels = torch.tensor([f.masked_lm_labels])
            outputs = self.model(input_ids, masked_lm_labels=masked_lm_labels)
            masked_lm_loss, predictions = outputs[:2]
            prob = np.exp(-masked_lm_loss.item())
            result.append([prob, f])
        return result

    def detect(self, sentence):
        """
        Detect suspected errors in a sentence.
        :param sentence: sentence text
        :return: list[list], [error_word, begin_pos, end_pos, error_type]
        """
        maybe_errors = []
        for prob, f in self.predict_token_prob(sentence):
            logger.debug('prob:%s, token:%s, idx:%s' % (prob, f.token, f.id))
            if prob < self.threshold:
                maybe_errors.append([f.token, f.id, f.id + 1, ErrorType.char])
        return maybe_errors
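A usage sketch, assuming `config.bert_model_dir` and `config.bert_model_vocab` point at a Chinese masked-LM checkpoint and that `InputFeatures` and `ErrorType` come from the surrounding module; the sentence contains a deliberate misspelling:

detector = BertDetector(threshold=0.1)
for error_word, begin_pos, end_pos, error_type in detector.detect("少先队员因该为老人让座"):
    print(error_word, begin_pos, end_pos, error_type)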
Example #8
def main(args):

    assert args.use_one_optim is True

    if args.recover_e > 0:
        raise NotImplementedError("This option is from my oldest code version. "
                                  "I have not checked it for this code version.")

    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)
        print("### mkdir {:}".format(args.save_dir))

    def worker_init_fn(worker_id):
        np.random.seed(args.random_seed + worker_id)

    n_gpu = 0
    if torch.cuda.is_available() and (not args.use_cpu):
        n_gpu = torch.cuda.device_count()
        device = torch.device('cuda')
        print("### Device: {:}".format(device))
    else:
        print("### Use CPU (Debugging)")
        device = torch.device("cpu")

    if args.random_seed < 0:
        print("### Pick a random seed")
        args.random_seed = random.sample(list(range(1, 100000)), 1)[0]

    print("### Random Seed: {:}".format(args.random_seed))
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)
    rng = random.Random(args.random_seed)
    torch.manual_seed(args.random_seed)

    if n_gpu > 0:
        if args.random_seed >= 0:
            torch.cuda.manual_seed(args.random_seed)
            torch.cuda.manual_seed_all(args.random_seed)

        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    ontology = json.load(open(args.ontology_data))
    slot_meta, ontology = make_slot_meta(ontology)
    op2id = OP_SET[args.op_code]
    print(op2id)

    tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True)

    train_path = os.path.join(args.data_root, "train.pt")
    train_data_raw = torch.load(train_path)[:5000]
    print("# train examples %d" % len(train_data_raw))

    test_path = os.path.join(args.data_root, "test.pt")
    test_data_raw = torch.load(test_path)
    print("# test examples %d" % len(test_data_raw))

    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = args.dropout
    model_config.attention_probs_dropout_prob = args.attention_probs_dropout_prob
    model_config.hidden_dropout_prob = args.hidden_dropout_prob

    type_vocab_size = 4
    dec_config = args
    model = TransformerDST(model_config, dec_config, len(op2id), len(domain2id),
                           op2id['update'],
                           tokenizer.convert_tokens_to_ids(['[MASK]'])[0],
                           tokenizer.convert_tokens_to_ids(['[SEP]'])[0],
                           tokenizer.convert_tokens_to_ids(['[PAD]'])[0],
                           tokenizer.convert_tokens_to_ids(['-'])[0],
                           type_vocab_size, args.exclude_domain)

    test_epochs = [int(e) for e in args.load_epoch.strip().lower().split('-')]
    for best_epoch in test_epochs:
        print("### Epoch {:}...".format(best_epoch))
        sys.stdout.flush()
        ckpt_path = os.path.join(args.save_dir, 'model.e{:}.bin'.format(best_epoch))
        ckpt = torch.load(ckpt_path, map_location='cpu')
        model.load_state_dict(ckpt)
        model.to(device)

        # eval_res = model_evaluation(model, train_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
        #                             use_full_slot=args.use_full_slot, use_dt_only=args.use_dt_only, no_dial=args.no_dial, n_gpu=n_gpu,
        #                             is_gt_op=False, is_gt_p_state=False, is_gt_gen=False)
        #
        # print("### Epoch {:} Train Score : ".format(best_epoch), eval_res)
        # print('\n'*2)
        # sys.stdout.flush()

        eval_res = model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
                                    use_full_slot=args.use_full_slot, use_dt_only=args.use_dt_only, no_dial=args.no_dial, n_gpu=n_gpu,
                                    is_gt_op=False, is_gt_p_state=False, is_gt_gen=False)

        print("### Epoch {:} Test Score : ".format(best_epoch), eval_res)
        print('\n'*2)
        sys.stdout.flush()
Example #9
class BertProcessor(object):
    def __init__(self, vocab_path, do_lower_case, min_freq_words=None):
        self.tokenizer = BertTokenizer(vocab_file=vocab_path, do_lower_case=do_lower_case)

    def get_labels(self):
        return ['[CLS]', '[SEP]', 'O',
                'B-NIHSS', 'B-Measurement', 'B-TemporalConstraint',
                'B-1a_LOC', 'B-1b_LOCQuestions', 'B-1c_LOCCommands',
                'B-2_BestGaze', 'B-3_Visual', 'B-4_FacialPalsy',
                'B-56_Motor', 'B-5_Motor', 'B-5a_LeftArm', 'B-5b_RightArm',
                'B-6_Motor', 'B-6a_LeftLeg', 'B-6b_RightLeg',
                'B-7_LimbAtaxia', 'B-8_Sensory', 'B-9_BestLanguage',
                'B-10_Dysarthria', 'B-11_ExtinctionInattention',
                'I-NIHSS', 'I-Measurement', 'I-TemporalConstraint',
                'I-1a_LOC', 'I-1b_LOCQuestions', 'I-1c_LOCCommands',
                'I-2_BestGaze', 'I-3_Visual', 'I-4_FacialPalsy',
                'I-56_Motor', 'I-5_Motor', 'I-5a_LeftArm', 'I-5b_RightArm',
                'I-6_Motor', 'I-6a_LeftLeg', 'I-6b_RightLeg',
                'I-7_LimbAtaxia', 'I-8_Sensory', 'I-9_BestLanguage',
                'I-10_Dysarthria', 'I-11_ExtinctionInattention']

    def create_examples(self, lines, example_type):
        examples = []
        for i, line in enumerate(lines):
            hadm_id = line['HADM_ID']
            guid = '%s-%s-%d' % (example_type, hadm_id, i)
            sentence = line['token']  # list of tokens
            sentence = [' ' if isinstance(t, float) else t for t in sentence]
            label = line['tags']  # list
            code = line['code']
            # text_a: string. The untokenized text of the first sequence. For single
            # sequence tasks, only this sequence must be specified.
            text_a = ' '.join(sentence) # string
            text_b = None
            examples.append(InputExample(guid=guid, text_a=text_a,text_b=text_b, label=label, code=code))
        return examples

    def create_features(self, examples, max_seq_len):
        label_list = self.get_labels()
        label2id = {label:i for i, label in enumerate(label_list)}

        features = []
        for example_id, example in enumerate(examples):
            text_list = example.text_a.split(' ')
            labels = example.label
            code_list = example.code

            new_tokens = [] # tokens
            new_segment_ids =[]
            new_label_ids = []
            new_code = []

            new_tokens.append('[CLS]')
            new_segment_ids.append(0)
            new_label_ids.append(label2id['[CLS]'])
            new_code.append('0')
            
            for text, label, code in zip(text_list, labels, code_list):
                if text == '<CRLF>':
                    continue
                else:
                    token_list = self.tokenizer.tokenize(text)
                    for idx, token in enumerate(token_list):
                        new_tokens.append(token)
                        new_segment_ids.append(0)
                        if idx == 0:
                            new_label_ids.append(label2id[label])
                            new_code.append(code)
                        elif label == 'O':
                            new_label_ids.append(label2id[label])
                            new_code.append(code)
                        else:
                            temp_l = 'I-'+label.split('-')[1]
                            new_label_ids.append(label2id[temp_l])
                            new_code.append(code)

            assert len(new_tokens) == len(new_segment_ids)
            assert len(new_tokens) == len(new_label_ids)
            assert len(new_tokens) == len(new_code)

            if len(new_tokens) >= max_seq_len:
                # Truncate, reserving the last position for [SEP].
                new_tokens = new_tokens[:max_seq_len - 1]
                new_segment_ids = new_segment_ids[:max_seq_len - 1]
                new_label_ids = new_label_ids[:max_seq_len - 1]
                new_code = new_code[:max_seq_len - 1]

            new_tokens.append('[SEP]')
            new_segment_ids.append(0)
            new_label_ids.append(label2id['[SEP]'])
            new_code.append('0')

            input_ids = self.tokenizer.convert_tokens_to_ids(new_tokens)
            input_mask = [1] * len(input_ids)
            input_len = len(new_label_ids)

            if len(input_ids) < max_seq_len:
                pad_zero = [0] * (max_seq_len - len(input_ids))
                input_ids.extend(pad_zero)
                input_mask.extend(pad_zero)
                new_segment_ids.extend(pad_zero)
                new_label_ids.extend(pad_zero)
                new_code.extend(['0']* len(pad_zero))

            assert len(input_ids) == max_seq_len
            assert len(input_mask) == max_seq_len
            assert len(new_segment_ids) == max_seq_len
            assert len(new_label_ids) == max_seq_len
            assert len(new_code) == max_seq_len

            df_temp = pd.DataFrame({'input_ids':input_ids, 'code':new_code})
            agg_fun = lambda s: ( max(s['code']), s.index.tolist()[0], s.index.tolist()[-1])
            groupby_code = df_temp.groupby('code').apply(agg_fun)
            code_position = {}
            for key, start, end in groupby_code:
                if key != '0':
                    code_position[key] = (start, end)
                else:
                    continue

            features.append(
                InputFeature(
                    input_ids = input_ids,
                    input_mask = input_mask,
                    segment_ids = new_segment_ids,
                    label_id = new_label_ids,
                    input_len = input_len,
                    code = new_code,
                    code_position = code_position
            ))

        return features
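A minimal driver, assuming the surrounding module defines `InputExample` and `InputFeature` with the fields used above; the row below is illustrative, shaped like the dicts `create_examples` reads (`HADM_ID`, `token`, `tags`, `code`):

processor = BertProcessor(vocab_path="bert-base-cased/vocab.txt", do_lower_case=False)
lines = [{
    'HADM_ID': '100001',
    'token': ['NIHSS', 'score', 'was', '2'],
    'tags': ['B-NIHSS', 'O', 'O', 'B-Measurement'],
    'code': ['c1', '0', '0', 'c2'],
}]
examples = processor.create_examples(lines, example_type='train')
features = processor.create_features(examples, max_seq_len=32)
print(features[0].code_position)   # e.g. {'c1': (1, 1), 'c2': (4, 4)}, depending on tokenization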
Example #10
class RuleBertWordDetector(object):
    def __init__(self,
                 language_model_path=config.language_model_path,
                 word_freq_path=config.word_freq_path,
                 char_freq_path=config.char_freq_path,
                 custom_word_freq_path=config.custom_word_freq_path,
                 custom_confusion_path=config.custom_confusion_path,
                 person_name_path=config.person_name_path,
                 place_name_path=config.place_name_path,
                 stopwords_path=config.stopwords_path,
                 bert_model_dir=config.bert_model_dir,
                 bert_model_vocab=config.bert_model_vocab,
                 threshold=0.1):
        self.name = 'rule_bert_word_detector'
        self.language_model_path = language_model_path
        self.word_freq_path = word_freq_path
        self.char_freq_path = char_freq_path
        self.custom_word_freq_path = custom_word_freq_path
        self.custom_confusion_path = custom_confusion_path
        self.person_name_path = person_name_path
        self.place_name_path = place_name_path
        self.stopwords_path = stopwords_path
        self.is_char_error_detect = True
        self.is_word_error_detect = True
        self.is_redundancy_miss_error_detect = True
        self.initialized_detector = False
        self.bert_model_dir = bert_model_dir
        self.bert_model_vocab = bert_model_vocab
        self.threshold = threshold

    def initialize_detector(self):
        t1 = time.time()
        try:
            import kenlm
        except ImportError:
            raise ImportError(
                'mypycorrector dependencies are not fully installed; '
                'kenlm is required for the statistical language model. '
                'Please run "pip install kenlm" to install it. '
                'On Windows, install kenlm under Cygwin.')

        self.lm = kenlm.Model(self.language_model_path)
        logger.debug('Loaded language model: %s, spend: %s s' %
                     (self.language_model_path, str(time.time() - t1)))

        # Word and character frequency dictionaries
        t2 = time.time()
        self.word_freq = self.load_word_freq_dict(self.word_freq_path)
        self.char_freq = self.load_char_freq_dict(self.char_freq_path)
        t3 = time.time()
        logger.debug(
            'Loaded word freq, char freq file: %s, size: %d, spend: %s s' %
            (self.word_freq_path, len(self.word_freq), str(t3 - t2)))
        # Custom confusion set
        self.custom_confusion = self._get_custom_confusion_dict(
            self.custom_confusion_path)
        t4 = time.time()
        logger.debug('Loaded confusion file: %s, size: %d, spend: %s s' %
                     (self.custom_confusion_path, len(
                         self.custom_confusion), str(t4 - t3)))
        # Custom word segmentation dictionary
        self.custom_word_freq = self.load_word_freq_dict(
            self.custom_word_freq_path)
        self.person_names = self.load_word_freq_dict(self.person_name_path)
        self.place_names = self.load_word_freq_dict(self.place_name_path)
        self.stopwords = self.load_word_freq_dict(self.stopwords_path)
        # Merge the segmentation dictionary with the custom dictionaries
        self.custom_word_freq.update(self.person_names)
        self.custom_word_freq.update(self.place_names)
        self.custom_word_freq.update(self.stopwords)

        self.word_freq.update(self.custom_word_freq)
        t5 = time.time()
        logger.debug('Loaded custom word file: %s, size: %d, spend: %s s' %
                     (self.custom_confusion_path, len(
                         self.custom_word_freq), str(t5 - t4)))
        self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                                   custom_word_freq_dict=self.custom_word_freq,
                                   custom_confusion_dict=self.custom_confusion)
        # Pre-trained BERT model
        t6 = time.time()
        self.bert_tokenizer = BertTokenizer(vocab_file=self.bert_model_vocab)
        self.MASK_TOKEN = "[MASK]"
        self.MASK_ID = self.bert_tokenizer.convert_tokens_to_ids(
            [self.MASK_TOKEN])[0]
        # Prepare model
        self.model = BertForMaskedLM.from_pretrained(self.bert_model_dir)
        logger.debug("Loaded model ok, path: %s, spend: %.3f s." %
                     (self.bert_model_dir, time.time() - t6))
        self.initialized_detector = True

    def check_detector_initialized(self):
        if not self.initialized_detector:
            self.initialize_detector()

    def enable_char_error(self, enable=True):
        """
        Enable or disable character-level error detection.
        :param enable:
        :return:
        """
        self.is_char_error_detect = enable

    def enable_word_error(self, enable=True):
        """
        Enable or disable word-level error detection.
        :param enable:
        :return:
        """
        self.is_word_error_detect = enable

    def enable_redundancy_miss_error(self, enable=True):
        """
        Enable or disable redundancy/missing-character error detection.
        :param enable:
        :return:
        """
        self.is_redundancy_miss_error_detect = enable

    def _convert_sentence_to_detect_features(self, sentence):
        """Loads a sentence into a list of `InputBatch`s."""
        self.check_detector_initialized()
        features = []
        tokens = self.bert_tokenizer.tokenize(sentence)
        token_ids = self.bert_tokenizer.convert_tokens_to_ids(tokens)
        for idx, token_id in enumerate(token_ids):
            masked_lm_labels = [-1] * len(token_ids)
            masked_lm_labels[idx] = token_id
            features.append(
                InputFeatures(input_ids=token_ids,
                              masked_lm_labels=masked_lm_labels,
                              input_tokens=tokens,
                              id=idx,
                              token=tokens[idx]))
        return features

    # Use BERT to predict possibly erroneous characters
    def predict_token_prob(self, sentence):
        self.check_detector_initialized()
        result = []
        eval_features = self._convert_sentence_to_detect_features(sentence)

        for f in eval_features:
            input_ids = torch.tensor([f.input_ids])
            masked_lm_labels = torch.tensor([f.masked_lm_labels])
            outputs = self.model(input_ids, masked_lm_labels=masked_lm_labels)
            masked_lm_loss, predictions = outputs[:2]
            prob = np.exp(-masked_lm_loss.item())
            result.append([prob, f])
        return result

    @staticmethod
    def load_word_freq_dict(path):
        """
        Load the word segmentation dictionary.
        :param path:
        :return:
        """
        word_freq = {}
        with codecs.open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line.startswith('#'):
                    continue
                info = line.split()
                if len(info) < 1:
                    continue
                word = info[0]
                # Word frequency, defaults to 1
                freq = int(info[1]) if len(info) > 1 else 1
                word_freq[word] = freq
        return word_freq

    @staticmethod
    def load_char_freq_dict(path):
        """
        Load the common-character fragment dictionary.
        :param path:
        :return:
        """
        char_freq = {}
        with codecs.open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line.startswith('#'):
                    continue
                info = line.split()
                if len(info) < 1:
                    continue
                char = info[0]
                # Character frequency, defaults to 1
                freq = int(info[1]) if len(info) > 1 else 1
                char_freq[char] = freq
        return char_freq

    def _get_custom_confusion_dict(self, path):
        """
        Load the custom confusion dictionary.
        :param path:
        :return: dict, {variant: origin}, eg: {"交通先行": "交通限行"}
        """
        confusion = {}
        with codecs.open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line.startswith('#'):
                    continue
                info = line.split()
                if len(info) < 2:
                    continue
                variant = info[0]
                origin = info[1]
                freq = int(info[2]) if len(info) > 2 else 1
                self.word_freq[origin] = freq
                confusion[variant] = origin
        return confusion

    def ngram_score(self, chars):
        """
        取n元文法得分
        :param chars: list, 以词或字切分
        :return:
        """
        self.check_detector_initialized()
        return self.lm.score(' '.join(chars), bos=False, eos=False)

    def ppl_score(self, words):
        """
        Return the language-model perplexity; lower means a more fluent sentence.
        :param words: list, split by word or by character
        :return:
        """
        self.check_detector_initialized()
        return self.lm.perplexity(' '.join(words))

    def word_frequency(self, word):
        """
        Return the frequency of a word in the corpus.
        :param word:
        :return:
        """
        self.check_detector_initialized()
        return self.word_freq.get(word, 0)

    def set_word_frequency(self, word, num):
        """
        Update the frequency of a word in the corpus.
        """
        self.check_detector_initialized()
        self.word_freq[word] = num
        return self.word_freq

    @staticmethod
    def _check_contain_error(maybe_err, maybe_errors):
        """
        Check whether the error set (maybe_errors) already covers this error span (maybe_err).
        :param maybe_err: [error_word, begin_pos, end_pos, error_type]
        :param maybe_errors:
        :return:
        """
        error_word_idx = 0
        begin_idx = 1
        end_idx = 2
        for err in maybe_errors:
            if maybe_err[error_word_idx] in err[error_word_idx] and maybe_err[begin_idx] >= err[begin_idx] and \
                    maybe_err[end_idx] <= err[end_idx]:
                return True
        return False

    def _add_maybe_error_item(self, maybe_err, maybe_errors):
        """
        Add a new suspected error.
        :param maybe_err:
        :param maybe_errors:
        :return:
        """
        if maybe_err not in maybe_errors and not self._check_contain_error(
                maybe_err, maybe_errors):
            maybe_errors.append(maybe_err)

    @staticmethod
    def is_filter_token(token):
        result = False
        # pass blank
        if not token.strip():
            result = True
        # pass punctuation
        if token in PUNCTUATION_LIST:
            result = True
        # pass num
        if token.isdigit():
            result = True
        # pass alpha
        if is_alphabet_string(token.lower()):
            result = True
        return result

    def detect(self, sentence):
        """
        Detect suspected errors in the sentence, as [word, position, error type].
        :param sentence:
        :return: list[list], [error_word, begin_pos, end_pos, error_type]
        """
        maybe_errors = []
        if not sentence.strip():
            return maybe_errors
        # Initialization
        self.check_detector_initialized()
        # Text normalization
        sentence = uniform(sentence)
        # Word segmentation
        tokens = self.tokenizer.tokenize(sentence)
        self.tokens = [token[0] for token in tokens]
        # print(tokens)
        # Add custom confusion-set entries to the suspected-error list
        # for confuse in self.custom_confusion:
        #     idx = sentence.find(confuse)
        #     if idx > -1:
        #         maybe_err = [confuse, idx, idx +
        #                      len(confuse), ErrorType.confusion]
        #         self._add_maybe_error_item(maybe_err, maybe_errors)

        if self.is_word_error_detect:
            # add out-of-vocabulary words to the candidate errors
            for word, begin_idx, end_idx in tokens:
                # pass filter word
                if self.is_filter_token(word):
                    continue
                # skip words already in the frequency dict
                if word in self.word_freq:
                    if self.is_redundancy_miss_error_detect:
                        # multi-char words, or single chars with a frequency
                        # above 50000, can be skipped
                        if len(word) == 1 and word in self.char_freq \
                                and self.char_freq.get(word) < 50000:
                            maybe_err = [
                                word, begin_idx, end_idx, ErrorType.word_char
                            ]
                            self._add_maybe_error_item(maybe_err, maybe_errors)
                            continue
                        # repeated adjacent character: possibly a redundant character
                        if len(word) == 1 and sentence[begin_idx - 1] == word:
                            maybe_err = [
                                word, begin_idx, end_idx, ErrorType.redundancy
                            ]
                            self._add_maybe_error_item(maybe_err, maybe_errors)
                            continue
                    continue
                # check single-character fragments: possibly redundant,
                # missing, or wrong characters
                if self.is_redundancy_miss_error_detect:
                    if len(word) == 1:
                        maybe_err = [
                            word, begin_idx, end_idx, ErrorType.word_char
                        ]
                        self._add_maybe_error_item(maybe_err, maybe_errors)
                        continue
                maybe_err = [word, begin_idx, end_idx, ErrorType.word]
                self._add_maybe_error_item(maybe_err, maybe_errors)

        if self.is_char_error_detect:
            # use the language model to detect suspected wrong characters
            try:
                for prob, f in self.predict_token_prob(sentence):
                    # logger.debug('prob:%s, token:%s, idx:%s' % (prob, f.token, f.id))
                    if prob < self.threshold:
                        maybe_err = [f.token, f.id, f.id + 1, ErrorType.char]
                        self._add_maybe_error_item(maybe_err, maybe_errors)
            except IndexError as ie:
                logger.warning("index error, sentence: %s, %s" % (sentence, ie))
            except Exception as e:
                logger.warning("detect error, sentence: %s, %s" % (sentence, e))
        return sorted(maybe_errors, key=lambda k: k[1], reverse=False)
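
A minimal usage sketch for the detector class above. Its name and constructor are not shown in this excerpt, so `Detector()` below is an assumption, as are the example sentences:

d = Detector()  # hypothetical name/constructor; the real __init__ is not shown above
for word, begin, end, err_type in d.detect('少先队员因该为老人让座'):
    print(word, begin, end, err_type)
# lower perplexity means a more fluent sentence:
print(d.ppl_score(list('少先队员应该为老人让座')))
print(d.word_frequency('让座'))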
Ejemplo n.º 11
0
class BertGeneration(object):
    def __init__(self, model_directory, vocab_file, lower=False):

        # Load pre-trained model (weights)

        self.model = BertForMaskedLM.from_pretrained(model_directory)
        self.model.eval()
        self.cuda = torch.cuda.is_available()
        if self.cuda:
            self.model = self.model.cuda()

        # Load pre-trained model tokenizer (vocabulary)
        self.tokenizer = BertTokenizer(vocab_file=vocab_file,
                                       do_lower_case=lower)

        self.CLS = '[CLS]'
        self.SEP = '[SEP]'
        self.MASK = '[MASK]'
        self.mask_id = self.tokenizer.convert_tokens_to_ids([self.MASK])[0]
        self.sep_id = self.tokenizer.convert_tokens_to_ids([self.SEP])[0]
        self.cls_id = self.tokenizer.convert_tokens_to_ids([self.CLS])[0]

    def tokenize_batch(self, batch):
        return [self.tokenizer.convert_tokens_to_ids(sent) for sent in batch]

    def untokenize_batch(self, batch):
        return [self.tokenizer.convert_ids_to_tokens(sent) for sent in batch]

    def detokenize(self, sent):
        """ Roughly detokenizes (mainly undoes wordpiece) """
        new_sent = []
        for i, tok in enumerate(sent):
            if tok.startswith("##"):
                new_sent[-1] += tok[2:]
            else:
                new_sent.append(tok)
        return new_sent
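    # e.g. detokenize(['[CLS]', 'un', '##believ', '##able', '[SEP]'])
    #   -> ['[CLS]', 'unbelievable', '[SEP]']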

    def printer(self, sent, should_detokenize=True):
        if should_detokenize:
            sent = self.detokenize(sent)[1:-1]
        print(" ".join(sent))

    def predict_masked(self, sent):
        tokens = ['[CLS]'] + sent + ['[SEP]']
        target_indices = [i for i, x in enumerate(tokens) if x == '[MASK]']
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        tens = torch.LongTensor(input_ids).unsqueeze(0)
        if self.cuda:
            tens = tens.cuda()
        try:
            res = self.model(tens)[0]
        except RuntimeError:  # Error in the model vocabulary; remove when a correct model is trained
            return None
        target_tensor = torch.LongTensor(target_indices)
        if self.cuda:
            target_tensor = target_tensor.cuda()
        res = torch.index_select(res, 1, target_tensor)
        # keep the top-5 candidate token ids for each masked position
        res = torch.narrow(torch.argsort(res, dim=-1, descending=True), -1, 0, 5)

        predicted = []
        for mask in res[0]:
            candidates = self.tokenizer.convert_ids_to_tokens(
                [i.item() for i in mask])

            predicted.append(candidates)

        return predicted
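
A hedged usage sketch of BertGeneration.predict_masked; the model directory and vocab path below are placeholders, not taken from the original:

bert = BertGeneration('models/bert-base-uncased', 'models/vocab.txt', lower=True)
tokens = bert.tokenizer.tokenize('the capital of france is paris .')
tokens[5] = bert.MASK  # mask "paris"
top5 = bert.predict_masked(tokens)  # one top-5 candidate list per [MASK]
print(top5)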
Ejemplo n.º 12
0
def preprocess_data(entities_json,
                    article_texts,
                    tokenizer: BertTokenizer,
                    for_train: bool = True):
    """
    [{
        'sent': xxx, 'entity_name': yyy, 'entity_type': zzz, 'start_token_id': 0, 'end_token_id': 5,
        'start_index': 0, 'end_index': 2, 
            'sent_tokens': ['token1', 'token2'], 'entity_tokens': ['token3', 'token4']
    }]
    """

    preprocessed_datas = []

    all_sents = []
    for article in tqdm.tqdm([Article(t) for t in article_texts]):
        for para_text in article.para_texts:
            for sent in article.split_into_sentence(para_text):
                sent_tokens = list(sent)
                entity_labels = []
                for entity_type, entities in entities_json.items():
                    for entity_name in entities:
                        if entity_name not in sent:
                            continue
                        all_sents.append(sent)
                        start_end_indexes = _find_all_start_end(
                            sent, entity_name)
                        assert len(start_end_indexes) >= 1
                        for str_start_index, str_end_index in start_end_indexes:
                            entity_tokens = list(entity_name)

                            one_entity_label = {
                                'entity_type': entity_type,
                                'start_token_id': str_start_index,
                                'end_token_id': str_end_index,
                                'start_index': str_start_index,
                                'end_index': str_end_index,
                                'entity_tokens': entity_tokens,
                                'entity_name': entity_name
                            }
                            entity_labels.append(one_entity_label)

                if not entity_labels:
                    tags = [O for _ in range(len(sent_tokens))]
                    tag_ids = [tag2idx[O] for _ in range(len(sent_tokens))]
                    if for_train:
                        continue
                else:
                    tags = []
                    tag_ids = []
                    for sent_token_index in range(len(sent_tokens)):
                        tag = O
                        for entity_label in entity_labels:
                            if sent_token_index == entity_label['start_token_id']:
                                tag = f'B-{chinese_entity_type_vs_english_entity_type[entity_label["entity_type"]]}'
                            elif entity_label['start_token_id'] < sent_token_index < entity_label['end_token_id']:
                                tag = f'I-{chinese_entity_type_vs_english_entity_type[entity_label["entity_type"]]}'
                        tag_id = tag2idx[tag]
                        tags.append(tag)
                        tag_ids.append(tag_id)
                assert len(sent_tokens) == len(tags) == len(tag_ids)
                not_o_indexes = [
                    index for index, tag in enumerate(tags) if tag != O
                ]
                # debug helpers; not referenced later in this function
                all_entities = [sent_tokens[index] for index in not_o_indexes]
                all_entities2 = entity_labels

                preprocessed_datas.append({
                    'sent': sent,
                    'sent_tokens': sent_tokens,
                    'sent_token_ids': tokenizer.convert_tokens_to_ids(sent_tokens),
                    'entity_labels': entity_labels,
                    'tags': tags,
                    'tag_ids': tag_ids
                })
    return preprocessed_datas
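
A sketch of what the preprocessed records look like for a single sentence; the entity dictionary, sentence, and tag names below are illustrative assumptions, and the B-/I- labels depend on the type mapping:

entities_json = {'疾病': ['肺炎']}   # hypothetical entity type -> entity names
article_texts = ['患者确诊肺炎。']    # hypothetical article text
data = preprocess_data(entities_json, article_texts, tokenizer, for_train=True)
# each record holds parallel char-level lists, roughly:
# sent_tokens: ['患', '者', '确', '诊', '肺', '炎', '。']
# tags:        ['O', 'O', 'O', 'O', 'B-DIS', 'I-DIS', 'O']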
Ejemplo n.º 13
0
    temp_label.append('[SEP]')
    temp_token.append('[SEP]')

    tokenized_texts.append(temp_token)
    word_piece_labels.append(temp_label)

    if 5 > i_inc:
        print("No.%d,len:%d" % (i_inc, len(temp_token)))
        print("texts:%s" % (" ".join(temp_token)))
        print("No.%d,len:%d" % (i_inc, len(temp_label)))
        print("labels:%s" % (" ".join(temp_label)))
    i_inc += 1



input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=max_len, dtype="long", truncating="post", padding="post")


tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in word_piece_labels],
                     maxlen=max_len, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")


attention_masks = [[int(i>0) for i in ii] for ii in input_ids]

segment_ids = [[0] * len(input_id) for input_id in input_ids]


tr_inputs, val_inputs, tr_tags, val_tags, tr_masks, val_masks, tr_segs, val_segs = train_test_split(
    input_ids, tags, attention_masks, segment_ids,
    random_state=4, test_size=0.20)
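
For reference, a minimal sketch of what the pad_sequences and mask logic above produce (maxlen=6 chosen for brevity; the ids are arbitrary):

from keras.preprocessing.sequence import pad_sequences

ids = pad_sequences([[101, 2023, 102]], maxlen=6, dtype="long",
                    truncating="post", padding="post")
# ids  -> [[ 101 2023  102    0    0    0]]
mask = [[int(i > 0) for i in row] for row in ids]
# mask -> [[1, 1, 1, 0, 0, 0]]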
Ejemplo n.º 14
0
class BertProcessor(object):
    def __init__(self, vocab_path, do_lower_case, min_freq_words=None):
        self.tokenizer = BertTokenizer(vocab_file=vocab_path,
                                       do_lower_case=do_lower_case)

    def get_labels(self):
        return [
            '[CLS]', '[SEP]', 'O', 'B-NIHSS', 'B-Measurement', 'B-1a_LOC',
            'B-1b_LOCQuestions', 'B-1c_LOCCommands', 'B-2_BestGaze',
            'B-3_Visual', 'B-4_FacialPalsy', 'B-5_Motor', 'B-5a_LeftArm',
            'B-5b_RightArm', 'B-6_Motor', 'B-6a_LeftLeg', 'B-6b_RightLeg',
            'B-7_LimbAtaxia', 'B-8_Sensory', 'B-9_BestLanguage',
            'B-10_Dysarthria', 'B-11_ExtinctionInattention', 'I-NIHSS',
            'I-Measurement', 'I-1a_LOC', 'I-1b_LOCQuestions',
            'I-1c_LOCCommands', 'I-2_BestGaze', 'I-3_Visual',
            'I-4_FacialPalsy', 'I-5_Motor', 'I-5a_LeftArm', 'I-5b_RightArm',
            'I-6_Motor', 'I-6a_LeftLeg', 'I-6b_RightLeg', 'I-7_LimbAtaxia',
            'I-8_Sensory', 'I-9_BestLanguage', 'I-10_Dysarthria',
            'I-11_ExtinctionInattention'
        ]

    @classmethod
    def read_data(cls, input_file, quotechar=None):
        if 'pkl' in str(input_file):
            lines = tools.load_pickle(input_file)
        else:
            lines = input_file
        return lines

    def get_train(self, data_file):
        return self.read_data(data_file)

    def get_valid(self, data_file):
        return self.read_data(data_file)

    def get_test(self, data_file):
        return self.read_data(data_file)

    def create_examples(self, lines, example_type, cached_file):
        if cached_file.exists():
            tools.logger.info("Loading samples from cached files %s",
                              cached_file)
            examples = torch.load(cached_file)
        else:
            pbar = progressbar.ProgressBar(
                n_total=len(lines), desc=f'create {example_type} samples')
            examples = []
            for i, line in enumerate(lines):
                hadm_id = line['HADM_ID']
                guid = '%s-%s-%d' % (example_type, hadm_id, i)
                sentence = line['token']  # list
                # replace non-string (e.g. NaN) tokens with a space
                sentence = [' ' if isinstance(t, float) else t for t in sentence]
                label = line['tags']  # list
                code = line['code']  # brat entity Tcode T1 T2
                relations = line['relations']  # brat relations golden standard
                # text_a: string. The untokenized text of the first sequence. For single
                # sequence tasks, only this sequence must be specified.
                text_a = ' '.join(sentence)  # string
                text_b = None
                examples.append(
                    InputExample(guid=guid,
                                 text_a=text_a,
                                 text_b=text_b,
                                 label=label,
                                 code=code,
                                 relations=relations,
                                 hadm_id=hadm_id))
                pbar(step=i)
            tools.logger.info("Saving examples into cached file %s",
                              cached_file)
            torch.save(examples, cached_file)
        return examples

    def create_features(self, examples, max_seq_len, cached_file):
        if cached_file.exists():
            tools.logger.info('Loading features from cached file %s',
                              cached_file)
            features = torch.load(cached_file)
        else:
            label_list = self.get_labels()
            label2id = {label: i for i, label in enumerate(label_list)}
            pbar = progressbar.ProgressBar(
                n_total=len(examples),
                desc='creating the specified features of examples')
            features = []
            for example_id, example in enumerate(examples):
                hamd_id = example.hadm_id  # note: InputFeature below expects the 'hamd_id' spelling
                text_list = example.text_a.split(' ')  # string
                idx_CR = [
                    idx for idx, text in enumerate(text_list)
                    if text == '<CRLF>'
                ]
                label_list = example.label
                code_list = example.code
                relation_list = example.relations

                new_tokens = []
                new_segment_ids = []
                new_label_ids = []
                new_code = []

                new_tokens.append('[CLS]')
                new_segment_ids.append(0)
                new_label_ids.append(label2id['[CLS]'])
                new_code.append('0')

                for text, label, code in zip(text_list, label_list, code_list):
                    if text == '<CRLF>':
                        continue
                    else:
                        token_list = self.tokenizer.tokenize(text)
                        for idx, token in enumerate(token_list):
                            new_tokens.append(token)
                            new_segment_ids.append(0)
                            if idx == 0:
                                new_label_ids.append(label2id[label])
                                new_code.append(code)
                            elif label == 'O':
                                new_label_ids.append(label2id[label])
                                new_code.append(code)
                            else:
                                temp_l = 'I-' + label.split('-')[1]
                                new_label_ids.append(label2id[temp_l])
                                new_code.append(code)

                assert len(new_tokens) == len(new_segment_ids)
                assert len(new_tokens) == len(new_label_ids)
                assert len(new_tokens) == len(new_code)

                if len(new_tokens) >= max_seq_len:
                    new_tokens = new_tokens[0:(max_seq_len - 1)]
                    new_segment_ids = new_segment_ids[0:(max_seq_len - 1)]
                    new_label_ids = new_label_ids[0:(max_seq_len - 1)]
                    new_code = new_code[0:(max_seq_len - 1)]

                new_tokens.append('[SEP]')
                new_segment_ids.append(0)
                new_label_ids.append(label2id['[SEP]'])
                new_code.append('0')

                input_ids = self.tokenizer.convert_tokens_to_ids(new_tokens)
                input_mask = [1] * len(input_ids)
                input_len = len(new_label_ids)

                if len(input_ids) < max_seq_len:
                    pad_zero = [0] * (max_seq_len - len(input_ids))
                    input_ids.extend(pad_zero)
                    input_mask.extend(pad_zero)
                    new_segment_ids.extend(pad_zero)
                    new_label_ids.extend(pad_zero)
                    new_code.extend(['0'] * len(pad_zero))

                assert len(input_ids) == max_seq_len
                assert len(input_mask) == max_seq_len
                assert len(new_segment_ids) == max_seq_len
                assert len(new_label_ids) == max_seq_len
                assert len(new_code) == max_seq_len

                df_temp = pd.DataFrame({
                    'input_ids': input_ids,
                    'code': new_code
                })
                agg_fun = lambda s: (max(s['code']), s.index.tolist()[0],
                                     s.index.tolist()[-1])
                groupby_code = df_temp.groupby('code').apply(agg_fun)
                code_position = {}
                for key, start, end in groupby_code:
                    if key != '0':
                        code_position[(start - 1, end - 1)] = key
                    else:
                        continue

                if example_id < 2:
                    tools.logger.info('*** Examples: ***')
                    tools.logger.info("guid: %s" % (example.guid))
                    tools.logger.info("tokens: %s" %
                                      " ".join([str(x) for x in new_tokens]))
                    tools.logger.info("input_ids: %s" %
                                      " ".join([str(x) for x in input_ids]))
                    tools.logger.info("input_mask: %s" %
                                      " ".join([str(x) for x in input_mask]))
                    tools.logger.info(
                        "segment_ids: %s" %
                        " ".join([str(x) for x in new_segment_ids]))
                    tools.logger.info("old label name: %s " %
                                      " ".join(example.label))
                    tools.logger.info("new label ids: %s" %
                                      " ".join([str(x)
                                                for x in new_label_ids]))

                features.append(
                    InputFeature(
                        input_ids=input_ids,
                        input_mask=input_mask,
                        segment_ids=new_segment_ids,
                        label_id=new_label_ids,
                        input_len=input_len,
                        code=new_code,
                        new_tokens=new_tokens,
                        relations=relation_list,  # golden standard
                        hamd_id=hamd_id,
                        code_position=code_position))

                pbar(step=example_id)

            tools.logger.info('Saving features into cached file %s',
                              cached_file)
            torch.save(features, cached_file)
        return features

    def create_dataset(self, features, is_sorted=False):
        if is_sorted:
            tools.logger.info('sorted data by the length of input')
            features = sorted(features,
                              key=lambda x: x.input_len,
                              reverse=True)
        all_input_ids = torch.tensor([f.input_ids for f in features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.long)
        all_input_lens = torch.tensor([f.input_len for f in features],
                                      dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_label_ids, all_input_lens)

        return dataset
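
A hypothetical end-to-end run of BertProcessor; all paths and the batch size are placeholders, not from the original:

from pathlib import Path
from torch.utils.data import DataLoader

processor = BertProcessor(vocab_path='assets/vocab.txt', do_lower_case=True)
lines = processor.get_train(Path('data/train.pkl'))
examples = processor.create_examples(lines, 'train', Path('cache/train_examples.pt'))
features = processor.create_features(examples, max_seq_len=128,
                                     cached_file=Path('cache/train_features.pt'))
dataset = processor.create_dataset(features, is_sorted=True)
loader = DataLoader(dataset, batch_size=16)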
Ejemplo n.º 15
0
def bert_sentence_pair_preprocessing(data: list,
                                     tokenizer: BertTokenizer,
                                     max_sequence_length=128):
    """
    Pre-processes an array of sentence pairs for input into bert. Sentence pairs are expected to be processed
    as given in data.py.

    Each sentence pair is tokenized and concatenated together by the [SEP] token for input into BERT

    :return: three tensors: [data_size, input_ids], [data_size, token_type_ids], [data_size, attention_mask]
    """

    max_bert_input_length = 0
    for sentence_pair in data:

        sentence_1_tokenized, sentence_2_tokenized = tokenizer.tokenize(
            sentence_pair['sentence_1']), tokenizer.tokenize(
                sentence_pair['sentence_2'])
        _truncate_seq_pair(sentence_1_tokenized, sentence_2_tokenized,
                           max_sequence_length - 3)  # account for [CLS] and the two [SEP] tokens

        max_bert_input_length = max(
            max_bert_input_length,
            len(sentence_1_tokenized) + len(sentence_2_tokenized) + 3)
        sentence_pair['sentence_1_tokenized'] = sentence_1_tokenized
        sentence_pair['sentence_2_tokenized'] = sentence_2_tokenized

    dataset_input_ids = torch.empty((len(data), max_bert_input_length),
                                    dtype=torch.long)
    dataset_token_type_ids = torch.empty((len(data), max_bert_input_length),
                                         dtype=torch.long)
    dataset_attention_masks = torch.empty((len(data), max_bert_input_length),
                                          dtype=torch.long)
    dataset_scores = torch.empty((len(data), 1), dtype=torch.float)

    for idx, sentence_pair in enumerate(data):
        tokens = []
        input_type_ids = []

        tokens.append("[CLS]")
        input_type_ids.append(0)
        for token in sentence_pair['sentence_1_tokenized']:
            tokens.append(token)
            input_type_ids.append(0)
        tokens.append("[SEP]")
        input_type_ids.append(0)

        for token in sentence_pair['sentence_2_tokenized']:
            tokens.append(token)
            input_type_ids.append(1)
        tokens.append("[SEP]")
        input_type_ids.append(1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        attention_masks = [1] * len(input_ids)
        while len(input_ids) < max_bert_input_length:
            input_ids.append(0)
            attention_masks.append(0)
            input_type_ids.append(0)

        dataset_input_ids[idx] = torch.tensor(input_ids, dtype=torch.long)
        dataset_token_type_ids[idx] = torch.tensor(input_type_ids,
                                                   dtype=torch.long)
        dataset_attention_masks[idx] = torch.tensor(attention_masks,
                                                    dtype=torch.long)
        if 'similarity' not in sentence_pair or sentence_pair[
                'similarity'] is None:
            dataset_scores[idx] = torch.tensor(float('nan'), dtype=torch.float)
        else:
            dataset_scores[idx] = torch.tensor(sentence_pair['similarity'],
                                               dtype=torch.float)

    return dataset_input_ids, dataset_token_type_ids, dataset_attention_masks, dataset_scores
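
A hedged usage sketch; the tokenizer checkpoint and the sentence pair below are made up:

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
pairs = [{'sentence_1': 'A man is playing a guitar.',
          'sentence_2': 'Someone plays an instrument.',
          'similarity': 4.2}]
ids, type_ids, masks, scores = bert_sentence_pair_preprocessing(pairs, tokenizer)
print(ids.shape, type_ids.shape, masks.shape, scores)  # scores -> tensor([[4.2000]])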
Ejemplo n.º 16
0
def main(args):

    assert args.use_one_optim is True

    if args.use_cls_only:
        args.no_dial = True

    print("### use_cls_only: {:}".format(args.use_cls_only))
    print("### no_dial: {:}".format(args.no_dial))

    if args.recover_e > 0:
        raise NotImplementedError("This option comes from an older version of this code "
                                  "and has not been verified against the current version.")

    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)
        print("### mkdir {:}".format(args.save_dir))

    def worker_init_fn(worker_id):
        np.random.seed(args.random_seed + worker_id)

    n_gpu = 0
    if torch.cuda.is_available() and (not args.use_cpu):
        n_gpu = torch.cuda.device_count()
        device = torch.device('cuda')
        print("### Device: {:}".format(device))
    else:
        print("### Use CPU (Debugging)")
        device = torch.device("cpu")

    if args.random_seed < 0:
        print("### Pick a random seed")
        args.random_seed = random.sample(list(range(0, 100000)), 1)[0]

    print("### Random Seed: {:}".format(args.random_seed))
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)
    rng = random.Random(args.random_seed)
    torch.manual_seed(args.random_seed)

    if n_gpu > 0:
        if args.random_seed >= 0:
            torch.cuda.manual_seed(args.random_seed)
            torch.cuda.manual_seed_all(args.random_seed)

        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)

    ontology = json.load(open(args.ontology_data))
    slot_meta, ontology = make_slot_meta(ontology)
    op2id = OP_SET[args.op_code]
    print(op2id)

    tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True)

    train_path = os.path.join(args.data_root, "train.pt")
    dev_path = os.path.join(args.data_root, "dev.pt")
    test_path = os.path.join(args.data_root, "test.pt")

    if not os.path.exists(test_path):
        test_data_raw = prepare_dataset(data_path=args.test_data_path,
                                        tokenizer=tokenizer,
                                        slot_meta=slot_meta,
                                        n_history=args.n_history,
                                        max_seq_length=args.max_seq_length,
                                        op_code=args.op_code)
        torch.save(test_data_raw, test_path)
    else:
        test_data_raw = torch.load(test_path)

    print("# test examples %d" % len(test_data_raw))

    if not os.path.exists(train_path):
        train_data_raw = prepare_dataset(data_path=args.train_data_path,
                                         tokenizer=tokenizer,
                                         slot_meta=slot_meta,
                                         n_history=args.n_history,
                                         max_seq_length=args.max_seq_length,
                                         op_code=args.op_code)

        torch.save(train_data_raw, train_path)
    else:
        train_data_raw = torch.load(train_path)

    train_data = MultiWozDataset(train_data_raw,
                                 tokenizer,
                                 slot_meta,
                                 args.max_seq_length,
                                 rng,
                                 ontology,
                                 args.word_dropout,
                                 args.shuffle_state,
                                 args.shuffle_p, pad_id=tokenizer.convert_tokens_to_ids(['[PAD]'])[0],
                                 slot_id=tokenizer.convert_tokens_to_ids(['[SLOT]'])[0],
                                 decoder_teacher_forcing=args.decoder_teacher_forcing,
                                 use_full_slot=args.use_full_slot,
                                 use_dt_only=args.use_dt_only, no_dial=args.no_dial,
                                 use_cls_only=args.use_cls_only)

    print("# train examples %d" % len(train_data_raw))

    if not os.path.exists(dev_path):
        dev_data_raw = prepare_dataset(data_path=args.dev_data_path,
                                       tokenizer=tokenizer,
                                       slot_meta=slot_meta,
                                       n_history=args.n_history,
                                       max_seq_length=args.max_seq_length,
                                       op_code=args.op_code)
        torch.save(dev_data_raw,  dev_path)
    else:
        dev_data_raw = torch.load(dev_path)

    print("# dev examples %d" % len(dev_data_raw))

    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = args.dropout
    model_config.attention_probs_dropout_prob = args.attention_probs_dropout_prob
    model_config.hidden_dropout_prob = args.hidden_dropout_prob

    type_vocab_size = 4
    dec_config = args
    model = TransformerDST(model_config, dec_config, len(op2id), len(domain2id),
                           op2id['update'],
                           tokenizer.convert_tokens_to_ids(['[MASK]'])[0],
                           tokenizer.convert_tokens_to_ids(['[SEP]'])[0],
                           tokenizer.convert_tokens_to_ids(['[PAD]'])[0],
                           tokenizer.convert_tokens_to_ids(['-'])[0],
                           type_vocab_size, args.exclude_domain)

    if not os.path.exists(args.bert_ckpt_path):
        args.bert_ckpt_path = download_ckpt(args.bert_ckpt_path, args.bert_config_path, 'assets')

    state_dict = torch.load(args.bert_ckpt_path, map_location='cpu')
    _k = 'embeddings.token_type_embeddings.weight'
    print("config.type_vocab_size != state_dict[bert.embeddings.token_type_embeddings.weight] ({0} != {1})".format(
            type_vocab_size, state_dict[_k].shape[0]))
    state_dict[_k].resize_(
        type_vocab_size, state_dict[_k].shape[1])
    state_dict[_k].data[2, :].copy_(state_dict[_k].data[0, :])
    state_dict[_k].data[3, :].copy_(state_dict[_k].data[0, :])
    model.bert.load_state_dict(state_dict)
    print("\n### Done Load BERT")
    sys.stdout.flush()

    # re-initialize added special tokens ([SLOT], [NULL], [EOS])
    model.bert.embeddings.word_embeddings.weight.data[1].normal_(mean=0.0, std=0.02)
    model.bert.embeddings.word_embeddings.weight.data[2].normal_(mean=0.0, std=0.02)
    model.bert.embeddings.word_embeddings.weight.data[3].normal_(mean=0.0, std=0.02)

    # re-initialize seg-2, seg-3
    model.bert.embeddings.token_type_embeddings.weight.data[2].normal_(mean=0.0, std=0.02)
    model.bert.embeddings.token_type_embeddings.weight.data[3].normal_(mean=0.0, std=0.02)
    model.to(device)

    num_train_steps = int(len(train_data_raw) / args.batch_size * args.n_epochs)

    if args.use_one_optim:
        print("### Use One Optim")
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(
                nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(
                nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.enc_lr)
        scheduler = WarmupLinearSchedule(optimizer, int(num_train_steps * args.enc_warmup),
                                             t_total=num_train_steps)
    else:
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        enc_param_optimizer = list(model.bert.named_parameters())  # TODO: For BERT only
        print('### Optim BERT: {:}'.format(len(enc_param_optimizer)))
        enc_optimizer_grouped_parameters = [
            {'params': [p for n, p in enc_param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in enc_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]

        enc_optimizer = AdamW(enc_optimizer_grouped_parameters, lr=args.enc_lr)
        enc_scheduler = WarmupLinearSchedule(enc_optimizer, int(num_train_steps * args.enc_warmup),
                                             t_total=num_train_steps)

        dec_param_optimizer = list(model.named_parameters())  # TODO:  For other parameters
        print('### Optim All: {:}'.format(len(dec_param_optimizer)))
        dec_param_optimizer = [p for (n, p) in dec_param_optimizer if 'bert' not in n]
        print('### Optim OTH: {:}'.format(len(dec_param_optimizer)))
        dec_optimizer = AdamW(dec_param_optimizer, lr=args.dec_lr)
        dec_scheduler = WarmupLinearSchedule(dec_optimizer, int(num_train_steps * args.dec_warmup),
                                             t_total=num_train_steps)

    if args.recover_e > 0:
        model_recover, enc_recover, dec_recover = load(args, str(args.recover_e))
        print("### Recover Model E{:}".format(args.recover_e))
        sys.stdout.flush()
        model.load_state_dict(model_recover)
        print("### Recover Optim E{:}".format(args.recover_e))
        sys.stdout.flush()
        enc_optimizer.load_state_dict(enc_recover)
        dec_optimizer.load_state_dict(dec_recover)

    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.batch_size,
                                  collate_fn=train_data.collate_fn,
                                  num_workers=args.num_workers,
                                  worker_init_fn=worker_init_fn)

    loss_fnc = nn.CrossEntropyLoss()
    best_score = {'epoch': 0, 'joint_acc': 0, 'op_acc': 0, 'final_slot_f1': 0}

    start_time = time.time()

    for epoch in range(args.n_epochs):
        batch_loss = []
        model.train()
        for step, batch in enumerate(train_dataloader):

            batch = [b.to(device)
                     if not isinstance(b, (int, dict, list, np.ndarray)) else b
                     for b in batch]

            input_ids_p, segment_ids_p, input_mask_p, \
            state_position_ids, op_ids, domain_ids, input_ids_g, segment_ids_g, position_ids_g, input_mask_g, \
            masked_pos, masked_weights, lm_label_ids, id_n_map, gen_max_len, n_total_pred = batch

            domain_scores, state_scores, loss_g = model(input_ids_p, segment_ids_p, input_mask_p, state_position_ids,
                input_ids_g, segment_ids_g, position_ids_g, input_mask_g,
                masked_pos, masked_weights, lm_label_ids, id_n_map, gen_max_len, only_pred_op=args.only_pred_op, n_gpu=n_gpu)

            if n_total_pred > 0:
                loss_g = loss_g.sum() / n_total_pred
            else:
                loss_g = 0

            loss_s = loss_fnc(state_scores.view(-1, len(op2id)), op_ids.view(-1))

            if args.only_pred_op:
                loss = loss_s
            else:
                loss = loss_s + loss_g

            if args.exclude_domain is not True:
                loss_d = loss_fnc(domain_scores.view(-1, len(domain2id)), domain_ids.view(-1))
                loss = loss + loss_d

            batch_loss.append(loss.item())

            loss.backward()

            if args.use_one_optim:
                optimizer.step()
                scheduler.step()
            else:
                enc_optimizer.step()
                enc_scheduler.step()
                dec_optimizer.step()
                dec_scheduler.step()

            model.zero_grad()

            if step % 100 == 0:
                try:
                    loss_g = loss_g.item()
                except AttributeError:  # loss_g is already a number when n_total_pred == 0
                    pass

                if args.exclude_domain is not True:
                    print("time %.1f min, [%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f, dom_loss : %.3f" \
                          % ((time.time()-start_time)/60, epoch+1, args.n_epochs, step,
                             len(train_dataloader), np.mean(batch_loss),
                             loss_s.item(), loss_g, loss_d.item()))
                else:
                    print("time %.1f min, [%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f" \
                          % ((time.time()-start_time)/60, epoch+1, args.n_epochs, step,
                             len(train_dataloader), np.mean(batch_loss),
                             loss_s.item(), loss_g))

                sys.stdout.flush()
                batch_loss = []

        if args.use_one_optim:
            save(args, epoch + 1, model, optimizer)
        else:
            save(args, epoch + 1, model, enc_optimizer, dec_optimizer)

        if ((epoch+1) % args.eval_epoch == 0) and (epoch+1 >= 8):
            eval_res = model_evaluation(model, dev_data_raw, tokenizer, slot_meta, epoch+1, args.op_code,
                                        use_full_slot=args.use_full_slot, use_dt_only=args.use_dt_only, no_dial=args.no_dial, use_cls_only=args.use_cls_only, n_gpu=n_gpu)
            print("### Epoch {:} Score : ".format(epoch+1), eval_res)

            if eval_res['joint_acc'] > best_score['joint_acc']:
                best_score = eval_res
                print("### Best Joint Acc: {:} ###".format(best_score['joint_acc']))
                print('\n')

                if epoch+1 >= 8:  # To speed up
                    eval_res_test = model_evaluation(model, test_data_raw, tokenizer, slot_meta, epoch + 1, args.op_code,
                                                     use_full_slot=args.use_full_slot, use_dt_only=args.use_dt_only, no_dial=args.no_dial, use_cls_only=args.use_cls_only, n_gpu=n_gpu)
                    print("### Epoch {:} Test Score : ".format(epoch + 1), eval_res_test)
Ejemplo n.º 17
0
def tag_sent(text):
    # initialize variables
    num_tags = 24  # depends on the labelling scheme
    max_len = 45
    vocabulary = "bert_models/vocab.txt"
    bert_out_address = 'bert/model'

    tokenizer = BertTokenizer(vocab_file=vocabulary, do_lower_case=False)

    model = BertForTokenClassification.from_pretrained(bert_out_address,
                                                       num_labels=num_tags)

    with open('se_data/tags.txt') as f:
        lines = f.readlines()

    tag2idx = {}
    for line in lines:
        key = line.split()[0]
        val = line.split()[1]
        tag2idx[key.strip()] = int(val.strip())

    tag2name = {tag2idx[key]: key for key in tag2idx.keys()}

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()

    if torch.cuda.is_available():
        model.cuda()
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)

    model.eval()

    tokenized_texts = []

    temp_token = []

    # Add [CLS] at the front
    temp_token.append('[CLS]')

    for word in nltk.word_tokenize(text):
        token_list = tokenizer.tokenize(word)
        for token in token_list:
            temp_token.append(token)

    # Add [SEP] at the end
    temp_token.append('[SEP]')

    tokenized_texts.append(temp_token)


    input_ids = pad_sequences(
        [tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
        maxlen=max_len,
        dtype="long",
        truncating="post",
        padding="post")

    attention_masks = [[int(i > 0) for i in ii] for ii in input_ids]

    segment_ids = [[0] * len(input_id) for input_id in input_ids]

    tr_inputs = torch.tensor(input_ids).to(device)
    tr_masks = torch.tensor(attention_masks).to(device)
    tr_segs = torch.tensor(segment_ids).to(device)

    outputs = model(
        tr_inputs,
        token_type_ids=None,
        attention_mask=tr_masks,
    )


    logits = outputs[0]

    # Get NER predict result
    logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
    logits = logits.detach().cpu().numpy()

    tags_t = [tag2name[t] for t in logits[0]]

    c = len(tokenized_texts[0])
    # strip [CLS]/[SEP] from both the tokens and the predicted tags
    return tokenized_texts[0][1:-1], tags_t[:c][1:-1]
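
A hedged call sketch for tag_sent; the input sentence is illustrative, and the vocab/model paths hard-coded inside the function must exist on disk:

tokens, tags = tag_sent("Patient is alert and oriented, mild facial palsy noted.")
for tok, tag in zip(tokens, tags):
    print(tok, tag)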