Example #1
def preprocess_raw_example(
    rawe: RawExample,
    tokenizer: BertTokenizer,
    cond_tokenizer: spm.SentencePieceProcessor,
) -> Tuple[str, Example]:

    e = Example(
        title_token_ids=tokenizer.encode(rawe.title, add_special_tokens=False),
        description_token_ids=tokenizer.encode(rawe.description,
                                               add_special_tokens=False),
        condition_token_ids=cond_tokenizer.EncodeAsIds(rawe.condition),
        fact_token_ids=tokenizer.encode(rawe.fact, add_special_tokens=False),
        description=rawe.description,
    )
    return hashlib.sha1(json.dumps(e.__dict__).encode()).hexdigest(), e
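A minimal usage sketch for the function above. The `RawExample` construction and the SentencePiece model path are assumptions inferred from the snippet, not taken from the original repository:

# Hypothetical usage; the RawExample field values and "cond.model" path are illustrative only.
import sentencepiece as spm
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
cond_tokenizer = spm.SentencePieceProcessor()
cond_tokenizer.Load("cond.model")  # assumed condition-vocab SentencePiece model

raw = RawExample(title="标题", description="商品描述", condition="<c_1>", fact="知识事实")
key, example = preprocess_raw_example(raw, tokenizer, cond_tokenizer)
print(key, len(example.description_token_ids))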
Example #2
def predict(session: InferenceSession, tokenizer: BertTokenizer, text):
    tokens = tokenizer(text, return_attention_mask=True, return_tensors="pt")
    inputs_onnx = {k: np.atleast_2d(v) for k, v in tokens.items()}
    entities = session.run(None, inputs_onnx)[0].squeeze(0)

    input_ids = tokens["input_ids"][0]
    score = np.exp(entities) / np.exp(entities).sum(-1, keepdims=True)
    labels_idx = score.argmax(axis=-1)

    entities = []
    # Keep only labels whose id2label entry is not in the module-level IGNORE_LABELS
    filtered_labels_idx = [
        (idx, label_idx) for idx, label_idx in enumerate(labels_idx)
        if config['id2label'][str(label_idx)] not in IGNORE_LABELS
    ]

    for idx, label_idx in filtered_labels_idx:
        entity = {
            "word": tokenizer.convert_ids_to_tokens(int(input_ids[idx])),
            "score": score[idx][label_idx].item(),
            "entity": config['id2label'][str(label_idx)],
            "index": idx
        }

        entities += [entity]
    answers = group_entities(entities, tokenizer)
    return render_ner_html_custom(text, answers, colors=colors)
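A hedged setup sketch for `predict`. The ONNX model path, plus the module-level `config`, `IGNORE_LABELS`, and `colors` objects the function relies on, are assumptions based on how they are used above:

# Hypothetical wiring for predict(); the paths and globals below are assumptions.
import json
import numpy as np
from onnxruntime import InferenceSession
from transformers import BertTokenizer

session = InferenceSession("bert-ner.onnx")     # assumed ONNX export of a token-classification model
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
with open("config.json") as f:
    config = json.load(f)                       # must provide the "id2label" mapping used above
IGNORE_LABELS = {"O"}                           # labels dropped before grouping entities
colors = {}                                     # entity-group -> color for render_ner_html_custom

html = predict(session, tokenizer, "Hugging Face is based in New York City.")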
Example #3
    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer] = None,
        domain_identifier: str = None,
        bert_model_name: str = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        if token_indexers is not None:
            self._token_indexers = token_indexers
        elif bert_model_name is not None:
            from allennlp.data.token_indexers import PretrainedTransformerIndexer

            self._token_indexers = {
                "tokens": PretrainedTransformerIndexer(bert_model_name)
            }
        else:
            self._token_indexers = {"tokens": SingleIdTokenIndexer()}
        self._domain_identifier = domain_identifier

        if bert_model_name is not None:
            self.bert_tokenizer = BertTokenizer.from_pretrained(
                bert_model_name)
            self.lowercase_input = "uncased" in bert_model_name
        else:
            self.bert_tokenizer = None
            self.lowercase_input = False
Example #4
    def __init__(self, args):
        super(KobeModel, self).__init__()

        self.encoder = Encoder(
            vocab_size=args.text_vocab_size + args.cond_vocab_size,
            max_seq_len=args.max_seq_len,
            d_model=args.d_model,
            nhead=args.nhead,
            num_layers=args.num_encoder_layers,
            dropout=args.dropout,
            mode=args.mode,
        )
        self.decoder = Decoder(
            vocab_size=args.text_vocab_size,
            max_seq_len=args.max_seq_len,
            d_model=args.d_model,
            nhead=args.nhead,
            num_layers=args.num_decoder_layers,
            dropout=args.dropout,
        )
        self.lr = args.lr
        self.d_model = args.d_model
        self.loss = nn.CrossEntropyLoss(reduction="mean",
                                        ignore_index=0,
                                        label_smoothing=0.1)
        self._reset_parameters()

        self.decoding_strategy = args.decoding_strategy
        self.vocab = BertTokenizer.from_pretrained(args.text_vocab_path)
        self.bleu = BLEU(tokenize=args.tokenize)
        self.sacre_tokenizer = _get_tokenizer(args.tokenize)()
        self.bert_scorer = BERTScorer(lang=args.tokenize,
                                      rescale_with_baseline=True)
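A minimal construction sketch, assuming an argparse-style namespace containing the fields `KobeModel.__init__` reads; the concrete values are illustrative, not the repository's defaults:

# Hypothetical hyper-parameters; field names mirror what the constructor reads above.
from argparse import Namespace

args = Namespace(
    text_vocab_size=21128, cond_vocab_size=31, max_seq_len=256,
    d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6,
    dropout=0.1, mode="baseline",                 # the "mode" value is a guess
    lr=1e-4, decoding_strategy="greedy",          # so is the decoding strategy name
    text_vocab_path="bert-base-chinese", tokenize="zh",
)
model = KobeModel(args)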
Example #5
    def setUp(self):
        super().setUp()

        self.tokenizers = [BertTokenizer.from_pretrained(checkpoint) for checkpoint in TOKENIZER_CHECKPOINTS]
        self.tf_tokenizers = [TFBertTokenizer.from_pretrained(checkpoint) for checkpoint in TOKENIZER_CHECKPOINTS]
        self.test_sentences = [
            "This is a straightforward English test sentence.",
            "This one has some weird characters\rto\nsee\r\nif  those\u00E9break things.",
            "Now we're going to add some Chinese: 一 二 三 一二三",
            "And some much more rare Chinese: 齉 堃 齉堃",
            "Je vais aussi écrire en français pour tester les accents",
            "Classical Irish also has some unusual characters, so in they go: Gaelaċ, ꝼ",
        ]
        self.paired_sentences = list(zip(self.test_sentences, self.test_sentences[::-1]))
Example #6
def main(args):
    # For Chinese (Ro)BERT models, the best results come from RoBERTa-wwm-ext (https://github.com/ymcui/Chinese-BERT-wwm).
    # To fine-tune these models, we have to use the same tokenizer: LTP (https://github.com/HIT-SCIR/ltp).
    with open(args.file_name, "r", encoding="utf-8") as f:
        data = f.readlines()
    data = [
        line.strip() for line in data if len(line) > 0 and not line.isspace()
    ]  # drop blank/whitespace-only lines and stray delimiters such as '\u2029'
    ltp_tokenizer = LTP(args.ltp)  # faster on a GPU device
    bert_tokenizer = BertTokenizer.from_pretrained(args.bert)

    ref_ids = prepare_ref(data, ltp_tokenizer, bert_tokenizer)

    with open(args.save_path, "w", encoding="utf-8") as f:
        data = [json.dumps(ref) + "\n" for ref in ref_ids]
        f.writelines(data)
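A sketch of how `main` might be invoked; the attribute names come from what the function reads above, while the concrete paths and checkpoint names are assumptions:

# Hypothetical invocation; file paths and model names are assumptions.
from argparse import Namespace

main(Namespace(
    file_name="corpus_zh.txt",                 # one Chinese sentence per line
    ltp="small",                               # LTP segmentation model name or path
    bert="hfl/chinese-roberta-wwm-ext",        # matching whole-word-masking checkpoint
    save_path="ref.txt",                       # one JSON list of sub-word positions per line
))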
Example #7
    def setup_method(self):

        self.monkeypatch = MonkeyPatch()
        # monkeypatch the PretrainedBertModel to return the tiny test fixture model
        config_path = FIXTURES_ROOT / "structured_prediction" / "srl" / "bert" / "config.json"
        vocab_path = FIXTURES_ROOT / "structured_prediction" / "srl" / "bert" / "vocab.txt"
        config = BertConfig.from_json_file(config_path)
        self.monkeypatch.setattr(BertModel, "from_pretrained", lambda _: BertModel(config))
        self.monkeypatch.setattr(
            BertTokenizer, "from_pretrained", lambda _: BertTokenizer(vocab_path)
        )

        super().setup_method()
        self.set_up_model(
            FIXTURES_ROOT / "structured_prediction" / "srl" / "bert_srl.jsonnet",
            FIXTURES_ROOT / "structured_prediction" / "srl" / "conll_2012",
        )
Example #8
def prepare_ref(lines: List[str], ltp_tokenizer: LTP,
                bert_tokenizer: BertTokenizer):
    ltp_res = []

    for i in range(0, len(lines), 100):
        res = ltp_tokenizer.seg(lines[i:i + 100])[0]
        res = [get_chinese_word(r) for r in res]
        ltp_res.extend(res)
    assert len(ltp_res) == len(lines)

    bert_res = []
    for i in range(0, len(lines), 100):
        res = bert_tokenizer(lines[i:i + 100],
                             add_special_tokens=True,
                             truncation=True,
                             max_length=512)
        bert_res.extend(res["input_ids"])
    assert len(bert_res) == len(lines)

    ref_ids = []
    for input_ids, chinese_word in zip(bert_res, ltp_res):

        input_tokens = []
        for id in input_ids:
            token = bert_tokenizer._convert_id_to_token(id)
            input_tokens.append(token)
        input_tokens = add_sub_symbol(input_tokens, chinese_word)
        ref_id = []
        # We only save the positions of Chinese sub-tokens that start with ##, i.e. that are part of a whole word.
        for i, token in enumerate(input_tokens):
            if token[:2] == "##":
                clean_token = token[2:]
                # record this Chinese character's position
                if len(clean_token) == 1 and _is_chinese_char(
                        ord(clean_token)):
                    ref_id.append(i)
        ref_ids.append(ref_id)

    assert len(ref_ids) == len(bert_res)

    return ref_ids
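A minimal call sketch for `prepare_ref`; the LTP model name and BERT checkpoint are assumptions, and the helpers used inside (`get_chinese_word`, `add_sub_symbol`, `_is_chinese_char`) are assumed to be in scope from the same script:

# Hypothetical usage of prepare_ref; model names are assumptions.
from ltp import LTP
from transformers import BertTokenizer

lines = ["今天天气不错", "我们去公园散步吧"]
ltp_tokenizer = LTP("small")
bert_tokenizer = BertTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext")
# one list per input line, holding the positions of '##'-prefixed Chinese sub-tokens
ref_ids = prepare_ref(lines, ltp_tokenizer, bert_tokenizer)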
Example #9
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    ## Other parameters
    parser.add_argument("--cache_dir",
                        default="",
                        type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=1e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")


    args = parser.parse_args()


    processors = {
        "rte": RteProcessor
    }

    output_modes = {
        "rte": "classification"
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")


    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))



    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    threeway_train_examples, threeway_dev_examples = processor.get_MNLI_train_and_dev('/export/home/Dataset/glue_data/MNLI/train.tsv', ['/export/home/Dataset/glue_data/MNLI/dev_mismatched.tsv', '/export/home/Dataset/glue_data/MNLI/dev_matched.tsv'])
    '''preprocessing: binary classification; randomly sample 13k dev examples as test data'''
    train_examples = []
    for ex in threeway_train_examples:
        if ex.label == 'neutral' or ex.label == 'contradiction':
            ex.label = 'neutral'
        train_examples.append(ex)
    # train_examples = train_examples[:100]
    dev_examples = []
    for ex in threeway_dev_examples:
        if ex.label == 'neutral' or ex.label == 'contradiction':
            ex.label = 'neutral'
        dev_examples.append(ex)
    random.shuffle(dev_examples)
    test_examples = dev_examples[:13000]
    dev_examples = dev_examples[13000:]

    label_list = ["entailment", "neutral"]#, "contradiction"]
    num_labels = len(label_list)
    print('num_labels:', num_labels, 'training size:', len(train_examples), 'dev size:', len(dev_examples), ' test size:', len(test_examples))

    num_train_optimization_steps = None
    num_train_optimization_steps = int(
        len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    model = BertForSequenceClassification(num_labels)
    tokenizer = BertTokenizer.from_pretrained(pretrain_model_dir, do_lower_case=args.do_lower_case)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

    optimizer = AdamW(optimizer_grouped_parameters,
                             lr=args.learning_rate)
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    max_test_acc = 0.0
    max_dev_acc = 0.0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer, output_mode,
            cls_token_at_end=False,#bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,#2 if args.model_type in ['xlnet'] else 0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=True,#bool(args.model_type in ['roberta']),           # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=False,#bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0)#4 if args.model_type in ['xlnet'] else 0,)

        '''load dev set'''
        dev_features = convert_examples_to_features(
            dev_examples, label_list, args.max_seq_length, tokenizer, output_mode,
            cls_token_at_end=False,#bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,#2 if args.model_type in ['xlnet'] else 0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=True,#bool(args.model_type in ['roberta']),           # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=False,#bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0)#4 if args.model_type in ['xlnet'] else 0,)

        dev_all_input_ids = torch.tensor([f.input_ids for f in dev_features], dtype=torch.long)
        dev_all_input_mask = torch.tensor([f.input_mask for f in dev_features], dtype=torch.long)
        dev_all_segment_ids = torch.tensor([f.segment_ids for f in dev_features], dtype=torch.long)
        dev_all_label_ids = torch.tensor([f.label_id for f in dev_features], dtype=torch.long)

        dev_data = TensorDataset(dev_all_input_ids, dev_all_input_mask, dev_all_segment_ids, dev_all_label_ids)
        dev_sampler = SequentialSampler(dev_data)
        dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=args.eval_batch_size)


        '''load test set'''
        test_features = convert_examples_to_features(
            test_examples, label_list, args.max_seq_length, tokenizer, output_mode,
            cls_token_at_end=False,#bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,#2 if args.model_type in ['xlnet'] else 0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=True,#bool(args.model_type in ['roberta']),           # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=False,#bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0)#4 if args.model_type in ['xlnet'] else 0,)

        test_all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
        test_all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
        test_all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
        test_all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long)

        test_data = TensorDataset(test_all_input_ids, test_all_input_mask, test_all_segment_ids, test_all_label_ids)
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size)


        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        train_sampler = RandomSampler(train_data)

        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        iter_co = 0
        final_test_performance = 0.0
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                model.train()
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch


                logits = model(input_ids, input_mask)
                loss_fct = CrossEntropyLoss()

                loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
                iter_co+=1

            '''
            evaluate on the dev set after this epoch
            '''
            model.eval()

            dev_acc = evaluation(dev_dataloader, device, model)

            if dev_acc > max_dev_acc:
                max_dev_acc = dev_acc
                print('\ndev acc:', dev_acc, ' max_dev_acc:', max_dev_acc, '\n')
                '''evaluate on the test set with the best dev model'''
                final_test_performance = evaluation(test_dataloader, device, model)
                print('\ntest acc:', final_test_performance,  '\n')

            else:
                print('\ndev acc:', dev_acc, ' max_dev_acc:', max_dev_acc, '\n')
        print('final_test_performance:', final_test_performance)
Example #10
    def get_tokenizer(self, **kwargs):
        return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
Example #11
                        default="icod-icod",
                        help="Model variant to run.")
    parser.add_argument(
        "--dataset",
        type=str,
        default="/data/nv419/VQG_DATA/processed/iq_dataset.hdf5")
    parser.add_argument(
        "--val_dataset",
        type=str,
        default="/data/nv419/VQG_DATA/processed/iq_val_dataset.hdf5")

    args = parser.parse_args()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    args.device = device

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
    )

    data_loader = get_loader(os.path.join(os.getcwd(), args.dataset),
                             tokenizer,
                             args.batch_size,
                             shuffle=True,
                             num_workers=8)
    val_data_loader = get_loader(os.path.join(os.getcwd(), args.val_dataset),
                                 tokenizer,
                                 args.batch_size,
                                 shuffle=False,
                                 num_workers=8)
Example #12
def get_bert_vocab_size(vocab_path: str) -> int:
    tokenizer = BertTokenizer.from_pretrained(vocab_path)
    return tokenizer.vocab_size
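Usage is a one-liner; the checkpoint name below is illustrative:

# Works with a hub checkpoint name or a local directory containing vocab.txt.
print(get_bert_vocab_size("bert-base-chinese"))  # 21128 for this checkpoint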
Example #13
import tempfile
from argparse import ArgumentParser

import sentencepiece as spm
from transformers.models.bert.tokenization_bert import BertTokenizer

# Load the text tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

BOS_TOKEN = tokenizer.cls_token
EOS_TOKEN = tokenizer.sep_token
UNK_TOKEN = tokenizer.unk_token
PAD_ID = tokenizer.pad_token_id
BOS_ID = tokenizer.cls_token_id
EOS_ID = tokenizer.sep_token_id
UNK_ID = tokenizer.unk_token_id

# Build the condition (attribute) tokenizer
if __name__ == "__main__":
    parser = ArgumentParser()
    # fmt: off
    parser.add_argument("--input", nargs="+", required=True)
    parser.add_argument("--vocab-file", type=str, required=True)
    parser.add_argument("--vocab-size", type=int, default=31)
    parser.add_argument("--algo",
                        type=str,
                        default="bpe",
                        choices=["bpe", "word"])
    # fmt: on
    args = parser.parse_args()
    print("Building token vocabulary")
Example #14
def tokenizer():
    return BertTokenizer.from_pretrained('bert-base-cased')
Example #15
def main():
    parser = argparse.ArgumentParser(description='seq2seq')

    parser.add_argument(
        '--model',
        default='seq2seq',
        type=str,
        help=
        'which model you are going to train, now including [seq2seq, pmi_seq2seq]'
    )
    parser.add_argument(
        '--attn',
        default=None,
        type=str,
        help='which attention method to use, including [dot, general, concat]')
    parser.add_argument('--gpu',
                        default=-1,
                        type=int,
                        help='which GPU to use, -1 means using CPU')
    parser.add_argument('--save',
                        action="store_true",
                        help='whether to save model or not')
    parser.add_argument('--bs', default=64, type=int, help='batch size')
    parser.add_argument('--emb_dim',
                        default=300,
                        type=int,
                        help='embedding dim')
    parser.add_argument('--enc_hid_dim',
                        default=300,
                        type=int,
                        help='hidden dim of lstm')
    parser.add_argument('--dec_hid_dim',
                        default=300,
                        type=int,
                        help='hidden dim of lstm')
    parser.add_argument('--birnn',
                        action='store_true',
                        help='whether to use bidirectional rnn, default False')
    parser.add_argument('--n_layers',
                        default=1,
                        type=int,
                        help='layer num of encoder and decoder')
    parser.add_argument('--dropout',
                        default=0.5,
                        type=float,
                        help='dropout ratio')
    parser.add_argument('--n_epochs',
                        default=30,
                        type=int,
                        help='num of train epoch')
    parser.add_argument('--min_freq',
                        default=1,
                        type=int,
                        help='minimum occurrence count for a token to be kept in the vocabulary')
    parser.add_argument('--clip', default=None, type=float, help='grad clip')
    parser.add_argument('--maxlen',
                        default=None,
                        type=int,
                        help='max length of text')
    parser.add_argument('--dataset_dir_path',
                        default=None,
                        type=str,
                        help='path to directory where data file is saved')
    parser.add_argument('--tokenizer',
                        default='spacy_en',
                        type=str,
                        help='which tokenizer to use for the dataset')
    parser.add_argument('--train_file',
                        default=None,
                        type=str,
                        help='train file name')
    parser.add_argument('--valid_file',
                        default=None,
                        type=str,
                        help='valid file name')
    parser.add_argument('--test_file',
                        default=None,
                        type=str,
                        help='test file name')
    parser.add_argument('--save_dir',
                        default='models',
                        type=str,
                        help='save dir')
    parser.add_argument('--vocab_file',
                        default=None,
                        type=str,
                        help='predefined vocab file')
    parser.add_argument(
        '--num_workers',
        default=0,
        type=int,
        help=
        'how many subprocesses to use for data loading. 0 means that the data will be loaded in the main process.'
    )
    parser.add_argument('--l2',
                        default=0,
                        type=float,
                        help='l2 regularization')
    parser.add_argument('--lr', default=1e-4, type=float, help='learning rate')
    parser.add_argument(
        '--teaching_rate',
        default=1,
        type=float,
        help='probability of using teacher forcing')
    parser.add_argument('--pretrained_embed_file',
                        default=None,
                        type=str,
                        help='torchtext vector name')
    parser.add_argument('--warmup',
                        default=0,
                        type=int,
                        help='warmup steps, 0 means not using NoamOpt')
    parser.add_argument('--cell_type',
                        default='LSTM',
                        type=str,
                        help='cell type of encoder/decoder, LSTM or GRU')
    parser.add_argument(
        '--comment',
        default='',
        type=str,
        help='comment, will be used as prefix of save directory')
    parser.add_argument('--smoothing',
                        default=0.0,
                        type=float,
                        help='smoothing rate of computing kl div loss')
    parser.add_argument('--max_vocab_size',
                        default=None,
                        type=int,
                        help='max size of vocab')
    parser.add_argument('--serialize',
                        action='store_true',
                        help='whether to serialize examples and vocab')
    parser.add_argument('--use_serialized',
                        action='store_true',
                        help='whether to use serialized dataset')
    parser.add_argument('--model_path',
                        default=None,
                        type=str,
                        help='restore model to continue training')
    parser.add_argument('--global_step',
                        default=0,
                        type=int,
                        help='global step for continuing training')
    parser.add_argument('--inference',
                        action='store_true',
                        help='inference mode')
    parser.add_argument('--seed',
                        default=20020206,
                        type=int,
                        help='random seed')
    parser.add_argument(
        '--ln',
        action='store_true',
        help=
        'whether to use layernorm; if the model is pmi_seq2seq, conditional layernorm is used by default'
    )
    parser.add_argument(
        '--patience',
        default=None,
        type=int,
        help=
        "stop when {patience} continued epochs giving  no improved performance"
    )

    args, unparsed = parser.parse_known_args()
    setup_random_seed(args.seed)
    writer = None
    if args.save:
        tz_sh = tz.gettz('Asia/Shanghai')
        save_dir = os.path.join(
            args.save_dir,
            args.comment + 'run' + str(datetime.now(tz=tz_sh)).replace(
                ":", "-").split(".")[0].replace(" ", '.'))
        if args.model_path:
            save_dir = os.path.split(args.model_path)[0]
        args.save_dir = save_dir
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
        with open(os.path.join(save_dir, 'args.txt'), 'w') as f:
            json.dump(args.__dict__, f, indent=2)
        writer = SummaryWriter(os.path.join(save_dir, 'summary'))

    device = torch.device(args.gpu if (
        torch.cuda.is_available() and args.gpu >= 0) else 'cpu')
    args.device = device

    if args.tokenizer == 'spacy_en':
        dataset = seq2seq_dataset(args)
    elif args.tokenizer == 'jieba':
        from data.dataset import jieba_tokenize
        dataset = seq2seq_dataset(args, tokenizer=jieba_tokenize)
    elif args.tokenizer == 'bert':
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        dataset = seq2seq_dataset(args, tokenizer=tokenizer.tokenize)
    elif args.tokenizer == 'whitespace':
        from data.dataset import whitespace_tokenize
        dataset = seq2seq_dataset(args, tokenizer=whitespace_tokenize)

    #  dataset = load_iwslt(args)

    SRC = dataset['fields']['src']
    TGT = dataset['fields']['tgt']

    EMB_DIM = args.emb_dim
    ENC_HID_DIM = args.enc_hid_dim
    DEC_HID_DIM = args.dec_hid_dim
    N_LAYERS = args.n_layers
    ENC_DROPOUT = args.dropout
    DEC_DROPOUT = args.dropout
    SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
    TGT_PAD_IDX = TGT.vocab.stoi[TGT.pad_token]
    N_EPOCHS = args.n_epochs
    CLIP = args.clip

    src_embedding = Embedding(len(SRC.vocab),
                              EMB_DIM,
                              padding_idx=SRC_PAD_IDX,
                              dropout=ENC_DROPOUT)
    tgt_embedding = Embedding(len(TGT.vocab),
                              EMB_DIM,
                              padding_idx=TGT_PAD_IDX,
                              dropout=DEC_DROPOUT)
    if args.pretrained_embed_file:
        # the pretrained weights are stored in the vocab's `vectors` attribute
        src_pretrained_vectors = SRC.vocab.vectors
        tgt_pretrained_vectors = TGT.vocab.vectors
        # initialize the embedding matrices with the pretrained weights
        src_embedding.lut.weight.data.copy_(src_pretrained_vectors)
        tgt_embedding.lut.weight.data.copy_(tgt_pretrained_vectors)
        print("pretrained vectors loaded successfully!")

    enc = RNNBaseEncoder(args.cell_type,
                         EMB_DIM,
                         ENC_HID_DIM,
                         N_LAYERS,
                         bidirectional=args.birnn,
                         dropout=ENC_DROPOUT,
                         layernorm=args.ln)
    if args.attn is not None:
        dec = LuongAttnRNNDecoder(args.cell_type,
                                  EMB_DIM,
                                  ENC_HID_DIM,
                                  DEC_HID_DIM,
                                  attn_method=args.attn,
                                  num_layers=N_LAYERS,
                                  dropout=DEC_DROPOUT)
    else:
        dec = RNNBaseDecoder(args.cell_type,
                             EMB_DIM,
                             DEC_HID_DIM,
                             num_layers=N_LAYERS,
                             dropout=DEC_DROPOUT)

    generator = Generator(DEC_HID_DIM, len(TGT.vocab))
    if args.ln:
        if args.model == 'seq2seq':
            layernorm = LayerNorm(feature=ENC_HID_DIM)
        elif args.model == 'pmi_seq2seq':
            layernorm = LayerNorm(feature=DEC_HID_DIM,
                                  conditional=True,
                                  condition_size=len(TGT.vocab),
                                  condition_hidden_size=DEC_HID_DIM,
                                  condition_activation="ReLU")
        else:
            raise ValueError(args.model, "is not a legal model name!")
    else:
        layernorm = None

    if args.model == 'seq2seq':
        model = RNNBaseSeq2Seq(enc, dec, src_embedding, tgt_embedding,
                               generator).to(device)
        train_pmi = None
    elif args.model == 'pmi_seq2seq':
        # by default pmi_hid_dim = ENC_HID_DIM, so dec_hid_dim must be twice enc_hid_dim!
        model = RNNBasePMISeq2Seq(ENC_HID_DIM, enc, dec, src_embedding,
                                  tgt_embedding, generator,
                                  layernorm).to(device)
        from scipy import sparse
        train_pmi = sparse.load_npz(
            os.path.join(args.dataset_dir_path, "train_sparse_pmi_matrix.npz"))
        #  valid_pmi = sparse.load_npz(os.path.join(args.dataset_dir_path, "valid_sparse_pmi_matrix.npz")) # valid/test PMI seem unnecessary; using them would amount to label leakage?
        #  test_pmi = sparse.load_npz(os.path.join(args.dataset_dir_path, "test_sparse_pmi_matrix.npz"))

    if args.model_path is not None:
        logger.info(f"Restore model from {args.model_path}...")
        #  model.load_state_dict(torch.load(args.model_path, map_location={'cuda:0': 'cuda:' + str(args.gpu)}))
        model = torch.load(args.model_path,
                           map_location={'cuda:0': 'cuda:' + str(args.gpu)})
        model.to(args.device)

    print(model)
    weight = torch.ones(len(TGT.vocab), device=args.device)
    weight[TGT_PAD_IDX] = 0
    criterion = nn.NLLLoss(reduction='sum',
                           ignore_index=TGT_PAD_IDX,
                           weight=weight)
    #  criterion = LabelSmoothing(args, len(TGT.vocab), padding_idx=TGT_PAD_IDX, smoothing=args.smoothing)

    if args.inference:
        if args.model_path is None:
            # without a trained checkpoint there is nothing meaningful to decode
            logger.error(
                "If you want to do inference, you must provide a trained model's path!"
            )
            return 1
        inference(args,
                  model,
                  dataset['valid_iterator'],
                  fields=dataset['fields'],
                  mode='valid',
                  pmi=train_pmi)
        inference(args,
                  model,
                  dataset['test_iterator'],
                  fields=dataset['fields'],
                  mode='test',
                  pmi=train_pmi)
        return 0

    print(f'The model has {count_parameters(model):,} trainable parameters')
    optimizer = AdamOptimizer(model.parameters(),
                              lr=args.lr,
                              weight_decay=args.l2,
                              max_grad_norm=args.clip)
    #  optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.l2)
    if args.warmup > 0:
        optimizer = NoamOptimWrapper(args.hid_dim, 1, args.warmup, optimizer)
    if args.global_step > 0:
        logger.info(f'Global step start from {args.global_step}')
        optimizer._step = args.global_step

    # TODO: stop hard-coding how the best metrics are saved
    best_global_step = 0
    best_valid_loss = float('inf')
    best_test_loss = float('inf')
    global_step = optimizer._step
    patience = args.patience if args.patience else float('inf')
    no_improve = 0

    for epoch in range(N_EPOCHS):
        start_time = time.time()

        train_metrics = train(args,
                              model,
                              dataset['train_iterator'],
                              optimizer,
                              criterion,
                              fields=dataset['fields'],
                              writer=writer,
                              pmi=train_pmi)
        global_step += len(dataset['train_iterator'])
        args.global_step = global_step
        valid_metrics = evaluate(args,
                                 model,
                                 dataset['valid_iterator'],
                                 criterion,
                                 fields=dataset['fields'],
                                 pmi=train_pmi)
        test_metrics = evaluate(args,
                                model,
                                dataset['test_iterator'],
                                criterion,
                                fields=dataset['fields'],
                                pmi=train_pmi)
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        print(
            f'Epoch: {epoch + 1:02} | Global step: {global_step} | Time: {epoch_mins}m {epoch_secs}s'
        )
        for metrics, mode in zip([train_metrics, valid_metrics, test_metrics],
                                 ['Train', 'Valid', 'Test']):
            print_metrics(metrics, mode=mode)

        # TODO: improve how the log file is stored; remove the hard-coded pattern
        if args.save:
            write_metrics_to_writer(valid_metrics,
                                    writer,
                                    global_step,
                                    mode='Valid')
            write_metrics_to_writer(test_metrics,
                                    writer,
                                    global_step,
                                    mode='Test')
            best_valid_loss = valid_metrics['epoch_loss'] if valid_metrics[
                'epoch_loss'] < best_valid_loss else best_valid_loss
            best_test_loss = test_metrics['epoch_loss'] if test_metrics[
                'epoch_loss'] < best_test_loss else best_test_loss
            best_global_step = global_step if valid_metrics[
                'epoch_loss'] == best_valid_loss else best_global_step

            if best_global_step == global_step:
                torch.save(
                    model,
                    os.path.join(save_dir,
                                 f'model_global_step-{global_step}.pt'))
                #  torch.save(model.state_dict(), os.path.join(save_dir, f'model_global_step-{global_step}.pt'))
                no_improve = 0
            else:
                no_improve += 1
            with open(
                    os.path.join(save_dir,
                                 f'log_global_step-{global_step}.txt'),
                    'w') as log_file:
                valid_metrics['Best Global Step'] = best_global_step
                valid_metrics['Best Loss'] = best_valid_loss

                test_metrics['Best Loss'] = best_test_loss
                test_metrics['Best PPL'] = math.exp(best_test_loss)

                inference(args,
                          model,
                          dataset['valid_iterator'],
                          fields=dataset['fields'],
                          mode='valid',
                          pmi=train_pmi)
                inference(args,
                          model,
                          dataset['test_iterator'],
                          fields=dataset['fields'],
                          mode='test',
                          pmi=train_pmi)

                valid_path_hyp = os.path.join(args.save_dir,
                                              'responses-valid.txt')
                test_path_hyp = os.path.join(args.save_dir,
                                             'responses-test.txt')
                valid_path_ref = os.path.join(args.save_dir,
                                              'answers-valid.txt')
                test_path_ref = os.path.join(args.save_dir, 'answers-test.txt')

                other_valid_metrics = calc_metrics(path_refs=valid_path_ref,
                                                   path_hyp=valid_path_hyp)
                other_test_metrics = calc_metrics(path_refs=test_path_ref,
                                                  path_hyp=test_path_hyp)
                valid_metrics.update(other_valid_metrics)
                test_metrics.update(other_test_metrics)

                os.remove(os.path.join(args.save_dir, 'posts-valid.txt'))
                os.remove(os.path.join(args.save_dir, 'posts-test.txt'))
                os.remove(valid_path_hyp)
                os.remove(valid_path_ref)
                os.remove(test_path_hyp)
                os.remove(test_path_ref)

                for metric, performance in valid_metrics.items():
                    log_file.write(f'Valid {metric}: {performance}\n')
                for metric, performance in test_metrics.items():
                    log_file.write(f'Test {metric}: {performance}\n')
            if no_improve >= patience:
                break
Example #16
    if len(examples) > 10000:
        # save to shards for training data
        shard_size = (len(examples) + 7) // 8
        for shard_id in range(8):
            write_to_tar(
                f"{output}-{shard_id}.tar",
                examples[shard_id * shard_size:(shard_id + 1) * shard_size],
            )
    else:
        write_to_tar(f"{output}.tar", examples)


if __name__ == "__main__":
    parser = ArgumentParser()
    add_options(parser)
    args = parser.parse_args()
    prepare_file(args)
    np.random.seed(42)

    text_tokenizer = BertTokenizer.from_pretrained(args.vocab_file)
    cond_tokenizer = spm.SentencePieceProcessor()
    cond_tokenizer.Load(args.cond_vocab_file)

    for split in args.split:
        preprocess_raw(
            input_prefix=os.path.join(args.raw_path, split),
            output=os.path.join(args.processed_path, f"{split}"),
            text_tokenizer=text_tokenizer,
            cond_tokenizer=cond_tokenizer,
        )