def main():
    args = parse_arguments()

    words = collect_words(args.input_dir)

    mlbt = BertTokenizer(args.model_dir, do_lower_case=False)
    hubt = BertTokenizer(args.vocab_file, do_lower_case=False)

    mlstats = count_wordpieces(words, mlbt)
    hustats = count_wordpieces(words, hubt)

    print(f'Multilingual: {mlstats}')
    print(f'Hungarian: {hustats}')
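The helpers parse_arguments, collect_words, and count_wordpieces are not shown in this snippet. Below is a minimal sketch of what count_wordpieces might look like, assuming it reports how finely each vocabulary splits the same word list; the names and returned fields are assumptions, not the original implementation.

def count_wordpieces(words, tokenizer):
    # Hypothetical helper: tokenize every word and count wordpieces and [UNK]s,
    # a rough proxy for how well the vocabulary covers the word list.
    pieces_total, unknown_total = 0, 0
    for word in words:
        pieces = tokenizer.tokenize(word)
        pieces_total += len(pieces)
        unknown_total += pieces.count(tokenizer.unk_token)
    return {
        'words': len(words),
        'wordpieces': pieces_total,
        'pieces_per_word': pieces_total / max(len(words), 1),
        'unk': unknown_total,
    }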
Example #2
    def test_data_collator_for_language_modeling(self):
        tokenizer = BertTokenizer(self.vocab_file)
        no_pad_features = [{
            "input_ids": list(range(10))
        }, {
            "input_ids": list(range(10))
        }]
        pad_features = [{
            "input_ids": list(range(5))
        }, {
            "input_ids": list(range(10))
        }]

        data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
        batch = data_collator(no_pad_features)
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))

        batch = data_collator(pad_features)
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))

        tokenizer._pad_token = None
        data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
        with self.assertRaises(ValueError):
            # Expect error due to padding token missing
            data_collator(pad_features)

        set_seed(42)  # For reproducibility
        tokenizer = BertTokenizer(self.vocab_file)
        data_collator = DataCollatorForLanguageModeling(tokenizer)
        batch = data_collator(no_pad_features)
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))

        masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
        self.assertTrue(torch.any(masked_tokens))
        self.assertTrue(
            all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))

        batch = data_collator(pad_features)
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))

        masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
        self.assertTrue(torch.any(masked_tokens))
        self.assertTrue(
            all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))
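For context on the -100 checks above (a general PyTorch convention, not something asserted by the test itself): positions labeled -100 are excluded from the masked-LM loss, because -100 is the default ignore_index of torch.nn.CrossEntropyLoss. A tiny illustration:

import torch

loss_fn = torch.nn.CrossEntropyLoss()  # ignore_index defaults to -100
logits = torch.randn(2, 30522)         # fake vocabulary-sized logits for 2 positions
labels = torch.tensor([-100, 5])       # the first position is ignored
loss = loss_fn(logits, labels)         # loss is computed from the second position only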
Example #3
def collate_fn(batch_data):
    tokenizer = BertTokenizer('./data/bert/nezha-base-www/vocab.txt')
    max_len = max([len(x[0]) for x in batch_data]) + 2
    input_ids, token_type_ids, attention_mask, labels = [], [], [], []
    for text, label in batch_data:
        inputs = tokenizer.encode_plus(text=text,
                                       max_length=max_len,
                                       pad_to_max_length=True,
                                       is_pretokenized=True,
                                       return_token_type_ids=True,
                                       return_attention_mask=True,
                                       truncation=True)
        label = tokenizer.encode_plus(text=label,
                                      max_length=max_len,
                                      pad_to_max_length=True,
                                      is_pretokenized=True,
                                      return_token_type_ids=False,
                                      return_attention_mask=False,
                                      truncation=True)
        input_ids.append(inputs['input_ids'])
        token_type_ids.append(inputs['token_type_ids'])
        attention_mask.append(inputs['attention_mask'])
        labels.append(label['input_ids'])
    input_ids = torch.tensor(input_ids).long()
    token_type_ids = torch.tensor(token_type_ids).long()
    attention_mask = torch.tensor(attention_mask).float()
    labels = torch.tensor(labels).long()
    return input_ids, token_type_ids, attention_mask, labels
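A usage sketch (assumed, not part of the original snippet): collate_fn is meant to be handed to a torch.utils.data.DataLoader, where train_dataset is a hypothetical dataset yielding (text, label) pairs of pre-tokenized token lists. Note that the snippet rebuilds the tokenizer on every batch; constructing it once outside collate_fn would be cheaper.

from torch.utils.data import DataLoader

# train_dataset is assumed to yield (text, label) pairs of token lists
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True,
                          collate_fn=collate_fn)
for input_ids, token_type_ids, attention_mask, labels in train_loader:
    pass  # feed the batch tensors to the model here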
Example #4
def main():
    config = get_config()
    args = train_args.setup_train_args()
    if args.seed:
        train_args.set_random_seed(args)
    # Initialize the tokenizer
    tokenizer = BertTokenizer(vocab_file=args.vocab_path)
    # Size of the tokenizer vocabulary
    global pad_id
    # pad_id = tokenizer.convert_tokens_to_ids(PAD)

    # Create the output directory for the dialogue model
    if not os.path.exists(args.dialogue_model_output_path):
        os.mkdir(args.dialogue_model_output_path)

    # Load the GPT-2 model
    model, n_ctx, optimizer = create_model(args, config)

    # Preprocess the raw data: convert the raw corpus into the corresponding token ids
    # Only needed when training the dialogue generation model
    print('Start producing tokens')
    # If the dataset has not changed there is no need to rerun preprocess_raw_data
    # before every training run, since the generated data would be identical
    if not os.path.exists(args.train_tokenized_path):
        # Create the tokenized-output file if it does not already exist
        with open(args.train_tokenized_path, 'w'):
            pass

    preprocess_data.preprocess_raw_data(args, tokenizer, n_ctx)
    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='train_accuracy')
    print('Start training')
    train(model, args, tokenizer, optimizer, train_loss, train_accuracy)
    print('Training finished')
Example #5
def createVocabulary(reciepts):
  vocab = set()
  for reciept in reciepts:
    words = reciept.dataWords
    for word in words:
      vocab.add(word)
  path = './data/prod_vocab.txt'
  with open(path, 'r') as f:
    for line in f:
      vocab.add(line[:-1])
  tokenizer = BertTokenizer(vocab_file=path, do_lower_case=False)
  new_set = set()
  for word in vocab:
    token_list = tokenizer.tokenize(word)
    if '[UNK]' in token_list:
      print(word)
      t = re.split(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?]', word)
      for i, v in enumerate(token_list):
        if v == '[UNK]' and i < len(t):
          for x in t:
            new_set.add(x)
  with open('./data/prod_vocab.txt', 'w+') as f:
    for word in (vocab.union(new_set)):
      f.write(word  + '\n')
  return vocab
def main():
    ## External argument settings
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_file_path', '-trfp', type=str)
    parser.add_argument('--test_file_path', '-tefp', type=str)
    parser.add_argument('--valid_file_path', '-vafp', type=str)
    parser.add_argument('--output_file_path', '-outfp', type=str)
    args = parser.parse_args()

    ## Default values

    tokenizer = BertTokenizer(vocab_file='bert-base-uncased-vocab.txt')
    nlp = spacy.load(
        "model/spacy/en_core_web_md-2.3.1/en_core_web_md/en_core_web_md-2.3.1")
    accepted_pos_list = get_accepted_pos_list()
    ##

    ## for debug
    # args.train_file_path="/user_data/Project/Controllable_Syntax_with_BERT/dataset/train_10.txt"
    # args.test_file_path="/user_data/Project/Controllable_Syntax_with_BERT/dataset/test_5.txt"
    # args.valid_file_path="/user_data/Project/Controllable_Syntax_with_BERT/dataset/validation_5.txt"
    # args.output_file_path="/user_data/Project/Controllable_Syntax_with_BERT/sequential_dataset"
    ##

    data_path_dict = {
        "train": args.train_file_path,
        "test": args.test_file_path,
        "validation": args.valid_file_path
    }
    for key, data_path in data_path_dict.items():
        print("get {0} data ...".format(key))
        semantic_list, syntactic_list = get_dataset_list(data_path)

        print(" get {0} all syntactic keyword list ...".format(key))
        all_syntactic_keyword_list = get_all_syntactic_keyword_list(
            syntactic_list, accepted_pos_list, tokenizer, nlp)

        print(" insert sep to {0} all syntactic keyword list ...".format(key))
        all_syntactic_keyword_with_sep_list = insert_sep_token(
            all_syntactic_keyword_list)

        print(" get  {0} all sequence sentence list ...".format(key))
        all_sequence_sentence_list = get_all_sequence_sentence_list(
            syntactic_list, tokenizer)

        print(" get {0} embeddings ...".format(key))
        token_embedding_id_list, segment_embedding_list, attention_embedding_list, maskLM_embedding_list = get_embeddings(
            semantic_list, syntactic_list, all_syntactic_keyword_with_sep_list,
            all_sequence_sentence_list, tokenizer)

        print(" convert to feature {0} embeddings ...".format(key))
        convert_embedding_to_feature(args.output_file_path, key,
                                     token_embedding_id_list,
                                     segment_embedding_list,
                                     attention_embedding_list,
                                     maskLM_embedding_list)

        print(" {0} data finished".format(key))

    return 0
    def build_model(self):
        """创建GPT-2生成模型
        """
        # 使用bert tokenizer # 初始化tokenizer
        self.tokenizer = BertTokenizer(vocab_file=self.args.vocab_path)
        # temp = self.tokenizer.convert_tokens_to_ids('')
        # print(self.tokenizer.convert_ids_to_tokens(temp))
        # Size of the tokenizer vocabulary
        self.vocab_size = len(self.tokenizer)

        self.pad_id = self.tokenizer.convert_tokens_to_ids(PAD)

        if self.args.pretrained_model:
            # If a pretrained GPT-2 model is specified, load it
            model = GPT2LMHeadModel.from_pretrained(self.args.pretrained_model)
        else:
            # Otherwise initialize the model from the config file
            model_config = GPT2Config.from_json_file(self.args.model_config)
            model = GPT2LMHeadModel(config=model_config)

        # Resize the GPT-2 embeddings to match the tokenizer's vocabulary size
        model.resize_token_embeddings(self.vocab_size)

        print('model config:\n{}'.format(model.config.to_json_string()))

        return model, model.config.to_dict().get("n_ctx")
Example #8
    def preprocess(self, data):
        """
        Receives text in form of json and converts it into an encoding for the inference stage

        :param data: Input to be passed through the layers for prediction

        :return: output - preprocessed encoding
        """

        text = data[0].get("data")
        if text is None:
            text = data[0].get("body")

        text = text.decode("utf-8")

        tokenizer = BertTokenizer(
            self.VOCAB_FILE)  # .from_pretrained("bert-base-cased")
        encoding = tokenizer.encode_plus(
            text,
            max_length=32,
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            return_token_type_ids=False,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt",  # Return PyTorch tensors
            truncation=True,
        )

        return encoding
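A hedged sketch of how the returned encoding might be consumed at the inference step; `model` below is an assumption standing in for whatever sequence-classification model this handler loads elsewhere.

import torch

# Assumed inference step, not part of preprocess itself
with torch.no_grad():
    outputs = model(input_ids=encoding["input_ids"],
                    attention_mask=encoding["attention_mask"])
    prediction = int(torch.argmax(outputs[0], dim=1))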
def chat(folder_bert, voc, testing=False):
    tf.random.set_seed(1)
    tokenizer = BertTokenizer(vocab_file=folder_bert + voc)
    if testing:
        tokens = tokenizer.tokenize("jeg tror det skal regne")
        print(tokens)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        print(ids)
        print("Vocab size:", len(tokenizer.vocab))

    config = BertConfig.from_json_file(folder_bert + "/config.json")
    model = BertLMHeadModel.from_pretrained(folder_bert, config=config)
    while (1):
        text = input(">>User: "******"Bot: {}".format(tokenizer.decode(sample_output[0])))
        print("Bot: {}".format(
            tokenizer.decode(sample_output[:, input_ids.shape[-1]:][0],
                             skip_special_tokens=True)))
 def __init__(self,
              pretrained_model: str,
              use_starting_offsets: bool = False,
              do_lowercase: bool = True,
              never_lowercase: List[str] = None,
              max_pieces: int = 512,
              truncate_long_sequences: bool = True) -> None:
     if pretrained_model.endswith("-cased") and do_lowercase:
         logger.warning("Your BERT model appears to be cased, "
                        "but your indexer is lowercasing tokens.")
     elif pretrained_model.endswith("-uncased") and not do_lowercase:
         logger.warning("Your BERT model appears to be uncased, "
                        "but your indexer is not lowercasing tokens.")
     if os.path.isdir(pretrained_model):
         pretrained_model = os.path.join(pretrained_model, 'vocab.txt')
     bert_tokenizer = BertTokenizer(pretrained_model,
                                    do_lower_case=do_lowercase)
     super().__init__(
         vocab=bert_tokenizer.vocab,
         wordpiece_tokenizer=bert_tokenizer.wordpiece_tokenizer.tokenize,
         namespace="bert",
         use_starting_offsets=use_starting_offsets,
         max_pieces=max_pieces,
         do_lowercase=do_lowercase,
         never_lowercase=never_lowercase,
         start_tokens=["[CLS]"],
         end_tokens=["[SEP]"],
         separator_token="[SEP]",
         truncate_long_sequences=truncate_long_sequences)
Example #11
    def test_sop(self):
        tokenizer = BertTokenizer(self.vocab_file)
        features = [{
            "input_ids": torch.tensor([0, 1, 2, 3, 4]),
            "token_type_ids": torch.tensor([0, 1, 2, 3, 4]),
            "sentence_order_label": i,
        } for i in range(2)]
        data_collator = DataCollatorForLanguageModeling(tokenizer)
        batch = data_collator(features)

        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 5)))
        self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 5)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 5)))
        self.assertEqual(batch["sentence_order_label"].shape, torch.Size(
            (2, )))

        data_collator = DataCollatorForLanguageModeling(tokenizer,
                                                        pad_to_multiple_of=8)
        batch = data_collator(features)

        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 8)))
        self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 8)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 8)))
        self.assertEqual(batch["sentence_order_label"].shape, torch.Size(
            (2, )))
    def test_data_collator_with_padding(self):
        tokenizer = BertTokenizer(self.vocab_file)
        features = [{
            "input_ids": [0, 1, 2]
        }, {
            "input_ids": [0, 1, 2, 3, 4, 5]
        }]

        data_collator = DataCollatorWithPadding(tokenizer, return_tensors="np")
        batch = data_collator(features)
        self.assertEqual(batch["input_ids"].shape, (2, 6))
        self.assertEqual(batch["input_ids"][0].tolist(),
                         [0, 1, 2] + [tokenizer.pad_token_id] * 3)

        data_collator = DataCollatorWithPadding(tokenizer,
                                                padding="max_length",
                                                max_length=10,
                                                return_tensors="np")
        batch = data_collator(features)
        self.assertEqual(batch["input_ids"].shape, (2, 10))

        data_collator = DataCollatorWithPadding(tokenizer,
                                                pad_to_multiple_of=8,
                                                return_tensors="np")
        batch = data_collator(features)
        self.assertEqual(batch["input_ids"].shape, (2, 8))
    def test_sop(self):
        tokenizer = BertTokenizer(self.vocab_file)
        features = [{
            "input_ids": tf.convert_to_tensor([0, 1, 2, 3, 4]),
            "token_type_ids": tf.convert_to_tensor([0, 1, 2, 3, 4]),
            "sentence_order_label": i,
        } for i in range(2)]
        data_collator = DataCollatorForLanguageModeling(tokenizer,
                                                        return_tensors="tf")
        batch = data_collator(features)

        self.assertEqual(batch["input_ids"].shape.as_list(), [2, 5])
        self.assertEqual(batch["token_type_ids"].shape.as_list(), [2, 5])
        self.assertEqual(batch["labels"].shape.as_list(), [2, 5])
        self.assertEqual(batch["sentence_order_label"].shape.as_list(), [2])

        data_collator = DataCollatorForLanguageModeling(tokenizer,
                                                        pad_to_multiple_of=8,
                                                        return_tensors="tf")
        batch = data_collator(features)

        self.assertEqual(batch["input_ids"].shape.as_list(), [2, 8])
        self.assertEqual(batch["token_type_ids"].shape.as_list(), [2, 8])
        self.assertEqual(batch["labels"].shape.as_list(), [2, 8])
        self.assertEqual(batch["sentence_order_label"].shape.as_list(), [2])
    def test_nsp(self):
        tokenizer = BertTokenizer(self.vocab_file)
        features = [{
            "input_ids": [0, 1, 2, 3, 4],
            "token_type_ids": [0, 1, 2, 3, 4],
            "next_sentence_label": i
        } for i in range(2)]
        data_collator = DataCollatorForLanguageModeling(tokenizer,
                                                        return_tensors="np")
        batch = data_collator(features)

        self.assertEqual(batch["input_ids"].shape, (2, 5))
        self.assertEqual(batch["token_type_ids"].shape, (2, 5))
        self.assertEqual(batch["labels"].shape, (2, 5))
        self.assertEqual(batch["next_sentence_label"].shape, (2, ))

        data_collator = DataCollatorForLanguageModeling(tokenizer,
                                                        pad_to_multiple_of=8,
                                                        return_tensors="np")
        batch = data_collator(features)

        self.assertEqual(batch["input_ids"].shape, (2, 8))
        self.assertEqual(batch["token_type_ids"].shape, (2, 8))
        self.assertEqual(batch["labels"].shape, (2, 8))
        self.assertEqual(batch["next_sentence_label"].shape, (2, ))
Example #15
def convert_data_to_context(filepath, dataset):
    DRCD = LoadJson(filepath)
    tokenizer = BertTokenizer(vocab_file='bert-base-uncased-vocab.txt')

    # context_tokens = []
    # context_loss_tokens = []
    sample = []
    keyword_tokens = []

    # BertForMaskedLM
    for data in DRCD["data"]:
        for paragraph in data["paragraphs"]:
            context = paragraph["context"]
            little_context = context[:128]
            sample.append(little_context)

    index = round(len(sample) * 0.25)
    if dataset == "test1":
        small_sample = sample[:index]
    elif dataset == "test2":
        small_sample = sample[index:index * 2]
    elif dataset == "test3":
        small_sample = sample[index * 2:index * 3]
    else:
        small_sample = sample[index * 3:]

    for c in small_sample:
        # c_c = conversion_context(c, tokenizer, context_loss_tokens)
        k = context_keyword(c)
        # context_tokens.append(c_c)
        keyword_tokens.append(k[0])

    # return context_tokens

    return keyword_tokens
    def explain_handle(self, model_wraper, text, target=1):  # pylint: disable=too-many-locals,unused-argument,arguments-differ
        """Captum explanations handler.

        Args:
            data_preprocess (Torch Tensor): Preprocessed data to be used for captum
            raw_data (list): The unprocessed data to get target from the request
        Returns:
            dict : A dictionary response with the explanations response.
        """
        model_wrapper = AGNewsmodelWrapper(self.model)
        tokenizer = BertTokenizer(self.vocab_file)
        model_wrapper.eval()
        model_wrapper.zero_grad()
        input_ids = torch.tensor(
            [tokenizer.encode(self.text, add_special_tokens=True)])
        input_embedding_test = model_wrapper.model.bert_model.embeddings(
            input_ids)
        preds = model_wrapper(input_embedding_test)
        out = np.argmax(preds.cpu().detach(), axis=1)
        out = out.item()
        ig_1 = IntegratedGradients(model_wrapper)
        attributions, delta = ig_1.attribute(  # pylint: disable=no-member
            input_embedding_test,
            n_steps=500,
            return_convergence_delta=True,
            target=1,
        )
        tokens = tokenizer.convert_ids_to_tokens(input_ids[0].numpy().tolist())
        feature_imp_dict = {}
        feature_imp_dict["words"] = tokens
        attributions_sum = self.summarize_attributions(attributions)
        feature_imp_dict["importances"] = attributions_sum.tolist()
        feature_imp_dict["delta"] = delta[0].tolist()
        return [feature_imp_dict]
Example #17
 def featurize(self, df):
     bert_model = BertModel.from_pretrained(self.data_path)
     bert_tokenizer = BertTokenizer(self.data_path + "/vocab.txt",
                                    do_lower_case=False,
                                    do_basic_tokenize=False)
     mecab = MeCab.Tagger('-Ochasen')
     data_list = df.rdd.collect()
     label_list = []
     vec_list = []
     for data in data_list:
         tmp_list = []
         node_list = data[1]
         for word in node_list:
             tmp_list.append(word)
         if len(tmp_list) != 0:
             label_list.append(float(data[0]))
             bert_tokens = bert_tokenizer.tokenize(
                 " ".join(["[CLS]"] + tmp_list + ["[SEP]"]))
             token_ids = bert_tokenizer.convert_tokens_to_ids(bert_tokens)
             tokens_tensor = torch.tensor(token_ids).unsqueeze(0)
             all_outputs = bert_model(tokens_tensor)
             embedding = all_outputs[-2].detach().numpy()[0]
             vec = np.mean(embedding, axis=0).tolist()
             vec_list.append(Vectors.dense(vec))
     zip_list = zip(label_list, vec_list)
     new_df = self.spark.createDataFrame(zip_list, ("label", "features"))
     return new_df
def tokenize_and_pad_samples(genes, labels):
    k = len(genes[0][0])
    if k == 4:
        kmer_filepath = '/home/brian/Downloads/fourmers.txt'
    elif k == 6:
        kmer_filepath = '/home/brian/Downloads/hexamers.txt'
    elif k == 8:
        kmer_filepath = '/home/brian/Downloads/octamers.txt'
    else:
        raise ValueError('Unsupported k-mer length: {}'.format(k))
    formatted_samples = [['[CLS]'] + sample + ['[SEP]'] for sample in genes]
    formatted_labels = [[0] + l + [0] for l in labels]
    tokenizer = BertTokenizer(kmer_filepath, max_len=MAX_LEN)
    print("TOKENIZER LENGTH", len(tokenizer))
    attention_masks = [
        np.concatenate([np.ones(len(l)),
                        np.zeros(MAX_LEN - len(l))]) for l in formatted_labels
    ]
    #seq_ids = tokenizer.convert_tokens_to_ids(formatted_samples)
    seq_ids = [
        tokenizer.convert_tokens_to_ids(sample) for sample in formatted_samples
    ]
    seq_ids = pad_sequences(seq_ids,
                            maxlen=MAX_LEN,
                            truncating='post',
                            padding='post')

    return seq_ids, attention_masks, formatted_labels
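A possible follow-up (an assumption, not shown in the snippet): wrapping the padded ids and masks into PyTorch tensors for training, where genes and labels stand for the caller's k-mer samples and per-token labels.

import numpy as np
import torch

seq_ids, attention_masks, formatted_labels = tokenize_and_pad_samples(genes, labels)
input_tensor = torch.tensor(seq_ids, dtype=torch.long)
mask_tensor = torch.tensor(np.array(attention_masks), dtype=torch.float)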
    def __init__(self, context: PyTorchTrialContext) -> None:
        # Read configuration
        self.context = context
        self.data_config = self.context.get_data_config()

        # Create Tensorboard logger
        self.logger = TorchWriter()

        # Create tokenizer based on the predefined vocabulary
        self.tokenizer = BertTokenizer(self.data_config["voc_path"], do_lower_case=False)

        # Label Encoder
        if self.context.get_hparam("reduce_to_binary_problem"):
            class_num = 2
        else:
            class_num = 6

        # Initialize model and wrap it in the determined api
        model = ProtTransClassification(self.data_config["pretrained_path"],
                                        class_num=class_num,
                                        classification_feature=self.context.get_hparam("classification_feature"),
                                        dropout=self.context.get_hparam("classification_dropout"),
                                        freeze_bert=self.context.get_hparam("bert_freeze"))

        optimizer = Lamb([{"params": model.wordencoding.parameters(), "lr": self.context.get_hparam("bert_lr")},
                          {"params": model.classification.parameters()}], lr=self.context.get_hparam("classification_lr"))

        self.model = self.context.wrap_model(model)
        self.optimizer = self.context.wrap_optimizer(optimizer)
Example #20
    def __init__(self,
                 squad_model_config: str,
                 vocab_file: str,
                 do_lower_case: bool,
                 max_seq_length: int = 512,
                 batch_size: int = 10,
                 lang: str = 'en',
                 **kwargs) -> None:
        config = json.load(open(squad_model_config))
        config['chainer']['pipe'][0]['max_seq_length'] = max_seq_length
        self.model = build_model(config)
        self.max_seq_length = max_seq_length

        if Path(vocab_file).is_file():
            vocab_file = str(expand_path(vocab_file))
            self.tokenizer = BertTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
        else:
            self.tokenizer = BertTokenizer.from_pretrained(
                vocab_file, do_lower_case=do_lower_case)

        self.batch_size = batch_size

        if lang == 'en':
            from nltk import sent_tokenize
            self.sent_tokenizer = sent_tokenize
        elif lang == 'ru':
            from ru_sent_tokenize import ru_sent_tokenize
            self.sent_tokenizer = ru_sent_tokenize
        else:
            raise RuntimeError('en and ru languages are supported only')
Example #21
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_file_path', '-trfp', type=str)
    parser.add_argument('--test_file_path', '-tefp', type=str)
    args = parser.parse_args()

    ## for debug
    if args.train_file_path is None:
        args.train_file_path = "dataset/mingda_train_10.txt"
    if args.test_file_path is None:
        args.test_file_path = "dataset/mingda_test_5.txt"
    ##

    ## pre-load
    tokenizer = BertTokenizer(vocab_file='bert-base-uncased-vocab.txt')
    nlp = spacy.load(
        "model/spacy/en_core_web_md-2.3.1/en_core_web_md/en_core_web_md-2.3.1")
    accepted_pos_list = get_accepted_pos_list()
    ##
    data_path_dict = {
        "train": args.train_file_path,
        "test": args.test_file_path
    }
    for key, data_path in data_path_dict.items():
        print("get {0} data ...".format(key))
        semantics_list, syntactic_list = get_dataset_list(data_path)
        print("get {0} all syntactic keyword list ...".format(key))
        extrapolate_sequential_syntactic(syntactic_list, accepted_pos_list,
                                         tokenizer, nlp)
        print("")
Example #22
def bert_pretraining(dataset, config):
    bert_tokenizer = BertTokenizer('./bert-base-chinese' + '/vocab.txt')
    model = BertModel.from_pretrained('./bert-base-chinese')
    model.eval()
    model.to(config.device)

    for batch in batch_slice(dataset, config.train_batch_size):
        tokens_tensor = []

        for instance in batch:
            instance.ids = bert_tokenizer.convert_tokens_to_ids(instance.chars)
            tokens_tensor.append(torch.tensor(instance.ids))

        tokens_tensor = pad_sequence(tokens_tensor).T
        attention_mask = torch.ne(tokens_tensor,
                                  torch.zeros_like(tokens_tensor))

        tokens_tensor = tokens_tensor.to(config.device)
        attention_mask = attention_mask.to(config.device)

        with torch.no_grad():
            outputs = model(tokens_tensor, attention_mask=attention_mask)
            encoded_layers = outputs[0]

        for index, instance in enumerate(batch):
            instance.embeddings = encoded_layers[
                index, 0:len(instance.ids), :].cpu().numpy()
Example #23
    def __init__(self, model_file=DEFAULT_MODEL_URL, name="Dialog"):
        super(Dialog, self).__init__(name=name)
        if not os.path.exists(os.path.join(DEFAULT_DIRECTORY,'multiwoz/data')):
            os.mkdir(os.path.join(DEFAULT_DIRECTORY,'multiwoz/data'))
            ### download multiwoz data
            print('download data from', DEFAULT_ARCHIVE_FILE_URL)
        if not os.path.exists(os.path.join(DEFAULT_DIRECTORY,'multiwoz/save')):
            os.mkdir(os.path.join(DEFAULT_DIRECTORY,'multiwoz/save'))
            ### download trained model
            print('download model from', DEFAULT_MODEL_URL)
        model_path = ""

        config = Config()
        parser = config.parser
        config = parser.parse_args()
        with open("assets/never_split.txt") as f:
            never_split = f.read().split("\n")
        self.tokenizer = BertTokenizer("assets/vocab.txt", never_split=never_split)
        self.nlu = BERTNLU()
        self.dst_ = DST(config).cuda()
        ckpt = torch.load("save/model_Sun_Jun_21_07:08:48_2020.pt", map_location = lambda storage, loc: storage.cuda(local_rank))
        self.dst_.load_state_dict(ckpt["model"])
        self.dst_.eval()
        self.policy = RulePolicy()
        self.nlg = TemplateNLG(is_user=False)
        self.init_session()
        self.slot_mapping = {
            "leave": "leaveAt",
            "arrive": "arriveBy"
        }
Example #24
def create_Bert_tokenizer(use_pretrained=True, **kwargs):
    if use_pretrained:
        if 'model' not in kwargs:
            raise ValueError("Need a 'model' key to select the pretrained model")
        Path = ModelConfig.Bert_Pretrained_Model_Map.get(kwargs['model'], None)
        if Path is None:
            raise ValueError("Unknown model; please choose a supported one")
        tokenizer_path = os.path.join(Path, 'vocab.txt')
        Tokenizer = BertTokenizer(tokenizer_path)
    else:
        if 'vocab_file' not in kwargs:
            raise ValueError("Please input vocab file path")
        path = kwargs.get('vocab_file')
        Tokenizer = BertTokenizer(path)

    return Tokenizer
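Usage sketch (assumed; the 'model' key below is only an illustration of what ModelConfig.Bert_Pretrained_Model_Map might contain):

# Either select a configured pretrained vocab or pass a vocab file explicitly
tokenizer_a = create_Bert_tokenizer(use_pretrained=True, model='bert-base-chinese')
tokenizer_b = create_Bert_tokenizer(use_pretrained=False, vocab_file='./vocab.txt')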
Example #25
    def build_data(self):
        self.tokenizer = BertTokenizer(vocab_file=self.args.vocab_path)
        self.vocab_size = len(self.tokenizer)
        self.pad_id = self.tokenizer.convert_tokens_to_ids('[PAD]')

        # Preprocess the raw data: convert the raw corpus into the corresponding token ids
        if self.args.raw:
            for subset in ['train', 'valid', 'test']:
                self.preprocess_raw_data(subset)
        # Load the tokenized data
        self.subset2data = {}
        with open(self.args.test_tokenized_path, "r", encoding="utf8") as f:
            self.subset2data['test'] = f.read()
        if not self.args.do_eval:
            with open(self.args.train_tokenized_path, "r",
                      encoding="utf8") as f:
                self.subset2data['train'] = f.read()
            with open(self.args.valid_tokenized_path, "r",
                      encoding="utf8") as f:
                self.subset2data['valid'] = f.read()
        # What is this step for? (It splits each subset into one conversation per line.)
        for subset in self.subset2data:
            self.subset2data[subset] = self.subset2data[subset].split("\n")

        self.logger.info("Train/Valid/Test set has {} convs".format(
            [len(self.subset2data[subset]) for subset in self.subset2data]))
Example #26
    def pre_proc(self, examples):
        self.max_seq_length = 384
        self.max_query_length = 64
        self.doc_stride = 128
        eval_features = []
        cache_path = 'eval_features.pickle'
        # Load features if cached, convert from examples otherwise.
        if os.path.exists(cache_path):
            log.info("Loading cached features from '%s'..." % cache_path)
            with open(cache_path, 'rb') as cache_file:
                eval_features = pickle.load(cache_file)
        else:
            log.info("Creating tokenizer...")
            tokenizer = BertTokenizer(self.vocab_path)

            log.info("Converting examples to features...")

            def append_feature(feature):
                eval_features.append(feature)

            convert_examples_to_features(
                examples=examples,
                tokenizer=tokenizer,
                max_seq_length=self.max_seq_length,
                doc_stride=self.doc_stride,
                max_query_length=self.max_query_length,
                is_training=False,
                output_fn=append_feature,
                verbose_logging=False)

            log.info("Caching features at '%s'..." % cache_path)
            with open(cache_path, 'wb') as cache_file:
                pickle.dump(eval_features, cache_file)
        print("len(eval_features)", len(eval_features))
        return eval_features
Example #27
    def test_plm(self):
        tokenizer = BertTokenizer(self.vocab_file)
        no_pad_features = [{
            "input_ids": list(range(10))
        }, {
            "input_ids": list(range(10))
        }]
        pad_features = [{
            "input_ids": list(range(5))
        }, {
            "input_ids": list(range(10))
        }]

        data_collator = DataCollatorForPermutationLanguageModeling(tokenizer)

        batch = data_collator(pad_features)
        self.assertIsInstance(batch, dict)
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
        self.assertEqual(batch["perm_mask"].shape, torch.Size((2, 10, 10)))
        self.assertEqual(batch["target_mapping"].shape, torch.Size(
            (2, 10, 10)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))

        batch = data_collator(no_pad_features)
        self.assertIsInstance(batch, dict)
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
        self.assertEqual(batch["perm_mask"].shape, torch.Size((2, 10, 10)))
        self.assertEqual(batch["target_mapping"].shape, torch.Size(
            (2, 10, 10)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))

        example = [torch.randint(5, [5])]
        with self.assertRaises(ValueError):
            # Expect error due to odd sequence length
            data_collator(example)
    def __init__(self, path, vocab_path):
        self.path = path
        self.csv = pd.read_csv(self.path, encoding="cp949")

        self.normalize()

        self.param = self.csv.keys()
        self.data = self.csv.to_numpy().astype(np.int32).astype(str)

        self.train_data = None
        self.val_data = None
        self.test_data = None

        self.train_labels = None
        self.val_labels = None
        self.test_labels = None

        self.split_data()

        self.tokenizer = BertTokenizer(vocab_file=vocab_path,
                                       tokenize_chinese_chars=False)
        self.train_encoding = self.tokenizer(*get_token_param(self.train_data),
                                             return_tensors="pt")
        self.val_encoding = self.tokenizer(*get_token_param(self.val_data),
                                           return_tensors="pt")
        self.test_encoding = self.tokenizer(self.test_data[:, :-7].tolist(),
                                            self.test_data[:, -7:].tolist(),
                                            return_tensors="pt")

        self.train_dataset = FoodDataset(self.train_encoding,
                                         self.train_labels)
        self.vat_dataset = FoodDataset(self.val_encoding, self.val_labels)
        self.test_dataset = FoodDataset(self.test_encoding, self.test_labels)
Example #29
 def __init__(self,
              image_root: str,
              scibert_path: str,
              lazy: bool = False,
              limit: int = None,
              max_sequence_length: int = 512,
              different_type_for_refs: bool = True,
              use_refs: bool = True):
     super().__init__(lazy)
     self.image_root = image_root
     config = BertConfig.from_json_file(
         os.path.join(scibert_path, 'config.json'))
     self.tokenizer = BertTokenizer(config=config,
                                    vocab_file=os.path.join(
                                        scibert_path, 'vocab.txt'))
     self.token_indexer = {
         'tokens':
         BertFromConfigIndexer(config=config,
                               vocab_path=os.path.join(
                                   scibert_path, 'vocab.txt'),
                               namespace='bert_tokens')
     }
     expected_img_size = 224
     self.image_transform = transforms.Compose([
         transforms.Resize(expected_img_size),
         transforms.CenterCrop(expected_img_size),
         transforms.ToTensor(),
         transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
     ])
     self.use_refs = use_refs
     self.different_type_for_refs = different_type_for_refs
     self.limit = limit
     self.max_sequence_length = max_sequence_length
     self.word_tokenizer = WordTokenizer()
     self.caption_field = "caption"
Example #30
def acs_predict():

    dataset_dir = "../../datasets/acs-20210530-gold"

    # tokenizer
    tokenizer = BertTokenizer(
        "../../weights/biobert-pt-v1.0-pubmed-pmc/vocab.txt",
        do_lower_case=False)

    net = EndToEnd("../../weights/biobert-pt-v1.0-pubmed-pmc")
    net.load_state_dict(
        torch.load("../../weights/chemprot-cls-end-to-end/3layer-e2e-2"))
    net = net.cuda()
    net.eval()

    for pub_num in tqdm(os.listdir(
            dataset_dir)):  # find all data folder in the dataset directory
        article_dir = os.path.join(dataset_dir, pub_num)
        assert os.path.isdir(article_dir)
        dataset = ACSDataset(data_path=os.path.join(article_dir,
                                                    "re_input.tsv"),
                             tokenizer=tokenizer,
                             max_seq_len=128)
        dataloader = DataLoader(dataset=dataset,
                                batch_size=256,
                                num_workers=8,
                                shuffle=False,
                                collate_fn=acs_collate_fn)
        output = predict_net(pub_num, net, dataloader)
        with open(os.path.join(article_dir, "re_output.tsv"),
                  "w",
                  encoding="utf8") as fout:
            fout.write("id1\tid2\tclass\tconfidence\n")
            for _, (id1, id2, pred, score) in enumerate(output):
                fout.write(f"{id1}\t{id2}\t{pred}\t{score}\n")