Example #1
 def get_tokenizer(self):
     if self.hparams.model_type == 'bert':
         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
     elif self.hparams.model_type == 'bert-cased':
         tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
     elif self.hparams.model_type == 'bert-large':
         tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
     elif self.hparams.model_type == 'distilbert':
         tokenizer = DistilBertTokenizer.from_pretrained(
             'distilbert-base-uncased')
     elif self.hparams.model_type == 'roberta':
         tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
     elif self.hparams.model_type == 'roberta-large':
         tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
     elif self.hparams.model_type == 'albert':
         tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
     elif self.hparams.model_type == 'albert-xxlarge':
         tokenizer = AlbertTokenizer.from_pretrained('albert-xxlarge-v2')
     elif self.hparams.model_type == 'electra':
         tokenizer = ElectraTokenizer.from_pretrained(
             'google/electra-base-discriminator')
     elif self.hparams.model_type == 'electra-large':
         tokenizer = ElectraTokenizer.from_pretrained(
             'google/electra-large-discriminator')
     else:
         raise ValueError(f"Unknown model_type: {self.hparams.model_type}")
     return tokenizer
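Since every branch above just pairs a tokenizer class with a checkpoint name, the same mapping can be written table-driven. A standalone sketch (a free function rather than a method) under that assumption:

from transformers import (AlbertTokenizer, BertTokenizer,
                          DistilBertTokenizer, ElectraTokenizer,
                          RobertaTokenizer)

# Same (tokenizer class, checkpoint) pairs as the elif chain above.
TOKENIZERS = {
    'bert': (BertTokenizer, 'bert-base-uncased'),
    'bert-cased': (BertTokenizer, 'bert-base-cased'),
    'bert-large': (BertTokenizer, 'bert-large-uncased'),
    'distilbert': (DistilBertTokenizer, 'distilbert-base-uncased'),
    'roberta': (RobertaTokenizer, 'roberta-base'),
    'roberta-large': (RobertaTokenizer, 'roberta-large'),
    'albert': (AlbertTokenizer, 'albert-base-v2'),
    'albert-xxlarge': (AlbertTokenizer, 'albert-xxlarge-v2'),
    'electra': (ElectraTokenizer, 'google/electra-base-discriminator'),
    'electra-large': (ElectraTokenizer, 'google/electra-large-discriminator'),
}

def get_tokenizer(model_type):
    try:
        cls, name = TOKENIZERS[model_type]
    except KeyError:
        raise ValueError(f"Unknown model_type: {model_type!r}")
    return cls.from_pretrained(name)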
Example #2
 def _init_deep_model(self, model_type, model_path, num_labels, num_regs=None):
     if 'roberta' in model_type:
         tokenizer = RobertaTokenizer.from_pretrained(model_path)
         config = RobertaConfig.from_pretrained(model_path)
         config.num_labels = num_labels
         model = RobertaForSequenceClassification.from_pretrained(model_path, config=config)
         model.eval()
         model.to(self.device)
     elif 'electra_multitask' in model_type:
         tokenizer = ElectraTokenizer.from_pretrained(model_path)
         tokenizer.add_special_tokens({'additional_special_tokens': ['[VALUES]']})
         config = ElectraConfig.from_pretrained(model_path)
         config.num_labels = num_labels
         config.num_regs = num_regs
         config.vocab_size = len(tokenizer)
         model = ElectraForSequenceClassificationMultiTask.from_pretrained(model_path, config=config)
         model.eval()
         model.to(self.device)
     elif 'electra' in model_type:
         tokenizer = ElectraTokenizer.from_pretrained(model_path)
         config = ElectraConfig.from_pretrained(model_path)
         config.num_labels = num_labels
         model = ElectraForSequenceClassification.from_pretrained(model_path, config=config)
         model.eval()
         model.to(self.device)
     else:
         raise NotImplementedError()
     return config, tokenizer, model
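A minimal standalone sketch of the plain-ELECTRA branch above, using the public base discriminator as a stand-in for `model_path` (the classification head is freshly initialized when the checkpoint has none, so transformers will warn):

import torch
from transformers import (ElectraConfig, ElectraTokenizer,
                          ElectraForSequenceClassification)

model_path = 'google/electra-base-discriminator'
tokenizer = ElectraTokenizer.from_pretrained(model_path)
config = ElectraConfig.from_pretrained(model_path)
config.num_labels = 2  # e.g. binary classification
model = ElectraForSequenceClassification.from_pretrained(model_path, config=config)
model.eval()
model.to(torch.device('cpu'))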
Example #3
def load_model_and_tokenizer(lm, model_dir=None):
    if lm == 'roberta':
        model_path = model_dir if model_dir else 'roberta-base'
        model = RobertaForMatres.from_pretrained(model_path)
        tokenizer = RobertaTokenizer.from_pretrained(model_path)
    elif lm == 'bert':
        model_path = model_dir if model_dir else 'bert-base-uncased'
        model = BertForMatres.from_pretrained(model_path)
        tokenizer = BertTokenizer.from_pretrained(model_path)
    elif lm == 'bert-large':
        model_path = model_dir if model_dir else 'bert-large-uncased'
        model = BertForMatres.from_pretrained(model_path)
        tokenizer = BertTokenizer.from_pretrained(model_path)
    elif lm == 'electra':
        model_path = model_dir if model_dir else 'google/electra-base-discriminator'
        model = ElectraForMatres.from_pretrained(model_path)
        tokenizer = ElectraTokenizer.from_pretrained(model_path)
    elif lm == 'electra-large':
        model_path = model_dir if model_dir else 'google/electra-large-discriminator'
        model = ElectraForMatres.from_pretrained(model_path)
        tokenizer = ElectraTokenizer.from_pretrained(model_path)
    else:
        raise RuntimeError(
            "Please specify valid model from {bert, bert-large, roberta, electra, electra-large}."
        )
    return model, tokenizer
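Hypothetical usage of the loader above (the `*ForMatres` model classes are project-specific and assumed importable):

# Load hub defaults, or point at a fine-tuned checkpoint directory:
model, tokenizer = load_model_and_tokenizer('electra')
model, tokenizer = load_model_and_tokenizer('bert', model_dir='./my_bert_ckpt')  # './my_bert_ckpt' is a placeholder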
Example #4
def main(args):
  """
    주어진 dataset tsv 파일과 같은 형태일 경우 inference 가능한 코드입니다.
  """
  device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
  # load tokenizer
  TOK_NAME = "monologg/koelectra-base-v3-discriminator" 
  #tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)
  tokenizer = ElectraTokenizer.from_pretrained(TOK_NAME)

  # load my model
  MODEL_NAME = args.model_dir # model dir.
  model = ElectraForSequenceClassification.from_pretrained(MODEL_NAME)
  model.to(device)

  # load test datset
  test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
  test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
  test_dataset = RE_Dataset(test_dataset, test_label)

  # predict answer
  logits, predictions = inference(model, test_dataset, device)
  # make csv file with predicted answer
  # Please keep the output directory and column format below.

  output = pd.DataFrame(predictions, columns=['pred'])
  output.to_csv('./prediction/koelectra-submission6.csv', index=False)
Example #5
def test_mutual_dataset(args):
    tokenizer = ElectraTokenizer.from_pretrained(
        "google/electra-small-discriminator",
        do_lower_case=True,
        use_fast=True)
    data = MutualDataset(args.data_dir, "train", tokenizer)
    assert (len(data) == 7088 * 4)
Example #6
    def __init__(self):
        self.root_path = '..'
        self.checkpoint_path = f"{self.root_path}/checkpoint"
        self.save_ckpt_path = f"{self.checkpoint_path}/koelectra-wellnesee-text-classification.pth"
        model_name_or_path = "monologg/koelectra-base-discriminator"

        # Load the answers and categories
        self.category, self.answer = load_wellness_answer()

        ctx = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(ctx)

        # Load the saved checkpoint
        checkpoint = torch.load(self.save_ckpt_path, map_location=self.device)

        # Electra Tokenizer
        self.tokenizer = ElectraTokenizer.from_pretrained(model_name_or_path)

        electra_config = ElectraConfig.from_pretrained(model_name_or_path)
        self.model = koElectraForSequenceClassification.from_pretrained(
            pretrained_model_name_or_path=model_name_or_path,
            config=electra_config,
            num_labels=359)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model.to(self.device)
        self.model.eval()
Example #7
def pred_sm(args):
    #os.environ['TRANSFORMERS_CACHE'] = os.path.dirname(os.path.realpath(__file__))+'/cache/'

    model = load_model(device)
    tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator", do_lower_case=False)

    if os.path.exists(args.out_infer_file):
        os.remove(args.out_infer_file)

    out_infer_folder = os.path.dirname(args.out_infer_file)
    if not os.path.exists(out_infer_folder):
        os.makedirs(out_infer_folder)

    wf = open(args.out_infer_file, "a")
    resultData = []
    fileList = []
    files = os.listdir(os.path.dirname(os.path.realpath(__file__))+'/tmp_ASR')
    files.sort()
    for filename in tqdm(files):
        if os.path.splitext(filename)[1].lower() == '.txt':
            fullPath = "{}\{}".format(os.path.dirname(os.path.realpath(__file__))+'/tmp_ASR', filename).replace('\\', '/')

            args.infer_file = fullPath
            test_dataset = load_and_cache_examples(args, tokenizer, mode="infer")
            preds = evaluate(args, model, test_dataset)

            filename = filename.split('.txt')[0]
            text = filename + ':' + str(preds[0])
            print(text, file=wf)
            fileList.append(filename)
            resultData.append(str(preds[0]))
    wf.close()
    return fileList, resultData
Example #8
def main(args):
    GoogleSTT(args.in_wav_folder)

    model = load_model(device)
    tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator", do_lower_case=False)

    if os.path.exists(args.out_infer_file):
        os.remove(args.out_infer_file)

    out_infer_folder = os.path.dirname(args.out_infer_file)
    if not os.path.exists(out_infer_folder):
        os.makedirs(out_infer_folder)

    wf = open(args.out_infer_file, "a")

    files = os.listdir(os.path.dirname(os.path.realpath(__file__))+'/tmp_ASR')   
    for filename in tqdm(files):
        if os.path.splitext(filename)[1].lower() == '.txt':
            fullPath = "{}\{}".format('tmp_ASR', filename).replace('\\', '/')

            args.infer_file = fullPath
            test_dataset = load_and_cache_examples(args, tokenizer, mode="infer")
            preds = evaluate(args, model, test_dataset)

            filename = filename.split('.txt')[0]
            text = filename + ':' + str(preds[0])
            print(text, file=wf)
    wf.close()
Example #9
    def __init__(self, root_path='../ai/chatbot'):
        checkpoint_path = f"{root_path}/checkpoint"
        self.model_path = f"{checkpoint_path}/koelectra-wellness-text-classification.pth"
        model_name_or_path = "monologg/koelectra-base-discriminator"

        checkpoint = torch.load(self.model_path, map_location=device)
        electra_config = ElectraConfig.from_pretrained(model_name_or_path)
        self.model = koElectraForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path, config=electra_config, num_labels=359)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model.to(device)
        self.model.eval()

        self.tokenizer = ElectraTokenizer.from_pretrained(model_name_or_path)

        self.category = []
        idx = -1
        with open(root_path+'/data/wellness_data_for_text_classification.txt', 'r') as f:
            while True:
                line = f.readline()
                if not line:
                    break
                datas = line.strip().split("\t")
                if datas[1] != str(idx):
                    self.category.append(datas[2])
                idx += 1
Example #10
def main(args):
    """
    주어진 dataset tsv 파일과 같은 형태일 경우 inference 가능한 코드입니다.
  """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # load tokenizer
    #   TOK_NAME = "bert-base-multilingual-cased"
    #TOK_NAME = args.pretrained_model
    #tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)
    MODEL_NAME = 'google/electra-base-discriminator'
    tokenizer = ElectraTokenizer.from_pretrained(MODEL_NAME)

    # load my model
    model_module = getattr(import_module("transformers"),
                           args.model_type + "ForSequenceClassification")
    model = model_module.from_pretrained(args.model_dir)
    model.to(device)

    # load test datset
    test_dataset_dir = "/opt/ml/input/data/test/test_with_pororo.txt"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
    test_dataset = RE_Dataset(test_dataset, test_label)

    # predict answer
    pred_answer = inference(model, test_dataset, device)
    # make csv file with predicted answer
    # Please keep the output directory and column format below.

    output = pd.DataFrame(pred_answer, columns=['pred'])
    output.to_csv(args.out_path, index=False)
Example #11
def get_model_and_tokenizer(model_name, device):
    save_ckpt_path = CHECK_POINT[model_name]

    if model_name == "koelectra":
        model_name_or_path = "monologg/koelectra-base-discriminator"

        tokenizer = ElectraTokenizer.from_pretrained(model_name_or_path)
        electra_config = ElectraConfig.from_pretrained(model_name_or_path)
        model = koElectraForSequenceClassification.from_pretrained(
            pretrained_model_name_or_path=model_name_or_path,
            config=electra_config,
            num_labels=359)
    elif model_name == 'kobert':
        tokenizer = get_tokenizer()
        model = KoBERTforSequenceClassfication()
    else:
        raise ValueError(f"Unknown model_name: {model_name}")

    if os.path.isfile(save_ckpt_path):
        checkpoint = torch.load(save_ckpt_path, map_location=device)
        pre_epoch = checkpoint['epoch']
        # pre_loss = checkpoint['loss']
        model.load_state_dict(checkpoint['model_state_dict'])

        print(f"load pretrain from: {save_ckpt_path}, epoch={pre_epoch}")

    return model, tokenizer
Example #12
def prediction(text):
	tokenizer1 = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
	PATH = "/mnt/01D557900A25E360/Study/projects/ELECTRA_FINAL-20201210T023739Z-001/ELECTRA_FINAL/state_electra_final_model.pt"
	model1 = torch.load(PATH, map_location='cpu')
	line_tokenized = tokenizer1.batch_encode_plus(text, max_length=64, add_special_tokens=True, return_attention_mask=True, padding='max_length', truncation=True)
	input_ids1 = line_tokenized["input_ids"]
	attention_masks1 = line_tokenized["attention_mask"]
	prediction_inputs1 = torch.tensor(input_ids1)
	prediction_masks1 = torch.tensor(attention_masks1)
	prediction_data1 = TensorDataset(prediction_inputs1, prediction_masks1)
	prediction_sampler1 = SequentialSampler(prediction_data1)
	prediction_dataloader1 = DataLoader(prediction_data1, sampler=prediction_sampler1, batch_size=1)
	
	predictions = []
	
	for batch in prediction_dataloader1:
		batch = tuple(t.to('cpu') for t in batch)
		b_input_ids, b_input_mask = batch
		with torch.no_grad():
			outputs1 = model1(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
		logits1 = outputs1[0]
		logits1 = logits1.detach().cpu().numpy()
		predictions.append(logits1)

	# Flatten the per-batch logits and take the argmax once, after the loop.
	flat_predictions = [item for sublist in predictions for item in sublist]
	flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

	return flat_predictions
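A hypothetical call, assuming the checkpoint at `PATH` above exists; `prediction` takes a list of strings and returns an array of predicted class indices:

preds = prediction(["This movie was great!"])
print(preds)  # e.g. array([1])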
Example #13
def main(cli_args):
    # Update the parameters
    args = AttrDict(vars(cli_args))
    args.device = "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
    logger = logging.getLogger(__name__)

    # Set up the logger and seed
    init_logger()
    set_seed(args)

    # Load the model
    if args.do_score:
        tokenizer = ElectraTokenizer.from_pretrained(
            args.model_name_or_path if args.from_init_weight else os.path.join(
                args.output_dir, "checkpoint-{}".format(args.checkpoint)),
            do_lower_case=args.do_lower_case,
        )
        only_scoring(args, tokenizer)
    else:
        model, tokenizer = create_model(args)
        # Execute according to the running mode
        if args.do_cache:
            prepro(args, tokenizer)

        if args.do_train:
            train(args, model, tokenizer, logger)
        elif args.do_eval:
            evaluate(args, model, tokenizer)
Example #14
def predict_pair(model_args, data_args, training_args):
    # Set seed
    set_seed(training_args.seed)

    if 'roberta' in model_args.model_type:
        tokenizer = RobertaTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
        config = RobertaConfig.from_pretrained(model_args.model_name_or_path)
        config.num_labels = data_args.num_labels
        model = RobertaForSequenceClassification.from_pretrained(model_args.model_name_or_path, config=config)
    elif 'electra' in model_args.model_type:
        tokenizer = ElectraTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
        config = ElectraConfig.from_pretrained(model_args.model_name_or_path)
        config.num_labels = data_args.num_labels
        model = ElectraForSequenceClassification.from_pretrained(model_args.model_name_or_path, config=config)
    else:
        # default -> bert
        tokenizer = BertTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
        config = BertConfig.from_pretrained(model_args.model_name_or_path)
        config.num_labels = data_args.num_labels
        model = BertForSequenceClassification.from_pretrained(model_args.model_name_or_path, config=config)

    model.to(training_args.device)

    test_df = pickle.load(open(data_args.test_data_file, 'rb'))
    test_dataset = get_dataset(data_args, tokenizer, test_df, model_args.model_type)
    data_collator = MyDataCollator()
    if training_args.local_rank != -1:
        sampler = SequentialDistributedSampler(test_dataset)
        model = torch.nn.DataParallel(model)
    else:
        n_gpu = torch.cuda.device_count()
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)
        sampler = SequentialSampler(test_dataset)
    print(len(test_dataset))
    dataloader = DataLoader(
        test_dataset,
        sampler=sampler,
        batch_size=training_args.eval_batch_size,
        collate_fn=data_collator,
    )

    model.eval()
    all_probs = []
    for inputs in tqdm(dataloader):
        for k, v in inputs.items():
            inputs[k] = v.to(training_args.device)
        inputs.pop('labels')
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs[0]
            probs = torch.softmax(logits, dim=-1)
            maxp, maxi = torch.max(probs, dim=-1)
            result = [(_i.item(), _p.item()) for _p, _i in zip(maxp, maxi)]
            all_probs.extend(result)

    with open('./{}_{}.answer_classify.result'.format(data_args.data_type, model_args.model_type), 'w', encoding='utf-8') as fout:
        for i in range(len(test_df)):
            fout.write('{} | {} | {} | {} | {}\n'.format(test_df[i][0], test_df[i][1], test_df[i][2], all_probs[i][0], all_probs[i][1]))
Example #15
def punctuate_electra(input_text,
                      download_dir,
                      model_type="ELECTRA",
                      format="inline"):
    """Punctuate the input text with the ELECTRA model. Capitalize sentence beginnings."""
    get_model(model_type, download_dir)

    model_path = f"{download_dir}/{model_type}"
    config = AutoConfig.from_pretrained(model_path)
    tokenizer = ElectraTokenizer.from_pretrained(model_path)
    tokenizer.add_tokens(["<NUM>"])
    pytorch_model = ElectraForTokenClassification.from_pretrained(model_path)
    pytorch_model.resize_token_embeddings(len(tokenizer))

    punctuation_dict = {
        "COMMA": ",",
        "PERIOD": ".",
        "QUESTIONMARK": "?",
        "EXCLAMATIONMARK": "!",
        "COLON": ":",
        "SEMICOLON": ";",
        "DASH": "-",
    }
    eos_punct = [".", "?", "!"]

    labels = config.id2label

    # Read the input and clean of non-printable characters
    input_list = read_input(input_text, format).split()

    # split up long lines to not exceed the training sequence length

    n = 60
    text_to_punctuate = []
    if len(input_list) > n:
        line_part = [
            " ".join(input_list[x:x + n])
            for x in range(0, len(input_list), n)
        ]
        text_to_punctuate.extend(line_part)
    elif len(input_list) == 0:
        pass
    else:
        text_to_punctuate.append(" ".join(input_list))

    punctuated_text = []
    for t in text_to_punctuate:
        input_ids = tokenizer(t, return_tensors="pt")["input_ids"]
        tokens = tokenizer.tokenize(t)
        predictions = pytorch_model(input_ids)
        pred_ids = np.argmax(
            predictions[0].detach().numpy(),
            axis=2)[0]  # Take the first matrix, since only have batch size 1
        predictions = [labels[pred_ids[i]] for i in range(1, len(pred_ids))]
        line_punctuated = iterate(tokens, predictions, eos_punct,
                                  punctuation_dict)
        punctuated_text.append(line_punctuated)

    return upcase_first_letter(" ".join(punctuated_text))
Example #16
 def __init__(self, args, device):
     self.args = args
     self.tokenizer = ElectraTokenizer.from_pretrained(args.model_evaluator)
     self.model = ElectraModel.from_pretrained(
         args.model_evaluator).eval().to(device)
     self.alpha = args.alpha
     self.device = device
     self.cos = nn.CosineSimilarity(dim=-1, eps=1e-6)
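For reference, a quick standalone illustration of the cosine-similarity module configured above (the embedding size 256 is arbitrary):

import torch
import torch.nn as nn

cos = nn.CosineSimilarity(dim=-1, eps=1e-6)
a, b = torch.randn(1, 256), torch.randn(1, 256)
print(cos(a, b))  # one similarity score in [-1, 1] per row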
Example #17
 def _test_ElectraForPreTraining(self, size, large=False):
     from transformers import ElectraTokenizer, TFElectraForPreTraining
     tokenizer = ElectraTokenizer.from_pretrained(size)
     model = TFElectraForPreTraining.from_pretrained(size)
     input_dict = tokenizer("Hello, my dog is cute", return_tensors="tf")
     spec, input_dict = self.spec_and_pad(input_dict)
     outputs = ["logits"]
     self.run_test(model, input_dict, input_signature=spec, outputs=outputs, large=large)
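`spec_and_pad` and `run_test` are helpers of the test class; a minimal standalone sketch of the same forward pass, assuming the public small discriminator checkpoint for `size` and a TensorFlow install:

from transformers import ElectraTokenizer, TFElectraForPreTraining

tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")
model = TFElectraForPreTraining.from_pretrained("google/electra-small-discriminator")
inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
logits = model(**inputs).logits  # one replaced-token score per input token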
Example #18
 def _test_ElectraForMaskedLM(self, size, large=False):
     from transformers import ElectraTokenizer, TFElectraForMaskedLM
     tokenizer = ElectraTokenizer.from_pretrained(size)
     model = TFElectraForMaskedLM.from_pretrained(size)
     input_dict = tokenizer("The capital of France is [MASK].", return_tensors="tf")
     input_dict["labels"] = tokenizer("The capital of France is Paris.", return_tensors="tf")["input_ids"]
     spec, input_dict = self.spec_and_pad(input_dict)
     outputs = ["logits"]
     self.run_test(model, input_dict, input_signature=spec, outputs=outputs, large=large)
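The same check outside the harness, as a sketch; the small generator checkpoint is assumed here, since only the ELECTRA generator carries an LM head:

from transformers import ElectraTokenizer, TFElectraForMaskedLM

tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-generator")
model = TFElectraForMaskedLM.from_pretrained("google/electra-small-generator")
inputs = tokenizer("The capital of France is [MASK].", return_tensors="tf")
inputs["labels"] = tokenizer("The capital of France is Paris.", return_tensors="tf")["input_ids"]
outputs = model(**inputs)  # outputs.loss and outputs.logits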
Example #19
def download_model(outputdir_tokenizer: str, outputdir_pretrained: str):
    slow_tokenizer = ElectraTokenizer.from_pretrained("bert-base-uncased")
    print("Save tokenizer to ", outputdir_tokenizer)
    slow_tokenizer.save_pretrained(outputdir_tokenizer)

    model = ElectraForQuestionAnswering.from_pretrained(
        "google/electra-base-discriminator")
    model.save_pretrained(outputdir_pretrained)
    print("Save model electra pretrained to", outputdir_pretrained)
Example #20
 def _test_ElectraForQuestionAnswering(self, size, large=False):
     from transformers import ElectraTokenizer, TFElectraForQuestionAnswering
     tokenizer = ElectraTokenizer.from_pretrained(size)
     model = TFElectraForQuestionAnswering.from_pretrained(size)
     question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
     input_dict = tokenizer(question, text, return_tensors='tf')
     spec, input_dict = self.spec_and_pad(input_dict, max_length=128)
     outputs = ["start_logits", "end_logits"]
     self.run_test(model, input_dict, input_signature=spec, outputs=outputs, large=large)
Example #21
 def _test_ElectraForSequenceClassification(self, size, large=False):
     from transformers import ElectraTokenizer, TFElectraForSequenceClassification
     tokenizer = ElectraTokenizer.from_pretrained(size)
     model = TFElectraForSequenceClassification.from_pretrained(size)
     input_dict = tokenizer("Hello, my dog is cute", return_tensors="tf")
     input_dict["labels"] = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
     spec, input_dict = self.spec_and_pad(input_dict)
     outputs = ["logits"]
     self.run_test(model, input_dict, input_signature=spec, outputs=outputs, large=large)
Example #22
 def _test_ElectraForTokenClassification(self, size, large=False):
     from transformers import ElectraTokenizer, TFElectraForTokenClassification
     tokenizer = ElectraTokenizer.from_pretrained(size)
     model = TFElectraForTokenClassification.from_pretrained(size)
     input_dict = tokenizer("Hello, my dog is cute", return_tensors="tf")
     # input_ids = input_dict["input_ids"]
     # input_dict["labels"] = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids)))
     spec, input_dict = self.spec_and_pad(input_dict, max_length=128)
     outputs = ["logits"]
     self.run_test(model, input_dict, input_signature=spec, outputs=outputs, large=large)
Example #23
def main():
    start = time.time()

    cuda = torch.device('cuda:0')
    cpu = torch.device('cpu')
    device = cuda if torch.cuda.is_available() else cpu

    electra = 'monologg/koelectra-small-v2-discriminator'

    # Load the KoELECTRA model (v2)
    model = ElectraModel.from_pretrained(electra).to(device)
    tokenizer = ElectraTokenizer.from_pretrained(electra)

    # Load the document data file
    with open(corpus_dir + 'doc_summary4.json', 'r', encoding='utf-8') as f:
        j_doc = json.load(f)

    train_sets = j_doc

    max_available_length = 0

    for doc_id in tqdm(train_sets):
        sents = j_doc[doc_id]['sents']
        encoded_sents = []
        max_in_sent = 0
        for sent in sents:
            a = tokenizer.encode(sent)
            if len(a) > max_in_sent:
                max_in_sent = len(a)

            encoded_sents.append(a)

        for i, e in enumerate(encoded_sents):
            encoded_sents[i] += [0] * (max_in_sent - len(e))

        split_unit = 8
        results = []
        for i in range(0, len(encoded_sents), split_unit):
            j = min(i + split_unit, len(encoded_sents))
            x_inputs = torch.LongTensor(encoded_sents[i:j]).to(device)
            y = model(x_inputs)
            y_np = y[0].cpu().detach()
            results.append(y_np)

        y_np = torch.cat(results).numpy()

        np.save(f'summary_embed2/{doc_id}.embed', y_np)

    # Execution-time test (e.g. on CPU)
    end = time.time()
    elapsed = end - start
    print(elapsed)

    print('Ok.')
Example #24
 def __init__(self, root, mode):
     self.data = pd.read_csv(os.path.join(root, '{}.csv'.format(mode)))
     self.mode = mode
     self.tokenizer = ElectraTokenizer.from_pretrained(
         'google/electra-base-discriminator')
     self.data['text'] = self.data['text'].apply(self.preprocess)
     if self.mode == 'train':
         self.data = self.data[self.data['text'] != '']
         self.data = self.data[['text', 'target']]
         self.labels = self.data.target.values
     self.text = self.data.text.values
Example #25
 def test_transformers_embedding_1(self):
     from transformers import ElectraModel, ElectraTokenizer
     weight_path = "google/electra-small-generator"
     model = ElectraModel.from_pretrained(weight_path)
     tokenizer = ElectraTokenizer.from_pretrained(weight_path)
     encoder = TransformersWordPieceEncoder(model, tokenizer)
     ds = DataSet({'words': ["this is a test . [SEP]".split()]})
     encoder.index_datasets(ds, field_name='words')
     self.assertTrue(ds.has_field('word_pieces'))
     result = encoder(torch.LongTensor([[1,2,3,4]]))
     self.assertEqual(result.size(), (1, 4, model.config.hidden_size))
Example #26
    def construct_encoder(self):
        model = ElectraModel.from_pretrained('google/' + self.model_name, output_hidden_states=True, output_attentions=True)
        model = ElectraExtractor(model, location=None, heads=None)
        model.cuda()
        model = torch.nn.DataParallel(model)
        model.eval()

        print('tokenizer', self.model_name)
        tokenizer = ElectraTokenizer.from_pretrained('google/' + self.model_name, do_lower_case=True)
        print("Model and tokenzier are constructed!")
        return model, tokenizer
Example #27
    def test_transformers_embedding_1(self):
        from transformers import ElectraModel, ElectraTokenizer
        weight_path = "google/electra-small-generator"
        vocab = Vocabulary().add_word_lst("this is a test . [SEP] NotInRoberta".split())
        model = ElectraModel.from_pretrained(weight_path)
        tokenizer = ElectraTokenizer.from_pretrained(weight_path)

        embed = TransformersEmbedding(vocab, model, tokenizer, word_dropout=0.1)

        words = torch.LongTensor([[2, 3, 4, 1]])
        result = embed(words)
        self.assertEqual(result.size(), (1, 4, model.config.hidden_size))
Example #28
 def test_masked_by_flag(self):
     batch_paths = [
         '/client/user1/cuongdev/GenImputation/data/test/electra_G1K_22_hs37d5/corpus_dir/G1K_22_hs37d5_biallelic_test.r0000.b0000.page.gz'
     ]
     tokenizer = ElectraTokenizer(
         vocab_file=
         '/client/user1/cuongdev/GenImputation/data/train/electra_G1K_22_hs37d5/data_dir/vocab.txt'
     )
     test_dataset = GenNLPMaskedDataset(batch_paths,
                                        tokenizer,
                                        masked_by_flag=True,
                                        only_input=True)
Example #29
def main():

    # Config
    config = TrainConfig()
    # Fixing Seed
    pl.seed_everything(config.seed)
    # Logger
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    handler = logging.StreamHandler(sys.stdout)
    formatter = logging.Formatter("[%(asctime)s] %(message)s")
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    # Data Loading...
    raw_train_instances = load_data(config.train_file_path)
    raw_dev_instances = load_data(config.dev_file_path)
    logger.info(
        f"Number of training examples: {len(raw_train_instances)}\t"
        f"Number of validation examples: {len(raw_dev_instances)}"
    )

    tokenizer = ElectraTokenizer.from_pretrained(config.pretrained_model_name,
                                                 do_lower_case=False)

    train_dataset = AbusingDataset(raw_train_instances, tokenizer)
    valid_dataset = AbusingDataset(raw_dev_instances, tokenizer)

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=config.num_workers,
        collate_fn=AbusingDataset.collate_fn,
    )
    val_dataloader = DataLoader(
        valid_dataset,
        batch_size=config.batch_size,
        num_workers=config.num_workers,
        collate_fn=AbusingDataset.collate_fn,
    )

    # Lightning
    lightning_module = AbusingClassifier(config, logger)
    trainer = pl.Trainer(
        gpus=config.gpus,
        max_epochs=config.num_epochs,
        deterministic=True,
        weights_save_path=config.save_model_file_prefix,
        gradient_clip_val=1.0,
    )
    trainer.fit(lightning_module, train_dataloader, val_dataloader)
Example #30
    def __init__(self, configs: dict):
        
        # Load the configuration dictionary
        self.configs = configs
        
        # Select the model and its path
        if self.configs["model_select"] == "fasttext":
            self.model_path = self.configs["model_path"]["fasttext"]
        elif self.configs["model_select"] == "lstm_base":
            self.model_path = self.configs["model_path"]["lstm_base"]
        elif self.configs["model_select"] == "lstm_pack":
            self.model_path = self.configs["model_path"]["lstm_pack"]
        elif self.configs["model_select"] == "textcnn":
            self.model_path = self.configs["model_path"]["textcnn"]
        elif self.configs["model_select"] == "bert":
            self.model_path = self.configs["model_path"]["bert"]
        elif self.configs["model_select"] == "electra":
            self.model_path = self.configs["model_path"]["electra"]
        elif self.configs["model_select"] == "xlnet":
            self.model_path = self.configs["model_path"]["xlnet"]

        # Set the model name
        self.model_name = self.configs["model_select"]

        # Set the data file for evaluation / inference
        self.eval_data_file = self.configs["eval_data_file"]

        # Set the label-mapping file; predicted labels are converted back to their real labels
        self.label2index_json_path = self.configs["eval_label_transfer_file"]

        # Set the token-to-index vocabulary, used by the tokenizer of the custom models
        self.token2index_json_path = self.configs["eval_token_transfer_file"]

        # Set the tokenizer
        if self.model_name in ["fasttext", "lstm_base", "lstm_pack", "textcnn"]:
            self.tokenizer = SequenceTokenizer(load_json(self.token2index_json_path))

        elif self.model_name == "bert":
            self.tokenizer = BertTokenizer.from_pretrained(self.configs["pretrained_model_path"]["bert"])
        elif self.model_name == "electra":
            self.tokenizer = ElectraTokenizer.from_pretrained(self.configs["pretrained_model_path"]["electra"])
        elif self.model_name == "xlnet":
            self.tokenizer = XLNetTokenizer.from_pretrained(self.configs["pretrained_model_path"]["xlnet"])

        # Set the label converter
        self.label_tokenizer = ClassificationLabelTokenizer(load_json(self.label2index_json_path))

        # Load the model
        self.model = torch.load(self.model_path)