Example #1
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    enc = GPT2Tokenizer.from_pretrained('gpt2-medium')
    model = GPT2LMHeadModel.from_pretrained(
        'gpt2-medium')  # 'gpt2' is another option
    model.eval()
    model.to(device)

    # context is the universal trigger
    trigger = "TH PEOPLEMan goddreams Blacks"
    trigger_tokens = enc.encode(trigger)

    # get samples from the model
    for _ in range(100):
        out = sample_sequence(model=model,
                              length=200,
                              context=trigger_tokens,
                              temperature=1.5,
                              top_k=5)
        out = out[:, len(trigger_tokens):].tolist()
        for i in range(1):
            text = enc.decode(out[i])
            if text.find('<|endoftext|>') > 0:
                text = text[0:text.find('<|endoftext|>')]
            print("Prompt: " + trigger)
            print("Output: " + text)
            print("=" * 80)
Example #2
def predict_next_word(text):

    import time
    t1 = time.time()
    indexed_tokens = tokenizer.encode(text)

    # Convert indexed tokens to a PyTorch tensor
    tokens_tensor = torch.tensor([indexed_tokens])

    # Load pre-trained model (weights)
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    # Set the model in evaluation mode to deactivate the DropOut modules
    model.eval()

    # If you have a GPU, put everything on cuda
    #tokens_tensor = tokens_tensor.to('cuda')
    #model.to('cuda')

    # Predict all tokens
    with torch.no_grad():
        outputs = model(tokens_tensor)
        predictions = outputs[0]

    # Get the predicted next sub-word
    predicted_index = torch.argmax(predictions[0, -1, :]).item()
    predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])

    # Print the predicted word
    print(predicted_text)
    t2 = time.time()

    print("Time taken : ", t2-t1)
Example #3
def generate_samples(args):
    """Use a pre-trained GPT-2 model to generate a set of samples from scratch."""
    # Set seed
    set_random_seeds(args.random_seed)

    # Initialize training
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print('Device: {}'.format(str(device)))

    # Load pre-trained network weights
    print('Loading pre-trained model...')
    config = GPT2Config.from_pretrained(args.gpt2_version)
    model = GPT2LMHeadModel(config)
    model.load_state_dict(torch.load(args.model_load_path))
    model = model.to(device)
    model.eval()

    # Create tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(args.gpt2_version)

    # Generate some samples
    print('Generating...')
    generated = generate_sequence(model,
                                  tokenizer,
                                  context=args.context,
                                  max_length=args.max_gen_len,
                                  num_samples=args.num_samples,
                                  top_k=args.sampling_top_k,
                                  device=device)
    print('Generated samples:')
    print(*generated, sep="\n---\n")
Example #4
    def __init__(self, lookup, input_size, top_k, top_p, device):
        """ 
            Creates a Decoder with attention and Pointer network see https://nlp.stanford.edu/pubs/see2017get.pdf 
        """        
        super().__init__()
        
        self.device = device
        
        self.gpt2lmheadmodel = GPT2LMHeadModel.from_pretrained('gpt2')
        self.gpt2lmheadmodel.resize_token_embeddings(len(lookup))
        for param in self.gpt2lmheadmodel.parameters():
            param.requires_grad = False
            
        self.lookup = lookup
        self.emb_dim = 768
        self.hidden_dim = 768
        self.vocab_size = len(lookup)
        self.encoder_size = input_size
        self.top_k = top_k
        self.top_p = top_p
                
        self.output_linear = nn.Linear(self.hidden_dim, self.vocab_size)
        self.attention = Attention(encoder_size=input_size, decoder_size=self.hidden_dim, vocab_size=self.vocab_size, device=device)

        # overwrite output to allow context from the attention to be added to the output layer
        self.output_linear = nn.Linear(self.hidden_dim+self.encoder_size+self.emb_dim, int((self.hidden_dim+self.encoder_size+self.emb_dim)/2))
        self.vocab_linear = nn.Linear(int((self.hidden_dim+self.encoder_size+self.emb_dim)/2), self.vocab_size)

        self.p_gen_linear = nn.Linear(self.encoder_size + self.hidden_dim*2 + self.emb_dim, 1)
        
        self.to(device)
Example #5
 def load(self):
     try:
         self._tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
         self._model = GPT2LMHeadModel.from_pretrained("gpt2-medium")
     except Exception:
         self._tokenizer = None
         self._model = None
     return self
Example #6
    def __init__(self, model_path='gpt2', top_k=None, top_p=None, device=None):
        super().__init__(device, top_k=top_k, top_p=top_p)
        self.model_path = model_path

        self.tokenizer = GPT2Tokenizer.from_pretrained(model_path)
        self.model = GPT2LMHeadModel.from_pretrained(model_path)
        self.model.to(device)
        self.model.eval()
Example #7
 def __init__(self):
     self.device = torch.device(
         "cuda" if torch.cuda.is_available() else "cpu")
     # TODO maybe smaller gpt2 model separately
     self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
     self.model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
     self.model.to(self.device)
     self.model.eval()
Example #8
 def load_model(self,path='model/mini/'):
     device = "cuda" if torch.cuda.is_available() else "cpu"
     device = "cpu"
     tokenizer = tokenization_bert.BertTokenizer(vocab_file=path+'vocab.txt')
     model = GPT2LMHeadModel.from_pretrained(path)
     model.to(device)
     model.eval()
     return model, tokenizer
Example #9
    def __init__(self, **kwargs):
        self.beam_width = kwargs['beam_width']
        self.beam_depth = kwargs['beam_depth']
        self.timeout = kwargs['timeout']
        random.seed(kwargs['seed'])

        self.model = GPT2LMHeadModel.from_pretrained('gpt2')
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
Example #10
def initialize_training(args, device):
    """Initalize the tokenizer, the data loaders, the model and the tools of the optimization process."""
    # Create tokenizer, datasets and loaders
    tokenizer = EpisodeSummaryTokenizer.from_pretrained(
        args.gpt2_version,
        max_num_words=args.max_num_words,
        size_variance_handling=args.size_var_handling)
    train_dataset, val_dataset = create_datasets_from_jsons(
        args.json_paths, tokenizer, args.val_split)

    dataloaders = {
        'train':
        DataLoader(train_dataset,
                   shuffle=True,
                   batch_size=args.batch_size,
                   collate_fn=tokenizer.pad_batch_to_same_size),
        'val':
        DataLoader(val_dataset,
                   shuffle=False,
                   batch_size=args.batch_size,
                   collate_fn=tokenizer.pad_batch_to_same_size)
    }

    # Load pre-trained network weights
    model = GPT2LMHeadModel.from_pretrained(args.gpt2_version)
    model = model.to(device)

    # Prepare optimizer and scheduler
    no_decay = ['bias',
                'LayerNorm.weight']  # no decay for biases and layer norm
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=0,
                                     t_total=args.max_steps)
    model.zero_grad()

    train_state = make_train_state(
        save_path=args.model_save_path,
        early_stopping_patience=args.early_stopping_patience)

    return tokenizer, dataloaders, model, optimizer, scheduler, train_state
Example #11
    def __init__(self, model_path='gpt2', device='cuda'):
        super().__init__()
        self.model_path = model_path
        self.device = device

        self.tokenizer = GPT2Tokenizer.from_pretrained(model_path)
        self.model = GPT2LMHeadModel.from_pretrained(model_path)
        self.model.to(device)
        self.model.eval()
Example #12
    def __init__(self):
        super(GPT2, self).__init__()

        self.model_type = "GPT2"

        # Load pre-trained model tokenizer (vocabulary)
        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

        # Load pre-trained model (weights)
        self.model = GPT2LMHeadModel.from_pretrained("gpt2")
Example #13
    def gpt2(self, prep_obj):
        self.vector_corpus = []

        model = GPT2LMHeadModel.from_pretrained('gpt2')
        token_maker = GPT2Tokenizer.from_pretrained('gpt2')
        for tweet in prep_obj.detokenized_corpus:
            text_index = token_maker.encode(tweet)
            vector = (model.transformer.wte.weight[text_index, :])
            vector = vector.detach().numpy()
            vector = np.sum(vector, axis=0)
            self.vector_corpus.append(vector)
Example #14
 def __init__(self, model_name: str) -> None:
     super().__init__()
     config = GPT2Config.from_pretrained(model_name)
     self.input_dim = config.hidden_size
     self.output_dim = config.vocab_size
     # TODO(mattg): It's possible that we could use some kind of cache like we have in
     # allennlp.modules.token_embedders.bert_token_embedder.PretrainedBertModel.  That way, we
     # would only load the GPT2 weights once.  Though, it's not clear how to do that here, as we
     # need to load `GPT2LMHeadModel`, not just `GPT2Model`...
     gpt2_model = GPT2LMHeadModel.from_pretrained(model_name)
     self.gpt2_lm_head = gpt2_model.lm_head
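Only the tied language-model head is kept here: a linear projection from hidden_size to vocab_size. If this module defined a forward pass, it would presumably (this is an assumption, not shown in the excerpt) just apply that projection to hidden states coming out of a GPT-2 encoder:

 def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
     # hidden_states: (batch, seq_len, hidden_size); returns (batch, seq_len, vocab_size) logits
     return self.gpt2_lm_head(hidden_states)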
Example #15
 def __init__(self, model_path, tokenizer_path):
     device = "cuda" if torch.cuda.is_available() else "cpu"
     model = GPT2LMHeadModel.from_pretrained(model_path)
     model.to(device)
     model.eval()
     tokenizer = tokenization_bert_word_level.BertTokenizer(
         vocab_file=tokenizer_path)
     vocab = Gpt2Vocab(tokenizer)
     self.device = device
     self.model = model
     self.vocab = vocab
     self.tokenizer = tokenizer
Example #16
    def __init__(self):
        if not os.path.exists(AGGREGATOR_DIR):
            os.makedirs(AGGREGATOR_DIR)
        if not os.path.isfile(AGGREGATOR_2015_2016):
            print("Downloading aggregators from s3...")
            wget.download(AGGREGATOR_2015_2016_URL,
                          AGGREGATOR_2015_2016,
                          bar=self._download_progress_bar)
        if not os.path.isfile(AGGREGATOR_2015_2017):
            print("Downloading aggregators from s3...")
            wget.download(AGGREGATOR_2015_2017_URL,
                          AGGREGATOR_2015_2017,
                          bar=self._download_progress_bar)
        if not os.path.isfile(AGGREGATOR_2015_2016_8_dim):
            print("Downloading aggregators from s3...")
            wget.download(AGGREGATOR_2015_2016_8_dim_URL,
                          AGGREGATOR_2015_2016_8_dim,
                          bar=self._download_progress_bar)
        if not os.path.isfile(AGGREGATOR_2015_2017_8_dim):
            print("Downloading aggregators from s3...")
            wget.download(AGGREGATOR_2015_2017_8_dim_URL,
                          AGGREGATOR_2015_2017_8_dim,
                          bar=self._download_progress_bar)
        if not os.path.isfile(ROBERTA_STS_PATH + '/checkpoint_best.pt'):
            print("Downloading ROBERTA STS model from s3...")
            wget.download(ROBERTA_STS_URL,
                          ROBERTA_STS_PATH + '/checkpoint_best.pt',
                          bar=self._download_progress_bar)
        if not os.path.isfile(ROBERTA_MNLI_PATH + '/model_mnli.pt'):
            print("Downloading ROBERTA MNLI model from s3...")
            wget.download(ROBERTA_MNLI_URL,
                          ROBERTA_MNLI_PATH + '/model_mnli.pt',
                          bar=self._download_progress_bar)

        self.roberta_STS = RobertaModel.from_pretrained(
            checkpoint_file='checkpoint_best.pt',
            model_name_or_path=ROBERTA_STS_PATH)
        self.roberta_STS.eval()

        self.roberta_MNLI = RobertaModel.from_pretrained(
            checkpoint_file='model_mnli.pt',
            model_name_or_path=ROBERTA_MNLI_PATH)
        self.roberta_MNLI.eval()
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        self.gpt_model = GPT2LMHeadModel.from_pretrained('gpt2')
        self.agg_one = load(AGGREGATOR_2015_2016)
        self.agg_two = load(AGGREGATOR_2015_2017)
        self.agg_one_8_dim = load(AGGREGATOR_2015_2016_8_dim)
        self.agg_two_8_dim = load(AGGREGATOR_2015_2017_8_dim)
Example #17
        def create_and_check_lm_head_model(self, config, input_ids, head_mask,
                                           token_type_ids, *args):
            model = GPT2LMHeadModel(config)
            model.eval()

            loss, lm_logits, _ = model(input_ids,
                                       token_type_ids=token_type_ids,
                                       labels=input_ids)

            result = {"loss": loss, "lm_logits": lm_logits}

            self.parent.assertListEqual(list(result["loss"].size()), [])
            self.parent.assertListEqual(
                list(result["lm_logits"].size()),
                [self.batch_size, self.seq_length, self.vocab_size])
Example #18
def gpt_predictor(n=3):
    if request.method == 'GET':
        return render_template('index.html', value='hi')

    if request.method == 'POST':
        tok = GPT2Tokenizer.from_pretrained("gpt2")
        model = GPT2LMHeadModel.from_pretrained("gpt2")
        text = request.form.get('text')
        n = request.form.get('n')
        for i in range(int(n)):
            pred = get_pred(text, model, tok)
            if pred == "<|endoftext|>":
                break
            else:
                text += pred
        return render_template('result.html', text=text)
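get_pred is not part of this excerpt. A greedy next-token version consistent with how it is called here (and in Example #25 below) might look like the sketch that follows; the implementation and the tuple-style model output are assumptions:

import torch

def get_pred(text, model, tok):
    # Encode the running text and pick the single most likely next token.
    input_ids = torch.tensor([tok.encode(text)])
    with torch.no_grad():
        logits = model(input_ids)[0]
    next_id = torch.argmax(logits[0, -1, :]).item()
    return tok.decode([next_id])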
Example #19
def main():
    LENGTH = -1
    BATCH_SIZE = 1
    NSAMPLES = 18
    TEMPERATURE = 0.5
    TOPK = 5

    device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = tokenization_bert.BertTokenizer(vocab_file='cache/vocab.txt')
    model_config = pytorch_transformers.GPT2Config.from_json_file(
        'model_config.json')
    model = GPT2LMHeadModel.from_pretrained('model/final_model',
                                            config=model_config)
    model.to(device)
    model.eval()

    if LENGTH == -1:
        LENGTH = model.config.n_ctx // 2
    elif LENGTH > model.config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" %
                         model.config.n_ctx)

    while True:
        raw_text = input("Model prompt >>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input("Model prompt >>> ")
        context_tokens = tokenizer.convert_tokens_to_ids(
            tokenizer.tokenize(raw_text))
        generated = 0
        for _ in range(NSAMPLES // BATCH_SIZE):
            out = sample_sequence(model=model,
                                  length=LENGTH,
                                  context=context_tokens,
                                  start_token=None,
                                  batch_size=BATCH_SIZE,
                                  temperature=TEMPERATURE,
                                  top_k=TOPK,
                                  device=device)
            out = out[:, len(context_tokens):].tolist()
            for i in range(BATCH_SIZE):
                generated += 1
                text = tokenizer.convert_ids_to_tokens(out[i])
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                print(text)
        print("=" * 80)
Example #20
def get_model(seed=1234, model_name='gpt2'):
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    enc = GPT2Tokenizer.from_pretrained(model_name)
    enc.unk_token = None
    enc.bos_token = None
    enc.eos_token = None

    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.to(device)
    model.eval()
    #model.double()

    return enc, model
Example #21
def predict_next_word(phrase):
    """
    Function to process the phrase using GPT-2
    :param phrase:
    :return:
    """
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    # Tokenize the input phrase
    tokenized_phrase = tokenizer.encode(phrase)
    print("Tokenized Phrase: {}".format(tokenized_phrase))

    # Convert tokenized phrase to pytorch tensor
    tokenized_phrase_tensor = torch.tensor([tokenized_phrase])
    print("Tokenized Phrase Tensor: {}".format(tokenized_phrase_tensor))

    # Load the pre-trained model (weights and biases)
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    # Set the model in evaluation mode to deactivate the dropout modules
    model.eval()

    try:
        tokenized_phrase_tensor = tokenized_phrase_tensor.to('cuda')
        model.to('cuda')
        print("CUDA present. Running code on GPU")
    except AssertionError:
        print("Torch not compiled with CUDA. Running on CPU.")
    except Exception:
        print("CUDA not present. Running on CPU")

    # Predict all tokens
    with torch.no_grad():
        outputs = model(tokenized_phrase_tensor)
        print("Outputs: {}".format(outputs))

        predictions = outputs[0]
        print("Prediction: {}".format(predictions))

    # Get the predicted next sub-word
    predicted_index = torch.argmax(predictions[0, -1, :]).item()
    predicted_text = tokenizer.decode(tokenized_phrase + [predicted_index])

    return predicted_text
Example #22
def main():
    length = -1
    batch_size = 1
    nsamples = 18
    temperature = 1
    topk = 5

    device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = tokenization_bert.BertTokenizer(
        vocab_file='cache/vocab_small.txt')
    model_config = pytorch_transformers.GPT2Config.from_json_file(
        'config/model_config_small.json')
    model = GPT2LMHeadModel.from_pretrained('model/final_model',
                                            config=model_config)
    model.to(device)
    model.eval()

    if length == -1:
        length = model.config.n_ctx // 2
    elif length > model.config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" %
                         model.config.n_ctx)

    while True:
        raw_text = '萧炎'
        context_tokens = tokenizer.convert_tokens_to_ids(
            tokenizer.tokenize(raw_text))
        generated = 0
        for _ in range(nsamples // batch_size):
            out = sample_sequence(model=model,
                                  length=length,
                                  context=context_tokens,
                                  start_token=None,
                                  batch_size=batch_size,
                                  temperature=temperature,
                                  top_k=topk,
                                  device=device)
            out = out[:, len(context_tokens):].tolist()
            for i in range(batch_size):
                generated += 1
                text = tokenizer.convert_ids_to_tokens(out[i])
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                print(''.join(text))
        print("=" * 80)
Example #23
def load_model(args):
    """
    Load model and the corresponding tokenizer from pre-trained weight.
    :param args: The command line arguments.
    :return model: The main model.
    :return tokenizer: The tokenizer that comes with the main model.
    """
    USE_CUDA = torch.cuda.is_available()
    # ====== Load GPT2 model ========
    model_dir = '../models/' + args.model_dir
    model = GPT2LMHeadModel.from_pretrained(model_dir)
    # model = GPT2LMHeadModel.from_pretrained('gpt2')
    if USE_CUDA:
        model.cuda()
    tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
    # tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    print('Model loaded.')
    return model, tokenizer
Example #24
def evaluate_ppl_gpt(args):
    """
    Evaluate on raw text, use this with GPT which has its own tokenizer
    """
    if args.expanded_dataset:
        path = ".data/stories/story_commonsense/torchtext_expanded"
    else:
        path = ".data/stories/story_commonsense/torchtext"
    # Data
    test_src = [line.rstrip('\n') for line in open(path + "/test.src")]
    test_trg = [line.rstrip('\n') for line in open(path + "/test.trg")]

    # Model
    enc = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.to(device)
    model.eval()
    loss = 0
    batch_size = 1

    print("Evaluating test set with GPT2")
    for i in trange(len(test_src)):
        src, trg = test_src[i], test_trg[i]
        context = enc.encode(src)
        target = enc.encode(trg)
        length = len(target)

        # Generate prediction
        out = utils.sample_sequence(model,
                                    length,
                                    batch_size=1,
                                    context=context,
                                    top_k=10,
                                    device=device)
        out = out[:, len(context):]

        # Get model loss
        target = torch.tensor([target]).to(device)
        with torch.no_grad():
            #pred, past  = model(out)
            l = model(out, labels=target)[0]  # first element of the output tuple is the LM loss
            loss += float(l)
    av_loss = loss / len(test_src)
    print(f"ppl: {math.exp(av_loss):.04f}")
Example #25
def gpt_predictor(request, n=3):
    tok = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")

    if request.method == 'GET':
        return "Welcome to GPT predictor"

    if request.method == 'POST':
        data = request.get_json()
        text = data["text"]
        res = []
        n = data["n"]
        for i in range(n):
            pred = get_pred(text, model, tok)
            if pred == "<|endoftext|>":
                break
            else:
                text += pred
        return text
Example #26
def get_textgen(sentence: str) -> str:
    """
    Runs text_generation GPT2 model and returns generated text.
    :param sentence: sentence taken from serializer.data.
    :return: Generated text.
    """
    output_dir = './models/text_gen'
    tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
    model = GPT2LMHeadModel.from_pretrained(output_dir)
    tokens = tokenizer.encode(sentence)
    tokens_tensor = torch.tensor([tokens])
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokens_tensor = tokens_tensor.to(device)
    model.to(device)
    with torch.no_grad():
        outputs = model(tokens_tensor)
        predictions = outputs[0]
    predicted_index = torch.argmax(predictions[0, -1, :]).item()
    predicted_text = tokenizer.decode(tokens + [predicted_index])
    return predicted_text
Example #27
def zero_shot_gpt2(args):
    print('Get model')
    config = GPT2Config.from_pretrained('gpt2')
    # load the pretrained weights; a randomly initialized model cannot be evaluated zero-shot
    model = GPT2LMHeadModel.from_pretrained('gpt2', config=config)
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    print("Evaluating Maslow test set with GPT2")
    path = ".data/stories/story_commonsense/torchtext_class/maslow/"
    src_query = " That made them"
    trg_query = " feel"  # need to split due to offset in loss
    ma_t_pred, ma_t_true = \
        evaluate_zero_shot(args, model, tokenizer, path, src_query, trg_query)

    # Maslow results
    t_acc = accuracy_score(ma_t_true, ma_t_pred)
    t_f1 = f1_score(ma_t_true, ma_t_pred, average='macro')
    t_p = precision_score(ma_t_true, ma_t_pred, average='macro')
    t_r = recall_score(ma_t_true, ma_t_pred, average='macro')
    print('Maslow')
    print(
        f'\t Test | acc: {t_acc:7.4f} | f1: {t_f1:7.4f} | prec: {t_p:7.4f} | rec: {t_r:7.4f}'
    )

    print("Evaluating Reiss test set with GPT2")
    path = ".data/stories/story_commonsense/torchtext_class/reiss/"
    src_query = " They did this to"
    trg_query = " to"  # need to split due to offset in loss
    re_t_true, re_t_pred = \
        evaluate_zero_shot(args, model, tokenizer, path, src_query, trg_query)

    # Reiss results
    t_acc = accuracy_score(re_t_true, re_t_pred)
    t_f1 = f1_score(re_t_true, re_t_pred, average='macro')
    t_p = precision_score(re_t_true, re_t_pred, average='macro')
    t_r = recall_score(re_t_true, re_t_pred, average='macro')
    print('Reiss')
    print(
        f'\t Test | acc: {t_acc:7.4f} | f1: {t_f1:7.4f} | prec: {t_p:7.4f} | rec: {t_r:7.4f}'
    )
Example #28
def main():
	parser = argparse.ArgumentParser()
	parser.add_argument("--lr",default=5e-5, type=float, required=True, help="learning rate")
	parser.add_argument("--seed",default=42, type=int, required=False, help="seed to replicate results")
	parser.add_argument("--n_gpu",default=1, type=int, required=False, help="no of gpu available")
	parser.add_argument("--gradient_accumulation_steps",default=32, type=int, required=True, help="gradient_accumulation_steps")
	parser.add_argument("--batch_size",default=1, type=int, required=True, help="batch_size")
	parser.add_argument("--num_workers",default=4, type=int, required=False, help="num of cpus available")
	parser.add_argument("--device",default=torch.device('cuda'), required=False, help="torch.device object")
	parser.add_argument("--num_train_epochs",default=5, type=int, required=True, help="no of epochs of training")
	parser.add_argument("--output_dir",default=./output, type=str, required=True, help="path to save evaluation results")
	parser.add_argument("--model_dir",default=./weights, type=str, required=True, help="path to save trained model")
	parser.add_argument("--fp16",default=True, type=bool, required=False, help="whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
	parser.add_argument("--fp16_opt_level",default='O0', type=str, required=False, help="apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3'].")
	parser.add_argument("--max_grad_norm",default=1.0, type=float, help="max gradient norm.")
	parser.add_argument("--root_dir",default='./CNN/gpt2_1024_data', type=str, help="location of json dataset.")
	parser.add_argument("--ids_file",default='./CNN/ids.json', type=str, help="location of train, valid and test file indexes")
	args = parser.parse_args()

	train_data = GPT21024Dataset(args.root_dir,args.ids_file,mode='train',length=3000) #training on only 3000 datasets
	valid_data = GPT21024Dataset(args.root_dir,args.ids_file,mode='valid',length=500)  #validation on only 500 datasets
	tokenizer = add_special_tokens()
	ignore_idx = tokenizer.pad_token_id
	model = GPT2LMHeadModel.from_pretrained('gpt2')
	model.resize_token_embeddings(len(tokenizer))
	model.to(args.device)

	start = time.time()
	train(args, model, tokenizer, train_data, valid_data, ignore_idx)
	print('total time: ', (time.time()-start)/60, " minutes", end='\n\n')

	print('Saving trained model...')
	model_file = os.path.join(args.model_dir, 'model_{}_data{}_trained_after_{}_epochs_only_sum_loss_ignr_pad.bin'.format(args.fp16_opt_level,3000,args.num_train_epochs))
	config_file = os.path.join(args.model_dir, 'config_{}_data{}_trained_after_{}_epochs_only_sum_loss_ignr_pad.json'.format(args.fp16_opt_level,3000,args.num_train_epochs))
	torch.save(model.state_dict(), model_file)
	model.config.to_json_file(config_file)
Example #29
def get_model(seed=1234, model_name='gpt2'):
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    enc = GPT2Tokenizer.from_pretrained(
        'D:/OneDrive - whu.edu.cn/桌面/NeuralSteganography-master1/pretrained_model'
    )  # locally pre-downloaded pretrained model
    # enc = GPT2Tokenizer.from_pretrained(model_name)
    enc.unk_token = None
    enc.bos_token = None
    enc.eos_token = None

    model = GPT2LMHeadModel.from_pretrained(
        'D:/OneDrive - whu.edu.cn/桌面/NeuralSteganography-master1/pretrained_model'
    )
    # model = GPT2LMHeadModel.from_pretrained(model_name)

    model.to(device)
    model.eval()
    #model.double()

    return enc, model
Example #30
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device',
                        default='0',
                        type=str,
                        required=False,
                        help='which GPUs to use')
    parser.add_argument('--length',
                        default=-1,
                        type=int,
                        required=False,
                        help='generation length')
    parser.add_argument('--temperature',
                        default=1,
                        type=float,
                        required=False,
                        help='sampling temperature; higher is more random')
    parser.add_argument('--topk',
                        default=8,
                        type=int,
                        required=False,
                        help='top-k: sample from the k most likely tokens at each step')
    parser.add_argument('--topp',
                        default=0,
                        type=float,
                        required=False,
                        help='top-p: cumulative probability threshold for nucleus sampling')
    parser.add_argument('--model_config',
                        default='config/model_config.json',
                        type=str,
                        required=False,
                        help='path to the model config')
    parser.add_argument('--tokenizer_path',
                        default='cache/bud_vocab.txt',
                        type=str,
                        required=False,
                        help='path to the vocabulary file')
    parser.add_argument('--model_path',
                        default='model_bud/',
                        type=str,
                        required=False,
                        help='path to the model')
    parser.add_argument('--save_path',
                        default='generated/',
                        type=str,
                        required=False,
                        help='directory for the generated files')
    parser.add_argument('--articles_per_title',
                        default=5,
                        type=int,
                        required=False,
                        help='how many articles to generate per title')
    parser.add_argument('--titles',
                        default='萧炎',
                        type=str,
                        required=False,
                        help='list of titles, as a single space-separated string')
    parser.add_argument('--titles_file',
                        default='',
                        type=str,
                        required=False,
                        help='file with one title per line; if set, --titles is ignored')
    parser.add_argument('--no_wordpiece',
                        action='store_true',
                        help='do not use WordPiece tokenization')
    parser.add_argument('--segment', action='store_true', help='segment Chinese text at the word level')

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.no_wordpiece:
        from tokenizations import tokenization_bert_without_wordpiece as tokenization_bert
    elif args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # 此处设置程序使用哪些显卡
    length = args.length
    temperature = args.temperature
    topk = args.topk
    topp = args.topp
    titles = args.titles.split()  # list of titles to generate articles for
    if args.titles_file:
        with open(args.titles_file, 'r') as f:
            titles = [line.strip('\n') for line in f.readlines()]
    articles_per_title = args.articles_per_title  # how many articles to generate per title
    save_path = args.save_path  # where to save the generated files

    device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    model_config = pytorch_transformers.GPT2Config.from_json_file(
        args.model_config)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model.to(device)
    model.eval()

    if not os.path.exists(save_path):
        os.mkdir(save_path)
    if length == -1:
        length = model.config.n_ctx // 2
    elif length > model.config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" %
                         model.config.n_ctx)

    for i, title in enumerate(titles):
        for j in range(articles_per_title):
            with open(save_path + str(i) + '-' + str(j), 'w') as f:  # one file per (title, article) pair
                context_tokens = tokenizer.convert_tokens_to_ids(
                    tokenizer.tokenize(title))
                generated = 0
                out = sample_sequence(model=model,
                                      length=length,
                                      context=context_tokens,
                                      temperature=temperature,
                                      top_k=topk,
                                      top_p=topp,
                                      device=device)
                out = out.tolist()

                generated += 1
                text = tokenizer.convert_ids_to_tokens(out[0])

                for i, item in enumerate(text[:-1]):  # make sure English tokens are separated by spaces
                    if is_word(item) and is_word(text[i + 1]):
                        text[i] = item + ' '

                for i, item in enumerate(text):
                    if item == '[MASK]':
                        text[i] = ''
                    if item == '[CLS]' or item == '[SEP]':
                        text[i] = '\n'

                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                text = ''.join(text).replace('##', '').strip()
                # text = ''.join(text.split('\n')[:-1])
                print(text)
                f.write(text)
                print("=" * 80)