def main(args):
    # with open(args.output_dir / 'config.json') as f:
    #     config = json.load(f)

    # loading datasets from jsonl files
    # with open(config['train']) as f:
    #     train = [json.loads(line) for line in f]
    with open(args.valid_data_path) as f:
        valid = [json.loads(line) for line in f]
    # with open(config['test']) as f:
    #     test = [json.loads(line) for line in f]

    logging.info('Collecting documents...')
    documents = [sample['text'] for sample in valid]

    logging.info('Collecting words in documents...')
    tokenizer = Tokenizer(lower=True)
    words = tokenizer.collect_words(documents)

    logging.info('Loading embedding...')
    with open('embedding2.pkl', 'rb') as f:
        embedding = pickle.load(f)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating valid dataset...')
    create_seq2seq_dataset(process_samples(tokenizer, valid),
                           'valid_seq2seq.pkl',
                           tokenizer.pad_token_id)
def main(args):
    # loading datasets from jsonl files
    with open(args.input_data_path) as f:
        valid = [json.loads(line) for line in f]

    logging.info('Collecting documents...')
    documents = [sample['text'] for sample in valid]

    logging.info('Collecting words in documents...')
    tokenizer = Tokenizer(lower=True)
    words = tokenizer.collect_words(documents)

    logging.info('Loading embedding...')
    # Rebuild the embedding from GloVe if needed:
    # embedding = Embedding("./glove.6B.300d.txt", words=words)
    # with open('./embedding.pkl', 'wb') as f:
    #     pickle.dump(embedding, f)
    with open('./embedding.pkl', 'rb') as file:
        embedding = pickle.load(file)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating valid dataset...')
    create_seq2seq_dataset(process_samples(tokenizer, valid),
                           'data.pkl',
                           tokenizer.pad_token_id)
def main(args):
    with open(args.test_input) as f:
        test = [json.loads(line) for line in f]

    logging.info('Collecting documents...')
    documents = [sample['text'] for sample in test]

    logging.info('Collecting words in documents...')
    tokenizer = Tokenizer(lower=True)
    words = tokenizer.collect_words(documents)

    logging.info('Loading embedding...')
    with open(args.embedding_file, 'rb') as f:
        embedding = pickle.load(f)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating test dataset...')
    create_seq2seq_dataset(
        process_samples(tokenizer, test),
        args.test_output,
        tokenizer.pad_token_id
    )
def main(path):
    with open(path) as f:
        test = [json.loads(line) for line in f]

    with open("./datasets/seq_tag/embedding.pkl", "rb") as f:
        embedding = pickle.load(f)
    tokenizer = Tokenizer(lower=True)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating test dataset...')
    create_seq_tag_dataset(process_seq_tag_samples(tokenizer, test),
                           './datasets/seq_tag/test.pkl')
def main(args):
    with open(args.output_dir / 'config.json') as f:
        config = json.load(f)

    # loading datasets from jsonl files
    with open(config['train']) as f:
        train = [json.loads(line) for line in f]
    with open(config['valid']) as f:
        valid = [json.loads(line) for line in f]
    with open(config['test']) as f:
        test = [json.loads(line) for line in f]

    logging.info('Collecting documents...')
    documents = (
        [sample['text'] for sample in train]
        + [sample['summary'] for sample in train]
        + [sample['text'] for sample in valid]
        + [sample['text'] for sample in test]
    )

    logging.info('Collecting words in documents...')
    tokenizer = Tokenizer(lower=config['lower_case'])
    words = tokenizer.collect_words(documents)

    logging.info('Loading embedding...')
    embedding = Embedding(config['embedding'], words=words)
    with open(args.output_dir / 'embedding.pkl', 'wb') as f:
        pickle.dump(embedding, f)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating train dataset...')
    create_seq2seq_dataset(
        process_samples(tokenizer, train),
        args.output_dir / 'train.pkl', config, tokenizer.pad_token_id
    )
    logging.info('Creating valid dataset...')
    create_seq2seq_dataset(
        process_samples(tokenizer, valid),
        args.output_dir / 'valid.pkl', config, tokenizer.pad_token_id
    )
    logging.info('Creating test dataset...')
    create_seq2seq_dataset(
        process_samples(tokenizer, test),
        args.output_dir / 'test.pkl', config, tokenizer.pad_token_id
    )
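# A minimal entry point sketch for the preprocessing main() above. Only
# `output_dir` is assumed here (main() reads config.json from it and writes
# the embedding and dataset pickles back into it); the original script's
# actual argument parsing is not shown.
import argparse
import logging
from pathlib import Path


def parse_args():
    parser = argparse.ArgumentParser(
        description='Build embedding.pkl and train/valid/test seq2seq pickles.')
    parser.add_argument('output_dir', type=Path,
                        help='directory containing config.json; outputs are written here')
    return parser.parse_args()


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main(parse_args())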
def main(args):
    # Read test file
    with open(args.input_dataname) as f:
        test = [json.loads(line) for line in f]

    # Read embedding
    with open(str(args.output_dir) + '/embedding_tag.pkl', 'rb') as f:
        embedding = pickle.load(f)
    tokenizer = Tokenizer(lower=True)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating test dataset...')
    create_seq_tag_dataset(process_seq_tag_samples(tokenizer, test),
                           args.output_dir / 'test_tag.pkl',
                           tokenizer.pad_token_id)
def eval(args):
    batch_size = 32
    train_on_gpu = torch.cuda.is_available()

    enc = RNNEncoder(300, args.embedding_file)
    dec = RNNDecoder(300, args.embedding_file)
    device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
    model = Seq2Seq(enc, dec, device).to(device)

    ckpt = torch.load(args.model_path)
    model.load_state_dict(ckpt['state_dict'])
    model.eval()

    embedding_matrix = pickle.load(open(args.embedding_file, 'rb'))
    tokenizer = Tokenizer(lower=True)
    tokenizer.set_vocab(embedding_matrix.vocab)

    eval_data = pickle.load(open(args.test_data_path, 'rb'))
    eval_loader = DataLoader(eval_data, batch_size=batch_size, num_workers=0,
                             shuffle=False, collate_fn=eval_data.collate_fn)

    output_file = open(args.output_path, 'w')
    val_losses = []
    prediction = {}
    for batch in tqdm(eval_loader):
        pred = model(batch, 0)
        pred = torch.argmax(pred, dim=2)  # [batch, seq_len]
        for i in range(len(pred)):
            # drop the leading <s> token and cut the summary at </s>
            prediction[batch['id'][i]] = tokenizer.decode(
                pred[i]).split('</s>')[0].split(' ', 1)[1]

    pred_output = [
        json.dumps({'id': key, 'predict': value})
        for key, value in sorted(prediction.items(), key=lambda item: item[0])
    ]
    output_file.write('\n'.join(pred_output))
    output_file.write('\n')
    output_file.close()
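# Hypothetical command-line wrapper for eval() above. The flag names simply
# mirror the attributes eval() reads from `args` (embedding_file, model_path,
# test_data_path, output_path); the original script's CLI is not shown, so
# treat this as a sketch, e.g.:
#   python eval_seq2seq.py --embedding_file datasets/seq2seq/embedding.pkl \
#       --model_path src/model_state/seq2seq/ckpt.6.pt \
#       --test_data_path datasets/seq2seq/test.pkl --output_path predict.jsonl
import argparse
import logging


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Generate summaries with a trained seq2seq model.')
    parser.add_argument('--embedding_file', required=True, help='pickled Embedding object')
    parser.add_argument('--model_path', required=True, help='checkpoint from the training script')
    parser.add_argument('--test_data_path', required=True, help='pickled test dataset')
    parser.add_argument('--output_path', required=True, help='where to write the jsonl predictions')
    logging.basicConfig(level=logging.INFO)
    eval(parser.parse_args())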
def main(args):
    with open(args.output_dir / 'config.json', 'r') as f:
        config = json.load(f)

    with open(args.input_data) as f:
        test = [json.loads(line) for line in f]

    with open(os.path.join(args.output_dir, "embedding.pkl"), 'rb') as f:
        embedding = pickle.load(f)
    tokenizer = Tokenizer(lower=True)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating test dataset...')
    create_seq2seq_dataset(process_samples(tokenizer, test),
                           args.output_dir / 'test_seq.pkl',
                           config,
                           tokenizer.pad_token_id)
def main(argv):
    with open(CONFIG, 'r') as f:
        config = json.load(f)

    # loading datasets from jsonl files
    testName = argv[1]
    with open(testName, 'r') as f:
        test = [json.loads(line) for line in f]

    tokenizer = Tokenizer(lower=config['lower_case'])

    logging.info('Loading embedding...')
    with open(ENBEDDINT_NAME, 'rb') as f:
        embedding = pickle.load(f)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating test dataset...')
    create_seq2seq_dataset(process_samples(tokenizer, test),
                           'testSeq2Seq.pkl',
                           config,
                           tokenizer.pad_token_id)
def eval(args):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    BATCH_SIZE = 32
    ENC_HID_DIM = 128
    DEC_HID_DIM = 128
    N_LAYERS = 1
    ENC_DROPOUT = 0.5
    DEC_DROPOUT = 0.5
    PADDING_INDEX = 0

    embedding = pickle.load(open(args.embedding_file, 'rb'))
    tokenizer = Tokenizer(lower=True)
    tokenizer.set_vocab(embedding.vocab)
    embedding_matrix = embedding.vectors.to(device)
    output_dim = len(embedding.vectors)
    embedding_dim = 300

    attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
    encoder = Encoder(embedding_dim, ENC_HID_DIM, DEC_HID_DIM,
                      embedding_matrix, N_LAYERS, ENC_DROPOUT)
    decoder = Decoder(output_dim, embedding_dim, ENC_HID_DIM, DEC_HID_DIM,
                      embedding_matrix, N_LAYERS, DEC_DROPOUT, attn)
    model = Seq2Seq(encoder, decoder, PADDING_INDEX, device).to(device)
    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss(ignore_index=PADDING_INDEX)

    ckpt = torch.load(args.model_path)
    model.load_state_dict(ckpt['state_dict'])
    model.eval()

    eval_data = pickle.load(open(args.test_data_path, 'rb'))
    eval_loader = DataLoader(eval_data, batch_size=BATCH_SIZE, num_workers=0,
                             shuffle=False, collate_fn=eval_data.collate_fn)

    output_file = open(args.output_path, 'w')
    val_losses = []
    prediction = {}
    for batch in tqdm(eval_loader):
        pred, attention = model(batch, 0)  # the 0 is presumably the teacher-forcing ratio
        pred = torch.argmax(pred, dim=2)   # [seq_len, batch]
        pred = pred.permute(1, 0)          # [batch, seq_len]
        for i in range(len(pred)):
            # drop the leading <s> token and cut the summary at </s>
            prediction[batch['id'][i]] = tokenizer.decode(
                pred[i]).split('</s>')[0].split(' ', 1)[1]

    pred_output = [json.dumps({'id': key, 'predict': value})
                   for key, value in sorted(prediction.items(),
                                            key=lambda item: item[0])]
    output_file.write('\n'.join(pred_output))
    output_file.write('\n')
    output_file.close()
def main():
    TRAIN = 'datasets/seq2seq/train.pkl'
    train = pickle.load(open(TRAIN, 'rb'))
    device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

    batch_size = 32
    train_loader = DataLoader(train, batch_size=batch_size, num_workers=0,
                              shuffle=False, collate_fn=train.collate_fn)

    embedding_matrix = pickle.load(open("datasets/seq2seq/embedding.pkl", 'rb'))
    tokenizer = Tokenizer(lower=True)
    tokenizer.set_vocab(embedding_matrix.vocab)

    encoder = RNNEncoder(300, "datasets/seq2seq/embedding.pkl")
    decoder = RNNDecoder(300, "datasets/seq2seq/embedding.pkl")
    model = Seq2Seq(encoder, decoder, device).to(device)
    print(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss(ignore_index=0)  # 0 is the padding index

    n_epochs = 6
    print_every = 2238
    counter = 0
    valid_loss_min = np.Inf

    model.train()
    for epoch in range(1, n_epochs + 1):
        logging.info('Training')
        train_losses = []
        loss = 0
        counter = 0
        for batch in tqdm(train_loader):
            counter += 1
            # batch['text']:        [batch, txt_len]
            # batch['summary']:     [batch, trg_len]
            # batch['padding_len']: [batch]
            optimizer.zero_grad()
            output = model(batch)  # [batch, trg_len, vocab_size]
            output_dim = output.shape[-1]
            # skip the first (<s>) position and flatten for the loss
            output = output[:, 1:, :].reshape(-1, output_dim)        # [batch*(trg_len-1), vocab_size]
            target = batch['summary'][:, 1:].reshape(-1).to(device)  # [batch*(trg_len-1)]
            loss = criterion(output, target)
            train_losses.append(loss.item())
            loss.backward()
            optimizer.step()

        checkpoint_path = f'src/model_state/seq2seq/ckpt.{epoch}.pt'
        torch.save({
            'state_dict': model.state_dict(),
            'epoch': epoch,
        }, checkpoint_path)
        print("Epoch: {}/{}...".format(epoch, n_epochs),
              "Loss: {:.6f}...".format(np.mean(train_losses)))
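# Entry point sketch for the training main() above. All paths are hard-coded
# inside main(), so no CLI arguments are needed; the checkpoint directory is
# created up front because torch.save() does not create missing folders.
import logging
import os


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    os.makedirs('src/model_state/seq2seq', exist_ok=True)
    main()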
# Note: this excerpt starts mid-script; `f` is a config file handle opened
# just above it (not shown here).
config = json.load(f)
tokenizer = Tokenizer(lower=config['lower_case'])
solver = Solver(tokenizer=tokenizer)

arg = sys.argv
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if arg[1] == "train":  # python src/main.py train <batch_size>
    with open("datasets/seq2seq/train.pkl", 'rb') as f:
        train = pickle.load(f)
    with open("datasets/seq2seq/valid.pkl", 'rb') as f:
        valid = pickle.load(f)
    with open("datasets/seq2seq/embedding.pkl", 'rb') as f:
        embedding = pickle.load(f)
    tokenizer.set_vocab(embedding.vocab)
    solver.tokenizer = tokenizer

    batch_size = int(arg[2])

    # number of train/valid batches (ceiling division)
    t_l = len(train)
    if t_l % batch_size == 0:
        t_bl = t_l // batch_size
    else:
        t_bl = t_l // batch_size + 1
    v_l = len(valid)
    if v_l % batch_size == 0:
        v_bl = v_l // batch_size
    else:
        v_bl = v_l // batch_size + 1

    # pre-collate the training data into batches
    train_batches = [
        train.collate_fn([train[j] for j in range(i * batch_size,
                                                  min((i + 1) * batch_size, t_l))])
        for i in range(t_bl)
    ]
    print(train_batches[0]['summary'][0])
def main(args):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    BATCH_SIZE = 32
    ENC_HID_DIM = 128
    DEC_HID_DIM = 128
    N_LAYERS = 1
    ENC_DROPOUT = 0.5
    DEC_DROPOUT = 0.5
    PADDING_INDEX = 0

    embedding = pickle.load(open(args.embedding_file, 'rb'))
    tokenizer = Tokenizer(lower=True)
    tokenizer.set_vocab(embedding.vocab)
    embedding_matrix = embedding.vectors.to(device)
    output_dim = len(embedding.vectors)
    embedding_dim = 300

    attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
    encoder = Encoder(embedding_dim, ENC_HID_DIM, DEC_HID_DIM,
                      embedding_matrix, N_LAYERS, ENC_DROPOUT)
    decoder = Decoder(output_dim, embedding_dim, ENC_HID_DIM, DEC_HID_DIM,
                      embedding_matrix, N_LAYERS, DEC_DROPOUT, attn)
    model = Seq2Seq(encoder, decoder, PADDING_INDEX, device).to(device)
    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss(ignore_index=PADDING_INDEX)

    ckpt = torch.load(args.model_path)
    model.load_state_dict(ckpt['state_dict'])
    model.eval()  # disable dropout while visualizing attention

    eval_data = pickle.load(open(args.test_data_path, 'rb'))
    eval_loader = DataLoader(eval_data, batch_size=BATCH_SIZE, num_workers=0,
                             shuffle=False, collate_fn=eval_data.collate_fn)

    val_losses = []
    prediction = {}
    # run a single batch and visualize the attention weights of its last sample
    for batch in tqdm(eval_loader):
        text = batch['text'].to(device)
        text_len = batch['padding_len']
        truth = batch["summary"].to(device)
        text = text.permute(1, 0)
        truth = truth.permute(1, 0)
        pred, attn_weights = model(text, text_len, truth, 0)
        pred = torch.argmax(pred, dim=2)
        pred = pred.permute(1, 0)
        break

    text = text.permute(1, 0)
    attn_weights = attn_weights.permute(1, 0, 2)

    # first padding position of the input (pad id 0) ...
    for i in range(len(text[-1])):
        if text[-1][i] == 0:
            text_stop = i
            break
    # ... and first end-of-summary token of the prediction (id 2)
    for i in range(len(pred[-1])):
        if pred[-1][i] == 2:
            pred_stop = i
            break

    input_ids = text[-1][0:text_stop]
    attention = attn_weights[-1][1:pred_stop + 1, 0:text_stop]
    output_ids = pred[-1][1:pred_stop + 1]
    showAttention([embedding.vocab[t] for t in input_ids],
                  [embedding.vocab[t] for t in output_ids],
                  attention)