def create_dataset(path_to_dataset,batch_size,split_ratio=0.7,min_vocab_freq=10,max_vocab_size=4000):
	text_field = Field(tokenize="spacy",tokenizer_language="en",batch_first=True,init_token="<sos>",eos_token="<eos>",lower=True)

	def transform(caption):
		caption = caption.strip().lower().split()
		return caption

	dataset = CocoCaptions(annFile=os.path.join(path_to_dataset,"captions_train2014.json"),text_field=text_field,transform=transform)
	train,val = dataset.split(split_ratio=split_ratio)
	test = CocoCaptions(annFile=os.path.join(path_to_dataset,"captions_val2014.json"),text_field=text_field,transform=transform)

	print("Dataset loaded")
	print("Train set size:",len(train))

	text_field.build_vocab(dataset.text,min_freq=min_vocab_freq,max_size=max_vocab_size)
	SOS_TOKEN = text_field.vocab.stoi['<sos>']
	EOS_TOKEN = text_field.vocab.stoi['<eos>']
	UNK_TOKEN = text_field.vocab.stoi['<unk>']
	PAD_TOKEN = text_field.vocab.stoi['<pad>']

	print("Vocabuly build")

	print("Vocabuly statistics")

	print("\nMost common words in the vocabulary:\n",text_field.vocab.freqs.most_common(10))
	print("Size of the vocabulary:",len(text_field.vocab))
	print("Max sequence lenght",dataset.max_seq_len)

	train_iter,val_iter = BucketIterator.splits((train,val),repeat=False,batch_size=batch_size)
	test_iter = BucketIterator(test,batch_size=batch_size,repeat=False,train=False)
	vocab_dict = text_field.vocab.stoi
	return {"data_iters":(train_iter,val_iter,test_iter),"fields":text_field,
	"word_to_num_vocab":vocab_dict,"num_to_word_vocab":{y:x for x,y in vocab_dict.items()},
	"num_classes":len(text_field.vocab),"tokens":(SOS_TOKEN,EOS_TOKEN,UNK_TOKEN,PAD_TOKEN),"max_seq_len":dataset.max_seq_len}
Example #2
def load_dataloaders(args):
    logger.info("Preparing dataloaders...")
    FR = torchtext.data.Field(tokenize=dum_tokenizer, lower=True, init_token="<sos>", eos_token="<eos>",\
                              batch_first=True)
    EN = torchtext.data.Field(tokenize=dum_tokenizer, lower=True, batch_first=True)
    
    train_path = os.path.join("./data/", "df.csv")
    if not os.path.isfile(train_path):
        tokenize_data(args)
    train = torchtext.data.TabularDataset(train_path, format="csv", \
                                             fields=[("EN", EN), ("FR", FR)])
    FR.build_vocab(train)
    EN.build_vocab(train)
    train_iter = BucketIterator(train, batch_size=args.batch_size, repeat=False, sort_key=lambda x: (len(x["EN"]), len(x["FR"])),\
                                shuffle=True, train=True)
    train_length = len(train)
    logger.info("Loaded dataloaders.")
    return train_iter, FR, EN, train_length
Example #3
    def eval(self, test_set):
        print("========test only=======")
        test_iter = BucketIterator(test_set,
                                   batch_size=self.args.batch,
                                   train=False,
                                   shuffle=False,
                                   device=self.model.device,
                                   sort_key=lambda x: len(x.TOKENS))
        with torch.no_grad():
            self.run_a_epoch("final test",
                             test_iter,
                             need_backward=False,
                             epoch_num=0,
                             save_output=os.path.join(self.args.output_path,
                                                      "check"),
                             max_step=ceil(len(test_set) / self.args.batch))
        print(self.model.argumentRoleClsLayer.arg_mask_init.tolist())
        print(self.model.argumentRoleClsLayer.arg_mask.tolist())
Example #4
    def train(self,
              epochs,
              save_path,
              load_previous=True,
              clip=10,
              batch_size=128):
        save_dir = os.path.split(save_path)[0]
        os.makedirs(save_dir, exist_ok=True)

        if load_previous and os.path.exists(save_path):
            self._logger.debug(f'Loading model state from {save_path}')
            self.model.load_state_dict(torch.load(save_path))

        train_iterator, test_iterator = BucketIterator.splits(
            (self.train_data, self.test_data),
            batch_size=batch_size,
            device=self.device,
            sort_key=lambda x: len(x.src))

        optimizer = optim.Adam(self.model.parameters())
        trg_pad_idx = self.trg_field.vocab.stoi[WordToPhonemeModel.PAD_TOKEN]
        criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)

        # Training loop
        self._logger.debug(f'Beginning training for {epochs} epoch(s)')
        for epoch in range(epochs):
            train_loss = self._train_iter(train_iterator, optimizer, criterion,
                                          clip)
            test_loss = self._evaluate_iter(test_iterator, criterion)

            if test_loss < self.best_test_loss:
                # Save model if better
                self.best_test_loss = test_loss
                torch.save(self.model.state_dict(), save_path)

            self._logger.debug(
                f'| Epoch: {epoch+1:03} | Train Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f} | Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |'
            )

        # Save model
        torch.save(self.model.state_dict(), save_path)
        self._logger.info(save_path)

        return self.best_test_loss
Example #5
def load_naive_iterators(args, path, fields):

    train, valid, test = NaiveDatasetClassification.splits(\
                    exts = (args.src_ext, args.trg_ext),
                    fields = fields,
                    root=path)

    # Some stats
    print("Stats for dataset in ", path)
    # Train
    count = Counter()
    for e in train.examples:
        count.update([e.label])
    train_d = dict(count)
    train_d['name'] = "train"

    # Val
    count = Counter()
    for e in valid.examples:
        count.update([e.label])
    val_d = dict(count)
    val_d['name'] = 'val'

    # Test
    count = Counter()
    for e in test.examples:
        count.update([e.label])
    test_d = dict(count)
    test_d['name'] = 'test'
    d_list = [train_d, val_d, test_d]
    df = data_stats.dicts_to_pandas(d_list)
    md = tabulate(df, headers='keys', tablefmt='pipe')
    print(md)

    fields[1].build_vocab(train)
    # Data iterators
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train, valid, test),
        batch_size=args.batch_size,
        sort_within_batch=True,
        sort_key=lambda x: len(x.text),
        device=args.device)

    return train_iterator, valid_iterator, test_iterator, fields[1]
Example #6
    def create(cls, config):

        src_field = Field(init_token='<sos>',
                          eos_token='<eos>',
                          pad_token='<pad>',
                          include_lengths=True)

        trg_field = Field(init_token='<sos>',
                          eos_token='<eos>',
                          pad_token='<pad>',
                          lower=True,
                          include_lengths=True)

        train = TranslationDataset(path=config.train_prefix,
                                   exts=config.exts,
                                   fields=(src_field, trg_field))
        valid = TranslationDataset(path=config.valid_prefix,
                                   exts=config.exts,
                                   fields=(src_field, trg_field))

        test = TranslationDataset(path=config.test_prefix,
                                  exts=config.exts,
                                  fields=(src_field, trg_field))

        train_it, valid_it, test_it = BucketIterator.splits(
            [train, valid, test],
            batch_sizes=config.batch_sizes,
            sort_key=TranslationDataset.sort_key,
            device=-1)

        src_field.build_vocab(train, min_freq=10)
        trg_field.build_vocab(train, min_freq=10)

        src_voc = src_field.vocab
        trg_voc = trg_field.vocab

        model = Seq2Seq.create(src_voc, trg_voc, config)

        if config.use_cuda:
            model = model.cuda()

        return Trainer(model, train_it, valid_it, test_it, config.valid_step,
                       config.checkpoint_path, config.pool_size)
Example #7
def load_dataset(data_dir, word_dir, batch_size, device):
    TEXT = data.Field(batch_first=True, include_lengths=True, lower=True)
    LABEL = data.LabelField(batch_first=True)
    
    fields = {'sentence1':  ('premise', TEXT),
              'sentence2':  ('hypothesis', TEXT),
              'gold_label': ('label', LABEL)}
    
    trainDataset, valDataset, testDataset = data.TabularDataset.splits(
                                                path=data_dir,
                                                format='json',
                                                train='snli_1.0_train.jsonl',
                                                validation='snli_1.0_dev.jsonl',
                                                test='snli_1.0_test.jsonl',
                                                fields=fields,
                                                filter_pred=lambda x: x.label != '-'
                                            )
    
    vectors = Vectors('glove.6B.200d.txt', word_dir)
    
    TEXT.build_vocab(trainDataset, vectors=vectors, unk_init=nn.init.xavier_uniform)
    LABEL.build_vocab(trainDataset)
    
    train_iter, val_iter = BucketIterator.splits(
                                                 datasets=(trainDataset, valDataset),
                                                 batch_sizes=(batch_size, batch_size),
                                                 device=device,
                                                 sort_key=lambda x: len(x.premise) + len(x.hypothesis),
                                                 sort_within_batch=True,
                                                 repeat=False,
                                                 shuffle=True
                                                )
    
    test_iter = Iterator(
                         dataset=testDataset,
                         batch_size=batch_size,
                         device=device,
                         sort=False,
                         repeat=False,
                         shuffle=False
                        )
    
    return TEXT, LABEL, train_iter, val_iter, test_iter
Example #8
    def score(self, sentences: List[str]) -> np.ndarray:
        results = []

        fields = [('data', self.field)]
        examples = [Example.fromlist([s], fields) for s in sentences]
        dataset = Dataset(examples, fields)
        dataloader = BucketIterator(dataset,
                                    self.config.metrics.classifier.batch_size,
                                    repeat=False,
                                    shuffle=False,
                                    device=self.config.device_name)

        for batch in dataloader:
            scores = self.model(batch.data)
            scores = torch.sigmoid(scores)
            scores = scores.detach().cpu().numpy().tolist()
            results.extend(scores)

        return np.mean(results)
Example #9
    def splits(cls, train, valid, batch_size=2, device="cuda"):
        train_dataset = train.get("Dataset")
        valid_dataset = valid.get("Dataset")
        field_names = [field[0] for field in train.get("Field")]
        device = device

        train_iter, valid_iter = BucketIterator.splits(
            (train_dataset, valid_dataset),
            batch_size=batch_size,
            device=device,
            sort_key=lambda x: len(vars(x)[field_names[0]]),
            sort_within_batch=True)

        train_dataloader = MiniBatchWrapper(train_iter, field_names[0],
                                            field_names[1])
        valid_dataloader = MiniBatchWrapper(valid_iter, field_names[0],
                                            field_names[1])

        return train_dataloader, valid_dataloader
    def prepare_data(self):
        self.text_field = Field(sequential=True,
                                fix_length=200,
                                include_lengths=True)
        self.label_field = LabelField()

        train_val, test = IMDB.splits(self.text_field, self.label_field)
        random.seed(42)
        train, val = train_val.split(random_state=random.getstate())

        self.text_field.build_vocab(
            train, vectors=GloVe())  #vectors=FastText('simple'))
        self.label_field.build_vocab(train)

        self.train_iter, self.test_iter, self.val_iter = BucketIterator.splits(
            (train, test, val), batch_size=self.batch_size)

        self.train_iter.sort_within_batch = True
        self.val_iter.sort_within_batch = True
Example #11
def load_or_generate_dataset(batch_size=128):
    download_spacy_models()  # TODO check if need to download then do it
    spacy_en, spacy_de = load_tokenize_models()

    SRC = Field(tokenize=partial(tokenize_de, spacy_de=spacy_de),
                init_token=START_TOKEN,
                eos_token=END_TOKEN,
                lower=True)
    TRG = Field(tokenize=partial(tokenize_en, spacy_en=spacy_en),
                init_token=START_TOKEN,
                eos_token=END_TOKEN,
                lower=True)
    train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                        fields=(SRC, TRG))

    logging.debug(f"Number of training examples: {len(train_data.examples)}")
    logging.debug(f"Number of validation examples: {len(valid_data.examples)}")
    logging.debug(f"Number of testing examples: {len(test_data.examples)}")

    SRC.build_vocab(train_data, min_freq=2)
    TRG.build_vocab(train_data, min_freq=2)

    logging.debug(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
    logging.debug(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=batch_size,
        device=get_available_device())

    # TODO class
    return {
        'train_data': train_iterator,
        'valid_data': valid_iterator,
        'test_data': test_iterator,
        'n_src_words': len(SRC.vocab),
        'n_trg_words': len(TRG.vocab),
        'trg_pad_idx': TRG.vocab.stoi[TRG.pad_token],
        'src_vocab': SRC.vocab,
        'trg_vocab': TRG.vocab,
        'src_field': SRC,
        'trg_field': TRG,
    }
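A minimal sketch of consuming the dictionary returned above; the batch size is a placeholder:

loaded = load_or_generate_dataset(batch_size=128)
train_iter = loaded['train_data']
pad_idx = loaded['trg_pad_idx']

for batch in train_iter:
    src, trg = batch.src, batch.trg  # Multi30k batches expose .src and .trg
    break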
def load_iters(batch_size=1, device="cpu", data_path='data'):
    TEXT = data.Field(batch_first=True, include_lengths=True, lower=True)
    LABEL = data.LabelField(batch_first=True)
    fields = {
        'sentence1': ('premise', TEXT),
        'sentence2': ('hypothesis', TEXT),
        'gold_label': ('label', LABEL)
    }

    train_data, dev_data, test_data = data.TabularDataset.splits(
        path=data_path,
        train='snli_1.0_train.jsonl',
        validation='snli_1.0_dev.jsonl',
        test='snli_1.0_test.jsonl',
        format='json',
        fields=fields,
        filter_pred=lambda ex: ex.label != '-'  # filter out examples whose label is '-' (i.e. unlabeled)
    )

    TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(train_data)
    # TEXT.build_vocab(train_data, vectors=vectors, unk_init=torch.Tensor.normal_)

    train_iter, dev_iter = BucketIterator.splits(
        (train_data, dev_data),
        batch_sizes=(batch_size, batch_size),
        device=device,
        sort_key=lambda x: len(x.premise) + len(x.hypothesis),  # sort by combined text length
        sort_within_batch=True,  # makes pack/pad easier for the PyTorch LSTM later
        repeat=False,
        shuffle=True)

    test_iter = Iterator(test_data,
                         batch_size=batch_size,
                         device=device,
                         sort=False,
                         sort_within_batch=False,
                         repeat=False,
                         shuffle=False)

    return train_iter, dev_iter, test_iter, TEXT, LABEL
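Because TEXT is built with include_lengths=True, each text attribute on a batch comes back as a (token_ids, lengths) pair. A minimal consumption sketch for the loader above, using its defaults:

train_iter, dev_iter, test_iter, TEXT, LABEL = load_iters(batch_size=32)

for batch in train_iter:
    premise_ids, premise_lens = batch.premise  # include_lengths=True yields (ids, lengths)
    hypothesis_ids, hypothesis_lens = batch.hypothesis
    labels = batch.label
    break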
Example #13
def init(model_config, device='cpu'):
    logging.critical("[CRITICAL] %s device is selected" % device)
    logging.info('[INFO] Using directory %s for the translation pair with filename %s' % (os.path.abspath(model_config['global']['dataset_path']), model_config['global']['translate_pair']))
    #initialize the field for src language
    src_field = Field(tokenize = english_tokenizer, 
                init_token = '<sos>', 
                eos_token = '<eos>', 
                lower = True)
    #initialize the field for trg language
    trg_field = Field(tokenize = hindi_tokenizer, 
                init_token = '<sos>', 
                eos_token = '<eos>', 
                lower = True)
    train_data, valid_data, test_data = load_datasets(model_config['global']['dataset_path'], model_config['global']['dataset_file_names'], model_config['global']['translate_pair'], model_config['global']['lang_extensions'], [src_field, trg_field])
    #initialize the vocabulary
    src_field.build_vocab(train_data, min_freq = 1)
    trg_field.build_vocab(train_data, min_freq = 1)
    #display dataset stats
    print_dataset_statistics(train_data, valid_data, test_data, model_config['global']['lang_extensions'], [src_field, trg_field])
    model = create_seq2seq_model(model_config, len(src_field.vocab), len(trg_field.vocab), device)
    optimizer = optim.Adam(model.parameters())
    #defining the loss function
    loss_function = nn.CrossEntropyLoss(ignore_index = trg_field.vocab.stoi[trg_field.pad_token])

    logging.info(model.apply(init_weights))
    logging.info('[INFO] Model has %s trainable parameters' % (count_parameters(model)))
    logging.info('[INFO] About to start the primary training loop')
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data), 
        batch_size = model_config['global']['batch_size'], 
        device = device)
    cache_file_name = "%s-%s-%s-epoch-%s.pt" % (model_config['global']['name'], model_config['global']['lang_extensions'][0], model_config['global']['lang_extensions'][1], model_config['global']['epochs'])
    cache_file_path = os.path.join(model_config['global']['cache_path'], cache_file_name)
    stats = execute_training_loop(model, train_iterator, valid_iterator, loss_function, optimizer, model_config['global']['clip_value'], src_field, trg_field, epochs=model_config['global']['epochs'], model_cache_path=os.path.abspath(cache_file_path))
    
    stats_file_name = "%s-%s-%s-epoch-%s-stats.pickle" % (model_config['global']['name'], model_config['global']['lang_extensions'][0], model_config['global']['lang_extensions'][1], model_config['global']['epochs'])
    store_object(stats, os.path.join(model_config['global']['cache_path'], stats_file_name))

    logging.info("[INFO] loading the model %s" % (cache_file_name))
    model.load_state_dict(torch.load(os.path.abspath(cache_file_path)))
    test_loss, test_bleu = evaluate_model(model, test_iterator, loss_function, src_field, trg_field)
    logging.info(f'[INFO] | Test Loss: {test_loss:.3f} Test Bleu: {test_bleu:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')
Example #14
def main():
    train, test, field = dataset_reader(train=True, stop=900000)
    evl, _ = dataset_reader(train=False, fields=field)
    field.build_vocab(train, evl)
    _, evl_iter = BucketIterator.splits((train, evl),
                                        batch_sizes=(1024, 1024),
                                        device=device,
                                        sort_within_batch=False,
                                        repeat=False,
                                        sort=False)

    model = RNNCNN(num_embeddings=len(field.vocab),
                   embedding_dim=256).to(device)
    model.load_state_dict(torch.load('model/rnn_cnn_model.pkl'))
    with open('data/rnn_cnn_result.txt', 'w+') as f:
        f.write('')

    model.eval()
    with torch.no_grad():
        for i, data in tqdm.tqdm(enumerate(evl_iter),
                                 total=evl_iter.__len__()):
            inputs = torch.cat((
                data.plat_form,
                data.biz_type,
                data.payed_day,
                data.payed_hour,
                data.cate1_id,
                data.cate2_id,
                data.cate3_id,
                data.preselling_shipped_day,
                data.seller_uid_field,
                data.company_name,
                data.rvcr_prov_name,
                data.rvcr_city_name,
            ),
                               dim=1)
            outputs = model(inputs, 'test', field)
            day = outputs * 3 + 3
            with open('data/rnn_cnn_result.txt', 'a+') as f:
                for b in range(day.size(0)):
                    sign_day = str(float(day[b]))
                    f.write(sign_day + '\n')
Example #15
def main():
    setup_seed(2020)
    POST = Field(tokenize=tokenize_post, init_token='<sos>', eos_token='<eos>')
    QUERY = Field(tokenize=tokenize_query,
                  init_token='<sos>',
                  eos_token='<eos>')
    RESP = Field(tokenize=tokenize_resp, init_token='<sos>', eos_token='<eos>')
    # first 'post' is the key in loaded json, second 'post' is the key in batch
    fields = {
        'post': ('post', POST),
        'query': ('query', QUERY),
        'resp': ('resp', RESP)
    }
    train_data, valid_data, test_data = TabularDataset.splits(
        path='./data',
        train='train.json',
        validation='valid.json',
        test='test.json',
        format='json',
        fields=fields)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    POST.build_vocab(train_data, min_freq=1)
    QUERY.build_vocab(train_data, min_freq=1)
    RESP.build_vocab(train_data, min_freq=1)
    # print(POST.vocab.__dict__)

    batch_size = 10
    train_iter, val_iter, test_iter = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_sizes=(batch_size, batch_size, batch_size),
        device=device,
        sort_key=lambda x: len(x.post),
        sort_within_batch=True,
        # sort according to the len, for padding in LSTM
        repeat=False)
    cnt = 0
    for i, batch in enumerate(train_iter):
        if cnt == 0:
            post = batch.post
            print(post.size())
            print(batch.post)
        cnt += 1
Example #16
def eval_dataset(model,
                 dataset,
                 batch_size,
                 loss_fn,
                 device,
                 text_embeds,
                 optimizer,
                 stage,
                 csv_file,
                 update_grad=False):
    cols = ['loss', 'acc']
    (iterator, ) = BucketIterator.splits(datasets=(dataset, ),
                                         batch_sizes=[batch_size],
                                         device=device,
                                         shuffle=True)
    metrics = []
    for batch in iterator:
        (prem_embeds, prem_lens, hyp_embeds, hyp_lens,
         labels) = batch_cols(batch, text_embeds)
        predictions = model.forward(prem_embeds, prem_lens, hyp_embeds,
                                    hyp_lens)
        loss = loss_fn(predictions, labels)
        acc = accuracy(predictions, labels)
        vals = [loss, acc]
        stats = get_stats(cols, vals)
        metrics.append(stats)
        print(
            yaml.dump({
                stage: {
                    k: round(i, 3) if isinstance(i, float) else i
                    for k, i in stats.items()
                }
            }))
        if update_grad:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    df = pd.DataFrame(metrics, columns=cols)
    df.to_csv(csv_file)
    (loss, acc) = list(df.mean())
    return (loss, acc, df)
Example #17
    def _set_data(self, dataset: str, batch_size: int):
        train_ds, valid_ds, test_ds, TEXT = load_real_dataset(dataset)

        self._reverse = lambda x: TEXT.reverse(x)

        self.first_token = TEXT.vocab.stoi["<sos>"]
        self.vocab_size = len(TEXT.vocab)
        self.length = TEXT.max_length
        self.train_batchmanager, self.val_batchmanager, self.test_batchmanager = BucketIterator.splits(
            (train_ds, valid_ds, test_ds),
            batch_sizes=(batch_size, 2 * batch_size, 2 * batch_size),
            device="cpu",
            sort_key=lambda x: len(x.text),
            sort_within_batch=False,
            repeat=False)

        if self.evaluate_test:
            self.eval_test = MetricsEval(test_ds, TEXT, "Test")
        if self.evaluate_valid:
            self.eval_valid = MetricsEval(valid_ds, TEXT, "Valid")
Example #18
    def __init__(self, data_path, batch_size, device, embedding=None):
        self.TEXT = Field(sequential=True, tokenize=self.tokenize, lower=True, include_lengths = True)
        self.LABEL = Field(sequential=False, use_vocab=False)
        self.datafield = [("id", None), ("sentence", self.TEXT), ("label", self.LABEL)]
        train_data, dev_data_a, test_data_a = TabularDataset.splits(path=data_path, train='train.csv',
                                                                    validation="dev_a.csv", test="test_a.csv",
                                                                    format='csv', skip_header=True, fields=self.datafield)
        dev_data_b, test_data_b = TabularDataset.splits(path=data_path, validation="dev_b.csv", test="test_b.csv",
                                                        format='csv', skip_header=True, fields=self.datafield)
        self.TEXT.build_vocab(train_data)
        if embedding:
            self.TEXT.vocab.load_vectors(embedding)
            self.embedding = self.TEXT.vocab.vectors.to(device)
        else:
            self.embedding = None
        # self.train_iter, self.val_iter = BucketIterator(train_data, batch_size=batch_size, device=device,
        #                                                 sort_key=lambda x: len(x.sentence), sort_within_batch=True)
        self.train_iter, self.dev_a_iter, self.test_a_iter, self.dev_b_iter, self.test_b_iter = \
            BucketIterator.splits((train_data, dev_data_a, test_data_a, dev_data_b, test_data_b), batch_size=batch_size,
                                  sort_within_batch=True, sort_key=lambda x: len(x.sentence), device=device)
Example #19
    def predict(self, input_text):
        input_fields = [('id', self.ID), ('text', self.TEXT),
                        ('label', self.LABEL)]
        input_example = Example()
        input_example = input_example.fromlist([1, input_text, 1],
                                               input_fields)
        input_dataset = Dataset([input_example], input_fields)
        input_iter = BucketIterator(input_dataset,
                                    batch_size=1,
                                    device=self.device,
                                    repeat=False)

        pred, attn, prob = test_model(self.encoder, self.classifier, 1,
                                      input_iter)

        pred = pred[0][1]
        prob = prob[0][1]
        attn = attn[0].tolist()
        attn = [i[0] for i in attn]
        return pred, prob, attn
Example #20
def get_iterator(datafields, data_dir, bs):
    trn, vld = TabularDataset.splits(
        path=data_dir,
        train='train.csv', validation="val.csv",
        format='csv',
        skip_header=True,
        fields=datafields)
    _, label_field = datafields[0]
    _, text_field = datafields[1]

    text_field.build_vocab(trn, vectors="glove.6B.100d")
    label_field.build_vocab(trn)

    train_iterator, valid_iterator = BucketIterator.splits(
        (trn, vld),
        batch_size=bs,
        sort_key=lambda x: len(x.text),
        device=device)

    return train_iterator, valid_iterator
def load_iters(batch_size=32, device="cpu", data_path='data', vectors=None):
    TEXT = data.Field(lower=True, batch_first=True, include_lengths=True)
    LABEL = data.LabelField(batch_first=True)
    train_fields = [(None, None), (None, None), ('text', TEXT),
                    ('label', LABEL)]
    test_fields = [(None, None), (None, None), ('text', TEXT)]

    train_data = data.TabularDataset.splits(
        path=data_path,
        train='train.tsv',
        format='tsv',
        fields=train_fields,
        skip_header=True)[0]  # return is a tuple.

    test_data = data.TabularDataset.splits(path='data',
                                           train='test.tsv',
                                           format='tsv',
                                           fields=test_fields,
                                           skip_header=True)[0]

    TEXT.build_vocab(train_data.text, vectors=vectors)
    LABEL.build_vocab(train_data.label)
    train_data, dev_data = train_data.split([0.8, 0.2])

    train_iter, dev_iter = BucketIterator.splits(
        (train_data, dev_data),
        batch_sizes=(batch_size, batch_size),
        device=device,
        sort_key=lambda x: len(x.text),
        sort_within_batch=True,
        repeat=False,
        shuffle=True)

    test_iter = Iterator(test_data,
                         batch_size=batch_size,
                         device=device,
                         sort=False,
                         sort_within_batch=False,
                         repeat=False,
                         shuffle=False)
    return train_iter, dev_iter, test_iter, TEXT, LABEL
Example #22
def getdata():
    europarl_en = open('europarl-v7.cs-en.en',
                       encoding='utf-8').read().split('\n')
    europarl_cs = open('europarl-v7.cs-en.cs',
                       encoding='utf-8').read().split('\n')

    EN_TEXT = Field(tokenize=tokenize_en)
    CS_TEXT = Field(tokenize=tokenize_cs,
                    init_token="<sos>",
                    eos_token="<eos>")

    raw_data = {
        'English': [line for line in europarl_en],
        'Czech': [line for line in europarl_cs]
    }
    df = pd.DataFrame(raw_data, columns=["English", "Czech"])
    # remove very long sentences and sentences where translations are not of roughly equal length
    df['en_len'] = df['English'].str.count(' ')
    df['cs_len'] = df['Czech'].str.count(' ')
    df = df.query('cs_len < 80 & en_len < 80')
    df = df.query('cs_len < en_len * 1.5 & cs_len * 1.5 > en_len')
    train, val = train_test_split(df, test_size=0.1)
    train.to_csv("train.csv", index=False)
    val.to_csv("val.csv", index=False)
    data_fields = [('English', EN_TEXT), ('Czech', CS_TEXT)]
    train, val = TabularDataset.splits(path='./',
                                       train='train.csv',
                                       validation='val.csv',
                                       format='csv',
                                       fields=data_fields)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    EN_TEXT.build_vocab(train, min_freq=2)
    CS_TEXT.build_vocab(train, min_freq=2)
    BATCH_SIZE = 16
    INPUT_DIM = len(EN_TEXT.vocab)
    OUTPUT_DIM = len(CS_TEXT.vocab)
    PAD_IDX = EN_TEXT.vocab.stoi['<pad>']

    train_iterator, valid_iterator = BucketIterator.splits(
        (train, val), batch_size=BATCH_SIZE, device=device)
    return train_iterator, valid_iterator, INPUT_DIM, OUTPUT_DIM, PAD_IDX
def load_dataset(db_name, batch_size):
    """
    Load the csv datasets into torchtext files

    Inputs:
    db_name (string)
       The name of the dataset. This name must correspond to the folder name.
    batch_size
       The batch size
    """
    print "Loading " + db_name + "..."
    i = 1
    print('num', i)

    tokenize = lambda x: x.split()
    TEXT = Field(sequential=True, tokenize=tokenize, lower=True)
    LABEL = Field(sequential=False, use_vocab=False)

    tv_datafields = [("sentence", TEXT), ("label", LABEL)]

    trn, vld = TabularDataset.splits(
        path=DATA_ROOT + db_name,  # the root directory where the data lies
        train='train.csv',
        validation="test.csv",
        format='csv',
        skip_header=False,
        fields=tv_datafields)

    TEXT.build_vocab(trn)

    print "vocab size: %i" % len(TEXT.vocab)

    train_iter, val_iter = BucketIterator.splits(
        (trn, vld),
        batch_sizes=(batch_size, batch_size),
        device=-1,  # -1 means don't use the GPU
        sort_key=lambda x: len(x.sentence),  # sort the sentences by length
        sort_within_batch=False,
        repeat=False)

    return train_iter, val_iter, len(TEXT.vocab)
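A hypothetical call sketch for the loader above, assuming DATA_ROOT ends with a separator and DATA_ROOT + "imdb" contains train.csv and test.csv; the dataset name and batch size are placeholders:

train_iter, val_iter, vocab_size = load_dataset("imdb", batch_size=64)

for batch in train_iter:
    sentences, labels = batch.sentence, batch.label  # attribute names follow tv_datafields
    break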
Example #24
def get_iterators(opt, device=None):
    """
    Get dataset iterator and necessary fields information
    :param opt: opt from argparser.
    :param device: device to create the data.
    :return: train_iter, test_iter, dataset.fields
    """
    import random
    random.seed(42)
    dataset = load_dataset(opt.use_cws)
    dataset.fields["text"].build_vocab(dataset)
    dataset.fields["author"].build_vocab(dataset)
    dataset.fields["book"].build_vocab(dataset)
    train, test = dataset.split(split_ratio=0.7)
    train_iter, test_iter = BucketIterator.splits(
        (train, test),  # first one is default to train (shuffle each epoch)
        batch_sizes=(opt.train_batch_size, opt.eval_batch_size),
        device=device,
        sort_key=lambda x: len(x.text),
        sort_within_batch=True)
    return train_iter, test_iter, dataset.fields
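A sketch of the argparse options that get_iterators reads from opt; the option names come from the code above, while the defaults are assumptions for illustration:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--use_cws", action="store_true")        # forwarded to load_dataset()
parser.add_argument("--train_batch_size", type=int, default=32)
parser.add_argument("--eval_batch_size", type=int, default=64)
opt = parser.parse_args()

train_iter, test_iter, fields = get_iterators(opt, device="cpu")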
Example #25
def load_dataset(batch_size):
    spacy_de = spacy.load('de')
    spacy_en = spacy.load('en')
    url = re.compile('(<url>.*</url>)')

    def tokenize_de(text):
        return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))]

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]

    DE = Field(tokenize=tokenize_de, include_lengths=True,
               init_token='<sos>', eos_token='<eos>')
    EN = Field(tokenize=tokenize_en, include_lengths=True,
               init_token='<sos>', eos_token='<eos>')
    train, val, test = Multi30k.splits(exts=('.de', '.en'), fields=(DE, EN))
    DE.build_vocab(train.src, min_freq=2)
    EN.build_vocab(train.trg, max_size=10000)
    train_iter, val_iter, test_iter = BucketIterator.splits(
            (train, val, test), batch_size=batch_size, repeat=False)
    return train_iter, val_iter, test_iter, DE, EN
Example #26
def create_iterator(
    corpus: Corpus,
    batch_size: int = 1,
    device: str = "cpu",
    sort: Optional[bool] = None,
) -> BucketIterator:
    if sort is not None:
        sort_key: Optional[Callable] = lambda e: len(e.sen)
    else:
        sort_key = None

    iterator = BucketIterator(
        dataset=corpus,
        batch_size=batch_size,
        device=device,
        shuffle=False,
        sort=sort,
        sort_key=sort_key,
    )

    return iterator
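A usage sketch for create_iterator; the corpus object and its sen attribute come from the surrounding project, so this is illustrative only:

iterator = create_iterator(corpus, batch_size=32, device="cpu", sort=True)  # sort groups similar-length sentences

for batch in iterator:
    sentence_batch = batch.sen  # attribute name matches the sort_key above
    break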
def data_iter(train_data, valid_data, TEXT, LABEL):
    train = MyDataset(train_data,
                      text_field=TEXT,
                      label_field=LABEL,
                      test=False)
    valid = MyDataset(valid_data,
                      text_field=TEXT,
                      label_field=LABEL,
                      test=False)

    TEXT.build_vocab(train)
    train_iter, val_iter = BucketIterator.splits(
        (train, valid),  # the datasets to build iterators for
        batch_sizes=(100, 100),
        # if using a GPU, replace -1 here with the GPU index
        device=device,
        # the BucketIterator needs to be told what function it should use to group the data.
        sort_key=lambda x: len(x.text),
        sort_within_batch=False,
        repeat=False)
    return train_iter, val_iter
def load_dataset(batch_size, device):
    """
    Load the dataset from the files into iterator and initialize the vocabulary
    :param batch_size
    :param device
    :return: source and data iterators
    """
    source = Field(tokenize=tokenize_en,
                   init_token='<sos>',
                   eos_token='<eos>',
                   lower=True)

    train_data, valid_data, test_data = TranslationDataset.splits(
        path=DATA_FOLDER,
        exts=(POSITIVE_FILE_EXTENSION, NEGATIVE_FILE_EXTENSION),
        fields=(source, source))
    source.build_vocab(train_data, min_freq=5)
    return source, BucketIterator.splits((train_data, valid_data, test_data),
                                         shuffle=True,
                                         batch_size=batch_size,
                                         device=device)
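A minimal sketch of how the return value above unpacks, since BucketIterator.splits yields one iterator per dataset; the batch size and device are placeholders:

source, (train_iter, valid_iter, test_iter) = load_dataset(batch_size=64, device="cpu")
pad_idx = source.vocab.stoi[source.pad_token]

for batch in train_iter:
    src, trg = batch.src, batch.trg  # both sides share the same `source` field
    break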
Example #30
def get_data_iterator(batch_size, device):

	SRC = Field(tokenize = tokenize_de,
           init_token = '<sos>',
           eos_token = '<eos>',
           lower = True)
	TRG = Field(tokenize = tokenize_en,
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)

	train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), fields = (SRC, TRG)) 
	SRC.build_vocab(train_data, min_freq = 2)
	TRG.build_vocab(train_data, min_freq = 2)

	train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
	    (train_data, valid_data, test_data),
	    batch_size = batch_size,
	    device = device)

	return train_iterator, valid_iterator, test_iterator, SRC, TRG
Example #31
def load_dataset(batch_size, device=0):
    spacy_de = spacy.load('de')
    spacy_en = spacy.load('en')

    def tokenize_de(text):
        return [tok.text for tok in spacy_de.tokenizer(text)]

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(text)]

    DE = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>')
    EN = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>')

    train, val, test = Multi30k.splits(exts=('.de', '.en'), fields=(DE, EN))

    DE.build_vocab(train.src)
    EN.build_vocab(train.trg)

    train_iter, val_iter, test_iter = BucketIterator.splits(
        (train, val, test), batch_size=batch_size, device=device, repeat=False)
    return train_iter, val_iter, test_iter, DE, EN
Example #32
def load_dataset(batch_size, debug=True, shuffle_dataset=True):
    spacy_en = spacy.load('en')

    def tokenize_en(line):
        return [token.text for token in spacy_en.tokenizer(line)]

    def tokenize_zh(line):
        return [token for token in jieba.cut(line)]

    EN = Field(tokenize=tokenize_en,
               include_lengths=True,
               init_token='<sos>',
               eos_token='<eos>')
    ZH = Field(tokenize=tokenize_zh,
               include_lengths=True,
               init_token='<sos>',
               eos_token='<eos>')

    exts = ['.en', '.zh']
    fields = [('src', EN), ('trg', ZH)]
    train_dataset = TranslationDataset(train_sentence_path,
                                       exts=exts,
                                       fields=fields)
    val_dataset = TranslationDataset(val_sentence_path,
                                     exts=exts,
                                     fields=fields)
    print('Datasets Built!')

    EN.build_vocab(train_dataset.src, min_freq=2)
    ZH.build_vocab(train_dataset.trg, max_size=100000)
    print('Vocabularies Built!')

    val_iter, *_ = BucketIterator.splits(
        (val_dataset, ),
        shuffle=shuffle_dataset,
        batch_size=batch_size,
        repeat=False,
        sort_key=lambda x: interleave_keys(len(x.src), len(x.trg)))
    print('Validation Iterator Built!')
    return val_iter, val_dataset, ZH, EN
# multi30k dataloader
train,val,test = datasets.Multi30k.splits(exts=(".en",".de"),fields=(EN,DE),root=data_path)

# wmt14 dataloader (better than using datasets.WMT14.splits since it's slow)
#train,val,test = datasets.TranslationDataset.splits(exts=(".en",".de"),fields=[("src",EN),("trg",DE)],path=os.path.join(data_path,"wmt14"),
#	train="train.tok.clean.bpe.32000",validation="newstest2013.tok.bpe.32000",test="newstest2014.tok.bpe.32000")

print("Dataset loaded")

EN.build_vocab(train.src,min_freq=3)
DE.build_vocab(train.trg,max_size=50000)

print("Vocabularies build")

train_iter,val_iter = BucketIterator.splits((train, val),batch_size=3)
test_iter = BucketIterator(test,batch_size=3)

print("Start iterating through data")

for i,batch in enumerate(train_iter):
	print(batch.src) # the source language
	print(batch.trg) # the target language
	break

for i,batch in enumerate(val_iter):
	print(batch.src) # the source language
	print(batch.trg) # the target language
	break

for i,batch in enumerate(test_iter):
	print(batch.src) # the source language
	print(batch.trg) # the target language
	break

 skip_header=True, 
 fields=[('id',None),('text',TEXT),('label',LABEL)], 
 filter_pred = lambda x: True if len(x.text) > 1 else False) 
# only keep examples whose token-level sentence length is greater than 1

TEXT.build_vocab(train_data,min_freq=2)
LABEL.build_vocab(train_data)

# print (TEXT.vocab)
# print (len(TEXT.vocab),len(LABEL.vocab))

# print (TEXT.vocab.itos[:5])
# print (LABEL.vocab.itos)

train_loader, test_loader = BucketIterator.splits((train_data,test_data),sort_key=lambda x:len(x.text), sort_within_batch=True,
 repeat=False,shuffle=True,
 batch_size=32,device=DEVICE)

for batch in train_loader:
    
    break

EPOCH = 5
BATCH_SIZE = 32
EMBED = 300
KERNEL_SIZES = [3,4,5]
KERNEL_DIM = 100
LR = 0.001

# model = CNNClassifier(len(TEXT.vocab), EMBED, 1, KERNEL_DIM, KERNEL_SIZES)
model = RNN(len(TEXT.vocab), EMBED, KERNEL_DIM, 1, bidirec=False)