Example #1
def _load_data(self):
    # Read file and tokenize
    self.train_data = LanguageModelingDataset(self.train_file, self.TEXT)
    if self.valid_file:
        self.valid_data = LanguageModelingDataset(self.valid_file, self.TEXT)
    else:
        self.valid_data = None
    if self.test_file:
        self.test_data = LanguageModelingDataset(self.test_file, self.TEXT)
    else:
        self.test_data = None
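The method above relies on a few attributes of its surrounding class. Below is a minimal, hypothetical sketch of such a wrapper; the attribute names self.train_file, self.valid_file, self.test_file, and self.TEXT come from the snippet, while the constructor and the Field configuration are assumptions.

from torchtext.data import Field
from torchtext.datasets import LanguageModelingDataset

class CorpusLoader:
    # Hypothetical container; only the attributes used by _load_data are shown.
    def __init__(self, train_file, valid_file=None, test_file=None):
        self.train_file = train_file
        self.valid_file = valid_file
        self.test_file = test_file
        self.TEXT = Field(lower=True)  # Field settings are an assumption
        self._load_data()

    # _load_data as defined above goes here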
Example #2
def generate_data(config):
    ## Define how the different fields are processed
    path_train = config.data_ori + config.train_path
    path_valid = config.data_ori + config.valid_path
    path_test = config.data_ori + config.test_path

    tokenizer = lambda x: list(x)  # character-level tokenization

    TEXT = Field(batch_first=False, tokenize=tokenizer)
    train = LanguageModelingDataset(path=path_train, text_field=TEXT)
    valid = LanguageModelingDataset(path=path_valid, text_field=TEXT)
    test = LanguageModelingDataset(path=path_test, text_field=TEXT)

    config.train_len = len(train)
    print("number of examples: ", len(train.examples))
    print("train_examples[0] len: ", len(train.examples[0].text))
    print("valid_examples[0] len: ", len(valid.examples[0].text))

    train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
        (train, valid, test),
        batch_size=config.batch_size,
        bptt_len=50,
        device=config.device
    )
    # TEXT.build_vocab(train)
    vectors = Vectors(name=config.data_ori + config.embedding_path, cache="./")
    TEXT.build_vocab(train, max_size=config.vocab_maxsize, min_freq=config.vocab_minfreq, vectors=vectors)
    TEXT.vocab.set_vectors(vectors.stoi, vectors.vectors, vectors.dim)

    config.train_len = len(train.examples[0].text)
    config.test_len = len(test.examples[0].text)
    config.valid_len = len(valid.examples[0].text)

    print("词汇量: ", len(TEXT.vocab))

    return train_iter, valid_iter, test_iter, TEXT
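A quick sketch (not part of the original example) of how the returned iterators are typically consumed: with batch_first=False, each batch from a legacy BPTTIterator exposes .text and .target tensors of shape (bptt_len, batch_size), where .target is .text shifted by one time step. The config object is the same one passed to generate_data.

train_iter, valid_iter, test_iter, TEXT = generate_data(config)

for batch in train_iter:
    tokens, targets = batch.text, batch.target  # each of shape (bptt_len, batch_size)
    # feed `tokens` to a language model and score its predictions against `targets`
    break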
Example #3
def main():

    TEXT = Field(sequential=True, tokenize=tokenizer2, lower=True)
    lang = LanguageModelingDataset(path='./dataset/prideand.txt', text_field=TEXT)
    TEXT.build_vocab(lang, min_freq=3)
    vocab = TEXT.vocab
    vocab.load_vectors('glove.6B.100d')
    vocab_size = vocab.vectors.shape[0]

    device = torch.device('cuda' if GPU else 'cpu')
    model = LSTMLM(vocab_size, EMBEDDING_DIM, NUM_LAYERS, HIDDEN_DIM, GPU).to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    model.init_hidden(128)

    model.set_embed_parameter(vocab.vectors)

    train_loader = BPTTIterator(dataset=lang, batch_size=BATCH_SIZE, bptt_len=30)
    train(EPOCH_NUM, model, device, train_loader, optimizer, vocab_size)
Example #4
def main():
    device = torch.device("cuda" if GPU else "cpu")

    TEXT = Field(sequential=True,
                 tokenize=tokenizer2,
                 lower=True,
                 stop_words=['<eos>'])
    lang = LanguageModelingDataset(path='./iapr_tags.txt', text_field=TEXT)
    TEXT.build_vocab(lang)
    vocab = TEXT.vocab
    vocab_size = len(vocab.freqs) + 1
    #print(vocab.itos)
    #print(vocab.stoi)

    train_data = DcmhDataset('./train_saiapr_mini.csv', vocab.stoi, vocab_size)
    test_data = DcmhDataset('./test_saiapr.csv', vocab.stoi, vocab_size)

    img_model = CNNModel(IMG_SIZE, HASH_CODR_LENGTH).to(device)
    text_model = TextModel(vocab_size, HASH_CODR_LENGTH).to(device)

    train(img_model, text_model, train_data, vocab_size, device)

    img_model.save('model/img_model.t7')
    text_model.save('model/text_model.t7')
Example #5
        output, hidden = model(text)

        # nn.CrossEntropyLoss expects class scores of shape (N, n_classes),
        # so we flatten the predictions across the batch and sequence axes into
        # shape (batch_size * sequence_length, n_tokens) and, correspondingly,
        # reshape the targets to shape (batch_size * sequence_length).
        loss = criterion(output.view(-1, ntokens), targets.view(-1))
        loss.backward()
        optimizer.step()

        #torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)


TEXT = data.Field(lower=True, tokenize=spacy_tok)
dataset = LanguageModelingDataset(path_to_data, TEXT)
TEXT.build_vocab(dataset, vectors="glove.6B.200d")

#train, test = dataset.split()
train_it = BPTTIterator(dataset, batch_size, bptt_len)
#train_it = BPTTIterator(train, batch_size, bptt_len)
#test_it = BucketIterator(test)

ntokens = len(TEXT.vocab)
model = languagemodel.LanguageModel(200, 400, ntokens, 40, 0.1)

# Instantiate word embeddings.
weight_matrix = TEXT.vocab.vectors
model.encoder.weight.data.copy_(weight_matrix)

criterion = nn.CrossEntropyLoss()
# split and write lyrics.txt to train.txt and valid.txt
lyrics_df = pd.read_csv("lyrics.txt",
                        header=None,
                        sep="\n",
                        names=None,
                        index_col=None)
train, valid = train_test_split(lyrics_df,
                                test_size=0.3,
                                random_state=SEED,
                                shuffle=False)
train.to_csv("train.txt", index=None, header=None, sep="\n")
valid.to_csv("valid.txt", index=None, header=None, sep="\n")

# make datasets from train and test txt files
train_dataset = LanguageModelingDataset("train.txt", TEXT)
valid_dataset = LanguageModelingDataset("valid.txt", TEXT)

# build vocab from train set
TEXT.build_vocab(train_dataset)

# make iters from datasets
batch_sz = 64
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_iter, valid_iter = BPTTIterator.splits(
    (train_dataset, valid_dataset),
    batch_sizes=(batch_sz, batch_sz * 2),
    bptt_len=30,
    device=device,
    repeat=False)
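As a follow-up sketch (an assumption, not part of the original), a validation pass over valid_iter could report perplexity using the same flattening trick shown in Example #5; model and criterion are hypothetical here, and the model is assumed to return (output, hidden) as above.

import math
import torch

def evaluate(model, criterion, valid_iter, ntokens):
    # Average token-level cross-entropy over the validation stream, then exponentiate.
    model.eval()
    total_loss, total_tokens = 0.0, 0
    with torch.no_grad():
        for batch in valid_iter:
            output, _ = model(batch.text)
            loss = criterion(output.view(-1, ntokens), batch.target.view(-1))
            total_loss += loss.item() * batch.target.numel()
            total_tokens += batch.target.numel()
    return math.exp(total_loss / total_tokens)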
Example #7
def get_dataset(self, path: str, field=TEXT, newline_eos=False):
    logger.info('loading dataset from {}'.format(path))
    lm_dataset = LanguageModelingDataset(path, text_field=field, newline_eos=newline_eos)
    logger.info('finished loading dataset')
    return lm_dataset
Example #8
from torchtext.data import Field, BPTTIterator
from torchtext.datasets import LanguageModelingDataset
import spacy
spacy_en = spacy.load('en')


def tokenizer2(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]


if __name__ == '__main__':
    W2V = Field()
    TEXT = Field(sequential=True,
                 tokenize=tokenizer2,
                 lower=True,
                 is_target=False)
    lang = LanguageModelingDataset(path='./dataset/prideand.txt',
                                   text_field=TEXT)
    print(lang[0].text[:10])
    TEXT.build_vocab(lang, min_freq=3)
    vocab = TEXT.vocab
    vocab.load_vectors('glove.6B.100d')
    counter = vocab.freqs
    print(vocab.vectors.shape)
    train_loader = BPTTIterator(dataset=lang, batch_size=64, bptt_len=50)
    train_loader.create_batches()
    for idx, data_ in enumerate(train_loader):
        if idx == 1: break
        print(idx)
        print(data_.text[1])
        for i in data_.text[1]:
            print(vocab.itos[i], end=' ')
        print(data_.target[0])