# Smoke-test driver for the word-embedding (negative-sampling) data pipeline:
# build a vocab, index the corpus, pull one shuffled minibatch, and split it
# into positive inputs vs. noise samples.
# NOTE(review): `filename`, `idf_file`, `PADt`, `torch`, and the data_utils
# classes (Vocab, Txtfile, Data2tensor, seqPAD) are not defined in this chunk;
# they must come from code above this view — confirm.
vocab = Vocab(wl_th=None, wcutoff=5)  # no word-length threshold; cut words seen < 5 times
vocab.build(filename, idf_file=idf_file, firstline=False, limit=100000)
# Token-to-index mapper: map unknown words to UNK, no start/end markers.
word2idx = vocab.wd2idx(vocab_words=vocab.w2i, unk_words=True, se_words=False)
train_data = Txtfile(filename, firstline=False, word2idx=word2idx, limit=100000)

batch_size = 8
neg_sampling = 5  # noise chunks drawn per positive chunk
# Each minibatch carries batch_size groups of (1 positive + neg_sampling noise) chunks.
no_chunks = batch_size * (neg_sampling + 1)
train_iters = Vocab.minibatches(train_data, batch_size=no_chunks)

data = []
label = []
for inp_ids in train_iters:
    padded_inp, _ = seqPAD.pad_sequences(inp_ids, pad_tok=vocab.w2i[PADt])
    data_tensor = Data2tensor.idx2tensor(padded_inp)
    # shuffle chunks so the positive/noise roles are assigned at random
    perm_ids = torch.randperm(no_chunks)
    data_tensor = data_tensor[perm_ids]
    # regroup as (batch, 1 + neg_sampling, seq_len)
    data_tensor = data_tensor.view(batch_size, neg_sampling + 1, -1)
    inp_tensor = data_tensor[:, 0, :]     # positive inputs, one per batch row
    noise_tensor = data_tensor[:, 1:, :]  # negative (noise) samples
    break  # smoke test: inspect only the first minibatch

emb_size = len(vocab.w2i)
emb_dim = 100
# NOTE(review): the next statement is the tail of an enclosing function whose
# `def` line starts before this visible chunk; shown at its assumed original
# indentation — confirm against the full file.
    return label_prob, label_pred


if __name__ == "__main__":
    # Smoke-test driver for the character-level language-ID data pipeline:
    # build a char/label vocab from a CSV, then pad, tensorize, and
    # length-sort each minibatch.
    from data_utils import Data2tensor, Vocab, seqPAD, Csvfile
    filename = "/media/data/langID/small_scale/train.csv"
    vocab = Vocab(cl_th=None, cutoff=1, c_lower=False, c_norm=False)  # keep case/diacritics; no char-length threshold
    vocab.build([filename], firstline=False)
    word2idx = vocab.wd2idx(vocab.c2i)   # character-to-index mapper
    tag2idx = vocab.tag2idx(vocab.l2i)   # label-to-index mapper
    train_data = Csvfile(filename, firstline=False, word2idx=word2idx, tag2idx=tag2idx)
    train_iters = Vocab.minibatches(train_data, batch_size=10)
    data = []
    label_ids = []
    for words, labels in train_iters:
        data.append(words)
        label_ids.append(labels)
        # Pad to uniform length (word cap 1024, char cap 32), then tensorize.
        word_ids, sequence_lengths = seqPAD.pad_sequences(words, pad_tok=0, wthres=1024, cthres=32)
        w_tensor = Data2tensor.idx2tensor(word_ids)
        y_tensor = Data2tensor.idx2tensor(labels)
        # Sort the batch (presumably by descending length, for packed RNNs)
        # and keep the permutation needed to restore the original order.
        data_tensors = Data2tensor.sort_tensors(labels, word_ids, sequence_lengths)
        label_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors
# Smoke-test driver for the BPTT language-model data pipeline: build a vocab,
# index one training file, and slice the first minibatch into truncated-BPTT
# (input, target) windows.
# NOTE(review): `torch` is not imported in this chunk; it must be imported
# above this view — confirm.
from data_utils import Vocab, Txtfile, Data2tensor, seqPAD, PAD

cutoff = 5   # drop words occurring fewer than 5 times
wl_th = -1   # word-length threshold; -1 presumably means "no limit" — TODO confirm
batch_size = 16
bptt = 10    # truncated-backprop window length
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

data_files = ["../dataset/train.txt"]
vocab = Vocab(wl_th=wl_th, cutoff=cutoff)
vocab.build(data_files, firstline=False)
word2idx = vocab.wd2idx(vocab.w2i)      # word-to-index mapper
label2idx = vocab.tag2idx(vocab.l2i)    # label-to-index mapper
train_data = Txtfile(data_files[0], firstline=False, source2idx=word2idx, label2idx=label2idx)
# train_data = [sent[0] for sent in train_data]
train_batch = vocab.minibatches(train_data, batch_size=batch_size)

inpdata = []
outdata = []
for sent in train_batch:
    word_pad_ids, seq_lens = seqPAD.pad_sequences(sent, pad_tok=vocab.w2i[PAD])
    data_tensor = Data2tensor.idx2tensor(word_pad_ids)
    # Walk the time axis in steps of `bptt`, collecting (input, target) pairs;
    # the -1 leaves room for the one-step-shifted target.
    for i in range(0, data_tensor.size(1) - 1, bptt):
        data, target = vocab.bptt_batch(data_tensor, i, bptt)
        inpdata.append(data)
        outdata.append(target)
    # NOTE(review): this break's placement (outer vs. inner loop) is ambiguous
    # in the collapsed source; outer-loop assumed, matching the one-batch
    # smoke-test pattern used elsewhere in this file — confirm.
    break

rnn_type = "GRU"
ntoken = len(vocab.w2i)
ninp = 32
nhid = 64