import torch
from data_utils import Data2tensor, Vocab, seqPAD, Txtfile, PADt

if __name__ == "__main__":
    # Placeholder paths: these variables were used but not defined in the snippet.
    filename = "../dataset/train.txt"
    idf_file = "../dataset/train.idf"
    vocab = Vocab(wl_th=None, wcutoff=5)
    vocab.build(filename, idf_file=idf_file, firstline=False, limit=100000)

    word2idx = vocab.wd2idx(vocab_words=vocab.w2i,
                            unk_words=True,
                            se_words=False)

    train_data = Txtfile(filename,
                         firstline=False,
                         word2idx=word2idx,
                         limit=100000)

    batch_size = 8
    neg_sampling = 5
    # Each row of a minibatch needs 1 input chunk plus neg_sampling noise chunks.
    no_chunks = batch_size * (neg_sampling + 1)
    train_iters = Vocab.minibatches(train_data, batch_size=no_chunks)
    data = []
    label = []
    for inp_ids in train_iters:
        padded_inp, _ = seqPAD.pad_sequences(inp_ids, pad_tok=vocab.w2i[PADt])
        data_tensor = Data2tensor.idx2tensor(padded_inp)
        # shuffle chunks
        perm_ids = torch.randperm(no_chunks)
        data_tensor = data_tensor[perm_ids]
        # Regroup so that each row holds 1 input chunk followed by neg_sampling noise chunks.
        data_tensor = data_tensor.view(batch_size, neg_sampling + 1, -1)
        inp_tensor = data_tensor[:, 0, :]     # input chunks:  [batch_size, seq_len]
        noise_tensor = data_tensor[:, 1:, :]  # noise chunks:  [batch_size, neg_sampling, seq_len]
        break

    emb_size = len(vocab.w2i)
    emb_dim = 100
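    # Hedged sketch (not part of the original example): one way the chunk
    # tensors above could feed a negative-sampling objective. nn.Embedding and
    # F.logsigmoid are standard PyTorch; the mean-pooled scorer and the
    # `context` vector are illustrative assumptions, not the repository's model.
    import torch.nn as nn
    import torch.nn.functional as F

    emb = nn.Embedding(emb_size, emb_dim, padding_idx=vocab.w2i[PADt])
    context = nn.Parameter(torch.randn(emb_dim))   # hypothetical scoring vector

    inp_vec = emb(inp_tensor).mean(dim=1)          # [batch_size, emb_dim]
    noise_vec = emb(noise_tensor).mean(dim=2)      # [batch_size, neg_sampling, emb_dim]

    pos_score = inp_vec @ context                  # [batch_size]
    neg_score = noise_vec @ context                # [batch_size, neg_sampling]

    # NCE-style loss: input chunks should score high, shuffled noise chunks low.
    loss = -(F.logsigmoid(pos_score).mean() + F.logsigmoid(-neg_score).mean())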
Example #2


if __name__ == "__main__":
    from data_utils import Data2tensor, Vocab, seqPAD, Csvfile
    filename = "/media/data/langID/small_scale/train.csv"
    vocab = Vocab(cl_th=None, cutoff=1, c_lower=False, c_norm=False)
    vocab.build([filename], firstline=False)
    word2idx = vocab.wd2idx(vocab.c2i)
    tag2idx = vocab.tag2idx(vocab.l2i)
    train_data = Csvfile(filename,
                         firstline=False,
                         word2idx=word2idx,
                         tag2idx=tag2idx)

    train_iters = Vocab.minibatches(train_data, batch_size=10)
    data = []
    label_ids = []
    for words, labels in train_iters:
        data.append(words)
        label_ids.append(labels)
        word_ids, sequence_lengths = seqPAD.pad_sequences(words,
                                                          pad_tok=0,
                                                          wthres=1024,
                                                          cthres=32)

    # Note: word_ids, labels, and sequence_lengths below hold the values
    # from the last minibatch of the loop above.
    w_tensor = Data2tensor.idx2tensor(word_ids)
    y_tensor = Data2tensor.idx2tensor(labels)

    data_tensors = Data2tensor.sort_tensors(labels, word_ids, sequence_lengths)
    label_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors
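    # Hedged sketch (assumption, not from the original example): the usual
    # reason for length-sorting a batch is RNN packing, which requires
    # descending lengths; word_seq_recover then restores the original order.
    # Assumes sort_tensors returns tensors and embedding_dim/hidden_size
    # values here are arbitrary.
    import torch.nn as nn

    embedding = nn.Embedding(len(vocab.c2i), 50)
    rnn = nn.LSTM(input_size=50, hidden_size=100, batch_first=True)

    packed = nn.utils.rnn.pack_padded_sequence(embedding(word_tensor),
                                               sequence_lengths.cpu(),
                                               batch_first=True)
    rnn_out, _ = rnn(packed)
    unpacked, _ = nn.utils.rnn.pad_packed_sequence(rnn_out, batch_first=True)
    unpacked = unpacked[word_seq_recover]   # back to the original sentence order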
Example #3
import torch

if __name__ == "__main__":
    from data_utils import Vocab, Txtfile, Data2tensor, seqPAD, PAD
    cutoff = 5
    wl_th = -1
    batch_size = 16
    bptt = 10

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data_files = ["../dataset/train.txt"]
    vocab = Vocab(wl_th=wl_th, cutoff=cutoff)
    vocab.build(data_files, firstline=False)
    word2idx = vocab.wd2idx(vocab.w2i)
    label2idx = vocab.tag2idx(vocab.l2i)

    train_data = Txtfile(data_files[0], firstline=False, source2idx=word2idx, label2idx=label2idx)
    # train_data = [sent[0] for sent in train_data]
    train_batch = vocab.minibatches(train_data, batch_size=batch_size)
    inpdata = []
    outdata = []
    for batch in train_batch:
        word_pad_ids, seq_lens = seqPAD.pad_sequences(batch, pad_tok=vocab.w2i[PAD])
        data_tensor = Data2tensor.idx2tensor(word_pad_ids)
        # Slice the padded batch into back-propagation-through-time (bptt) windows.
        for i in range(0, data_tensor.size(1) - 1, bptt):
            data, target = vocab.bptt_batch(data_tensor, i, bptt)
            inpdata.append(data)
            outdata.append(target)
        break

    rnn_type = "GRU"
    ntoken = len(vocab.w2i)
    ninp = 32
    nhid = 64
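    # Hedged sketch (assumption): the hyperparameters above suggest a
    # word-level RNN language model, embedding -> GRU -> vocabulary decoder.
    # The class below is illustrative, not the repository's model, and it
    # assumes bptt_batch yields [batch, bptt] index tensors.
    import torch.nn as nn

    class RNNLM(nn.Module):
        def __init__(self, rnn_type, ntoken, ninp, nhid):
            super().__init__()
            self.encoder = nn.Embedding(ntoken, ninp)
            self.rnn = getattr(nn, rnn_type)(ninp, nhid, batch_first=True)
            self.decoder = nn.Linear(nhid, ntoken)

        def forward(self, inp, hidden=None):
            output, hidden = self.rnn(self.encoder(inp), hidden)
            return self.decoder(output), hidden

    model = RNNLM(rnn_type, ntoken, ninp, nhid).to(device)
    logits, hidden = model(inpdata[0].to(device))   # [batch, bptt, ntoken]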