Example 1
    def build_data(args):
        # Build the vocabulary from the training corpus, attach it (plus any
        # pre-trained word/idf embeddings) to args, and save the arguments.
        print("Building dataset...")
        if not os.path.exists(args.model_dir):
            os.mkdir(args.model_dir)
        vocab = Vocab(wl_th=args.wl_th, wcutoff=args.wcutoff)

        vocab.build(fname=args.train_file,
                    idf_file=args.idf_file,
                    firstline=False,
                    limit=args.sent_limit)
        args.vocab = vocab
        if args.word_emb_file is not None:
            scale = np.sqrt(3.0 / args.word_dim)
            args.word_pretrained = Embeddings.get_W(args.word_emb_file,
                                                    args.word_dim, vocab.w2i,
                                                    scale)
        else:
            args.word_pretrained = None

        if os.path.exists(args.idf_file):
            print("Load idf file ...")
            args.idf_embs = Embeddings.get_W(args.idf_file, 1, vocab.w2i, 0)
        else:
            args.idf_embs = None

        SaveloadHP.save(args, os.path.join(args.model_dir, args.model_args))
        return args
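
The `scale = np.sqrt(3.0 / args.word_dim)` line follows the common uniform-initialization bound for word embeddings: vectors for words missing from the pre-trained file are drawn from U(-scale, scale) so their variance is comparable to the loaded vectors. `Embeddings.get_W` itself is not shown above; the following is a minimal sketch of what such a loader typically does, with the function name `load_pretrained` invented for illustration:

import numpy as np

def load_pretrained(emb_file, word_dim, w2i, scale):
    # Hypothetical loader mirroring the role of Embeddings.get_W: start from
    # uniform random vectors, then overwrite rows for words found in the file.
    W = np.random.uniform(-scale, scale, (len(w2i), word_dim)).astype("float32")
    with open(emb_file, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split()
            word, values = parts[0], parts[1:]
            if word in w2i and len(values) == word_dim:
                W[w2i[word]] = np.asarray(values, dtype="float32")
    return W
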
Example 2
    def inference(self, label_score, k=1):
        # Multi-class head: softmax over the scores, return the top-k probabilities
        # and their label indices. Binary head: sigmoid probability, threshold at 0.5.
        if self.num_labels > 2:
            label_prob = F.softmax(label_score, dim=-1)
            label_prob, label_pred = label_prob.data.topk(k)
        else:
            label_prob = torch.sigmoid(label_score.squeeze())
            label_pred = (label_prob >= 0.5).data.long()
        return label_prob, label_pred


if __name__ == "__main__":
    from data_utils import Data2tensor, Vocab, seqPAD, Csvfile
    filename = "/media/data/langID/small_scale/train.csv"
    vocab = Vocab(cl_th=None, cutoff=1, c_lower=False, c_norm=False)
    vocab.build([filename], firstline=False)
    word2idx = vocab.wd2idx(vocab.c2i)
    tag2idx = vocab.tag2idx(vocab.l2i)
    train_data = Csvfile(filename,
                         firstline=False,
                         word2idx=word2idx,
                         tag2idx=tag2idx)

    train_iters = Vocab.minibatches(train_data, batch_size=10)
    data = []
    label_ids = []
    for words, labels in train_iters:
        data.append(words)
        label_ids.append(labels)
        word_ids, sequence_lengths = seqPAD.pad_sequences(words,
                                                          pad_tok=0)
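
The `inference` method above returns the top-k softmax probabilities and label indices when there are more than two labels, and a sigmoid probability with a 0.5-thresholded 0/1 prediction for a binary head. A standalone sketch of the same decision rule on dummy scores (the helper name `infer` is an assumption, not part of the repository):

import torch
import torch.nn.functional as F

def infer(label_score, num_labels, k=1):
    # Same decision rule as inference() above, without the model wrapper.
    if num_labels > 2:
        label_prob = F.softmax(label_score, dim=-1)
        return label_prob.topk(k)                    # (top-k probs, label ids)
    label_prob = torch.sigmoid(label_score.squeeze())
    return label_prob, (label_prob >= 0.5).long()

print(infer(torch.randn(4, 5), num_labels=5))   # multi-class: top-1 per row
print(infer(torch.randn(4, 1), num_labels=2))   # binary: probabilities and 0/1 labels
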
Example 3
        decoded_scores = self.scorer_layer(h_n_drop)
        # YOUR CODE ENDS HERE
        #######################
        return decoded_scores, rec_hidden, rec_output


if __name__ == '__main__':
    from data_utils import Vocab, Txtfile, Data2tensor, seqPAD, PAD
    cutoff = 5
    wl_th = -1
    batch_size = 16

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data_files = ["../dataset/train.small.txt"]
    vocab = Vocab(wl_th=wl_th, cutoff=cutoff)
    vocab.build(data_files, firstline=False)
    word2idx = vocab.wd2idx(vocab.w2i)
    label2idx = vocab.tag2idx(vocab.l2i)

    rec_type = "LSTM"
    ntoken = len(vocab.w2i)
    nlabels = len(vocab.l2i)
    emb_size = 50
    hidden_size = 64
    nlayers = 2
    dropout = 0.5
    bidirect = False

    #embedding_matrix=create_embedding_matrix(vocab,ntoken,emb_size)
    #print(embedding_matrix[5])
    #embedding = nn.Embedding.from_pretrained(embedding_matrix)
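
The class that produces `decoded_scores` is not included in the snippet above. Assuming the hyperparameters set in the `__main__` block (embedding size, LSTM hidden size, number of layers, dropout, optional bidirectionality), a minimal sketch of such a recurrent classifier could look as follows; the class name `RecClassifier` and the exact wiring are assumptions, not the repository's code:

import torch
import torch.nn as nn

class RecClassifier(nn.Module):
    # Embedding -> LSTM -> dropout on the final hidden state -> linear scorer.
    def __init__(self, ntoken, nlabels, emb_size, hidden_size, nlayers,
                 dropout=0.5, bidirect=False):
        super().__init__()
        self.embedding = nn.Embedding(ntoken, emb_size)
        self.rnn = nn.LSTM(emb_size, hidden_size, nlayers, batch_first=True,
                           dropout=dropout, bidirectional=bidirect)
        self.drop = nn.Dropout(dropout)
        out_dim = hidden_size * (2 if bidirect else 1)
        self.scorer_layer = nn.Linear(out_dim, nlabels)

    def forward(self, word_ids):
        emb = self.embedding(word_ids)              # (batch, seq_len, emb_size)
        rec_output, (h_n, c_n) = self.rnn(emb)      # h_n: (nlayers * ndirs, batch, hidden)
        if self.rnn.bidirectional:
            last = torch.cat([h_n[-2], h_n[-1]], dim=-1)   # forward + backward states
        else:
            last = h_n[-1]                                  # last layer's final state
        decoded_scores = self.scorer_layer(self.drop(last))
        return decoded_scores, (h_n, c_n), rec_output
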
Example 4
    def forward(self, word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover):
        # (batch_size,sequence_len,hidden_dim)
        rnn_out = self.lstm.get_all_atthiddens(word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover)
        # (batch_size,sequence_len,num_labels+2)
        label_score = self.hidden2tag(rnn_out)
        label_score = self.dropfinal(label_score)
        return label_score
    
if __name__ == "__main__":
    from data_utils import Data2tensor, Vocab, seqPAD, CoNLLDataset
    train_file = '/media/data/NER/conll03/conll03/train.bmes'
    dev_file = '/media/data/NER/conll03/conll03/dev.bmes'
    test_file = '/media/data/NER/conll03/conll03/test.bmes'
    vocab = Vocab(cutoff=1, wl_th=None, cl_th=None, w_lower=False, w_norm=False, c_lower=False, c_norm=False)
    vocab.build([train_file, dev_file, test_file])
    
    
    word2idx = vocab.wd2idx(vocab_words=vocab.w2i, vocab_chars=vocab.c2i, allow_unk=True, start_end=True)
    tag2idx = vocab.tag2idx(vocab_tags=vocab.l2i, start_end=True)
    train_data = CoNLLDataset(train_file, word2idx=word2idx, tag2idx=tag2idx)
    train_iters = Vocab.minibatches(train_data, batch_size=10)
    data = []
    label_ids = []
    for words, labels in train_iters:
        char_ids, word_ids = zip(*words)
        data.append(words)
        word_ids, sequence_lengths = seqPAD.pad_sequences(word_ids, pad_tok=0, wthres=1024, cthres=32)
        char_ids, word_lengths = seqPAD.pad_sequences(char_ids, pad_tok=0, nlevels=2, wthres=1024, cthres=32)
        label_ids, label_lengths = seqPAD.pad_sequences(labels, pad_tok=0, wthres=1024, cthres=32)
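
In the forward pass of Example 4, `hidden2tag` is a per-token projection from the encoder's hidden size to `num_labels + 2` scores; the two extra columns are conventionally reserved for CRF START/STOP transitions. A minimal shape-level sketch, with the sizes chosen only for illustration:

import torch
import torch.nn as nn

batch_size, seq_len, hidden_dim, num_labels = 10, 35, 200, 9

hidden2tag = nn.Linear(hidden_dim, num_labels + 2)
dropfinal = nn.Dropout(0.5)

rnn_out = torch.randn(batch_size, seq_len, hidden_dim)   # stand-in for the LSTM output
label_score = dropfinal(hidden2tag(rnn_out))             # (batch_size, seq_len, num_labels + 2)
print(label_score.shape)                                 # torch.Size([10, 35, 11])
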
    
Example 5
        abs_distance = torch.max(distance, torch.zeros_like(distance))  # hinge: clip negative margins at zero
        ranking = abs_distance.sum(-1)  # sum hinge terms over negative samples
        reg = self.regularized()
        return ranking.mean() + reg


if __name__ == "__main__":
    import random
    from data_utils import Data2tensor, Vocab, seqPAD, Txtfile, PADt, Embeddings
    Data2tensor.set_randseed(1234)
    use_cuda = torch.cuda.is_available()
    filename = "/media/data/restaurants/yelp_dataset/processed/extracted_rev/yelp_data_rev.pro.txt"
    idf_file = "./idf.txt"

    vocab = Vocab(wl_th=None, wcutoff=5)
    vocab.build(filename, idf_file=idf_file, firstline=False, limit=100000)

    word2idx = vocab.wd2idx(vocab_words=vocab.w2i,
                            unk_words=True,
                            se_words=False)

    train_data = Txtfile(filename,
                         firstline=False,
                         word2idx=word2idx,
                         limit=100000)

    batch_size = 8
    neg_sampling = 5
    no_chunks = batch_size * (neg_sampling + 1)
    train_iters = Vocab.minibatches(train_data, batch_size=no_chunks)
    data = []
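
The loss fragment at the top of Example 5 is a hinge-style ranking objective: `distance` is clipped at zero element-wise, summed over the last dimension (one entry per negative sample), averaged over the batch, and a regularization term is added. A self-contained sketch of that pattern, where `margin`, `pos_score` and `neg_score` are assumptions used only to make the example runnable:

import torch

def ranking_loss(pos_score, neg_score, margin=1.0, reg=0.0):
    # Positive examples should outscore negatives by at least `margin`;
    # violations contribute a positive hinge term, satisfied pairs contribute zero.
    distance = margin - pos_score.unsqueeze(-1) + neg_score
    abs_distance = torch.max(distance, torch.zeros_like(distance))  # clip at zero
    ranking = abs_distance.sum(-1)                                   # sum over negatives
    return ranking.mean() + reg

pos = torch.randn(8)       # one positive score per training chunk
neg = torch.randn(8, 5)    # neg_sampling = 5 negatives per chunk
print(ranking_loss(pos, neg))
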