def build_data(args):
    print("Building dataset...")
    if not os.path.exists(args.model_dir):
        os.mkdir(args.model_dir)
    vocab = Vocab(wl_th=args.wl_th, wcutoff=args.wcutoff)
    vocab.build(fname=args.train_file, idf_file=args.idf_file, firstline=False, limit=args.sent_limit)
    args.vocab = vocab
    if args.word_emb_file is not None:
        scale = np.sqrt(3.0 / args.word_dim)
        args.word_pretrained = Embeddings.get_W(args.word_emb_file, args.word_dim, vocab.w2i, scale)
    else:
        args.word_pretrained = None
    if os.path.exists(args.idf_file):
        print("Load idf file ...")
        args.idf_embs = Embeddings.get_W(args.idf_file, 1, vocab.w2i, 0)
    else:
        args.idf_embs = None
    SaveloadHP.save(args, os.path.join(args.model_dir, args.model_args))
    return args
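# --- Usage sketch (illustrative assumption, not the project's actual CLI) ---
# One way build_data() could be driven from argparse; the flag names below
# simply mirror the attributes the function reads and are not confirmed by the
# original code.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Build the dataset, vocabulary and embeddings")
    parser.add_argument("--train_file", default="./dataset/train.txt")
    parser.add_argument("--idf_file", default="./idf.txt")
    parser.add_argument("--model_dir", default="./models")
    parser.add_argument("--model_args", default="model_args.pkl")
    parser.add_argument("--word_emb_file", default=None)
    parser.add_argument("--word_dim", type=int, default=50)
    parser.add_argument("--wl_th", type=int, default=-1)
    parser.add_argument("--wcutoff", type=int, default=5)
    parser.add_argument("--sent_limit", type=int, default=-1)
    args = parser.parse_args()
    args = build_data(args)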
    def inference(self, label_score, k=1):
        if self.num_labels > 2:
            # multi-class head: softmax over labels, then keep the top-k predictions
            label_prob = F.softmax(label_score, dim=-1)
            label_prob, label_pred = label_prob.data.topk(k)
        else:
            # binary head: sigmoid probability thresholded at 0.5
            label_prob = torch.sigmoid(label_score.squeeze())
            label_pred = (label_prob >= 0.5).data.long()
        return label_prob, label_pred


if __name__ == "__main__":
    from data_utils import Data2tensor, Vocab, seqPAD, Csvfile

    filename = "/media/data/langID/small_scale/train.csv"
    vocab = Vocab(cl_th=None, cutoff=1, c_lower=False, c_norm=False)
    vocab.build([filename], firstline=False)
    word2idx = vocab.wd2idx(vocab.c2i)
    tag2idx = vocab.tag2idx(vocab.l2i)
    train_data = Csvfile(filename, firstline=False, word2idx=word2idx, tag2idx=tag2idx)
    train_iters = Vocab.minibatches(train_data, batch_size=10)
    data = []
    label_ids = []
    for words, labels in train_iters:
        data.append(words)
        label_ids.append(labels)
        word_ids, sequence_lengths = seqPAD.pad_sequences(words, pad_tok=0,
        decoded_scores = self.scorer_layer(h_n_drop)
        # YOUR CODE ENDS HERE
        #######################
        return decoded_scores, rec_hidden, rec_output


if __name__ == '__main__':
    from data_utils import Vocab, Txtfile, Data2tensor, seqPAD, PAD

    cutoff = 5
    wl_th = -1
    batch_size = 16
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    data_files = ["../dataset/train.small.txt"]
    vocab = Vocab(wl_th=wl_th, cutoff=cutoff)
    vocab.build(data_files, firstline=False)
    word2idx = vocab.wd2idx(vocab.w2i)
    label2idx = vocab.tag2idx(vocab.l2i)

    rec_type = "LSTM"
    ntoken = len(vocab.w2i)
    nlabels = len(vocab.l2i)
    emb_size = 50
    hidden_size = 64
    nlayers = 2
    dropout = 0.5
    bidirect = False
    # embedding_matrix = create_embedding_matrix(vocab, ntoken, emb_size)
    # print(embedding_matrix[5])
    # embedding = nn.Embedding.from_pretrained(embedding_matrix)
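    # --- Illustrative sketch (assumption, not the project's model class) ----
    # A minimal embedding -> LSTM -> linear scorer wired from the
    # hyper-parameters above and run on a dummy batch, just to make the tensor
    # shapes behind decoded_scores concrete. It assumes bidirect stays False,
    # as set above.
    import torch.nn as nn

    toy_emb = nn.Embedding(ntoken, emb_size)
    toy_rnn = nn.LSTM(emb_size, hidden_size, num_layers=nlayers,
                      dropout=dropout, bidirectional=bidirect, batch_first=True)
    toy_scorer = nn.Linear(hidden_size, nlabels)

    dummy_ids = torch.randint(0, ntoken, (batch_size, 10))  # (batch, seq_len)
    toy_out, (toy_hn, toy_cn) = toy_rnn(toy_emb(dummy_ids))  # toy_out: (batch, seq_len, hidden_size)
    dummy_scores = toy_scorer(toy_hn[-1])                    # (batch, nlabels)
    print(dummy_scores.size())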
    def forward(self, word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover):
        # (batch_size, sequence_len, hidden_dim)
        rnn_out = self.lstm.get_all_atthiddens(word_inputs, word_seq_lengths,
                                               char_inputs, char_seq_lengths, char_seq_recover)
        # (batch_size, sequence_len, num_labels+2)
        label_score = self.hidden2tag(rnn_out)
        label_score = self.dropfinal(label_score)
        return label_score


if __name__ == "__main__":
    from data_utils import Data2tensor, Vocab, seqPAD, CoNLLDataset

    train_file = '/media/data/NER/conll03/conll03/train.bmes'
    dev_file = '/media/data/NER/conll03/conll03/dev.bmes'
    test_file = '/media/data/NER/conll03/conll03/test.bmes'
    vocab = Vocab(cutoff=1, wl_th=None, cl_th=None, w_lower=False, w_norm=False, c_lower=False, c_norm=False)
    vocab.build([train_file, dev_file, test_file])
    word2idx = vocab.wd2idx(vocab_words=vocab.w2i, vocab_chars=vocab.c2i, allow_unk=True, start_end=True)
    tag2idx = vocab.tag2idx(vocab_tags=vocab.l2i, start_end=True)
    train_data = CoNLLDataset(train_file, word2idx=word2idx, tag2idx=tag2idx)
    train_iters = Vocab.minibatches(train_data, batch_size=10)
    data = []
    label_ids = []
    for words, labels in train_iters:
        char_ids, word_ids = zip(*words)
        data.append(words)
        word_ids, sequence_lengths = seqPAD.pad_sequences(word_ids, pad_tok=0, wthres=1024, cthres=32)
        char_ids, word_lengths = seqPAD.pad_sequences(char_ids, pad_tok=0, nlevels=2, wthres=1024, cthres=32)
        label_ids, label_lengths = seqPAD.pad_sequences(labels, pad_tok=0, wthres=1024, cthres=32)
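        # --- Toy illustration of the padding step above (an assumption about
        # its behaviour, not the seqPAD implementation): sequences are
        # right-padded with pad_tok to the longest length in the batch, and
        # the true lengths are kept so the model can ignore the padding later.
        toy_batch = [[4, 9, 2], [7, 1], [3, 5, 8, 6]]
        toy_maxlen = max(len(s) for s in toy_batch)
        toy_padded = [s + [0] * (toy_maxlen - len(s)) for s in toy_batch]
        toy_lengths = [len(s) for s in toy_batch]
        # toy_padded  -> [[4, 9, 2, 0], [7, 1, 0, 0], [3, 5, 8, 6]]
        # toy_lengths -> [3, 2, 4]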
        abs_distance = torch.max(distance, torch.zeros_like(distance))
        ranking = abs_distance.sum(-1)
        reg = self.regularized()
        return ranking.mean() + reg


if __name__ == "__main__":
    import random
    from data_utils import Data2tensor, Vocab, seqPAD, Txtfile, PADt, Embeddings

    Data2tensor.set_randseed(1234)
    use_cuda = torch.cuda.is_available()
    filename = "/media/data/restaurants/yelp_dataset/processed/extracted_rev/yelp_data_rev.pro.txt"
    idf_file = "./idf.txt"
    vocab = Vocab(wl_th=None, wcutoff=5)
    vocab.build(filename, idf_file=idf_file, firstline=False, limit=100000)
    word2idx = vocab.wd2idx(vocab_words=vocab.w2i, unk_words=True, se_words=False)
    train_data = Txtfile(filename, firstline=False, word2idx=word2idx, limit=100000)

    batch_size = 8
    neg_sampling = 5
    no_chunks = batch_size * (neg_sampling + 1)
    train_iters = Vocab.minibatches(train_data, batch_size=no_chunks)
    data = []
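    # --- Worked illustration of the hinge loss above (toy numbers, assumed) --
    # Assuming `distance` holds margin-style scores for positive/negative
    # pairs, the elementwise max with zero keeps only the violations; satisfied
    # pairs contribute nothing, so the batch loss is the mean violation plus
    # the regularization term.
    toy_distance = torch.tensor([[0.7, -0.2, 1.3], [-0.5, -0.1, 0.4]])
    toy_hinge = torch.max(toy_distance, torch.zeros_like(toy_distance))  # clamp negatives to 0
    toy_ranking = toy_hinge.sum(-1)  # per-example total violation: tensor([2.0, 0.4])
    print(toy_ranking.mean())        # mean violation; self.regularized() would be added in the model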