def __init__(self, num_labels):
    super(QuestionClassifier, self).__init__()
    # n_hidden = 256
    # a single linear layer maps the bag-of-words sentence vector to label logits
    self.f1 = nn.Linear(int(conf.get("param", "word_embedding_dim")), num_labels)
    # self.f2 = nn.Linear(n_hidden, num_labels)
    # cast all parameters to float64 to match the numpy-built input vectors
    self.double()
    # loss
    self.loss_function = nn.CrossEntropyLoss()
    # optimizer
    self.optimizer = optim.Adam(self.parameters(), lr=float(conf.get("param", "lr_param")))
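# The class's forward pass is not shown in this excerpt. A minimal sketch of
# what it presumably looks like for this single-linear-layer classifier (the
# body below is an assumption, not the authors' code):
def forward(self, x):
    # CrossEntropyLoss expects raw logits, so no softmax is applied here
    return self.f1(x)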
def train():
    (train_sentence_vectors, train_labels, dev_sentence_vectors, dev_labels,
     test_sentence_vectors, test_labels) = sentence_vector.bag_of_word_sentences(
        type=conf.get("param", "word_embedding_type"), freeze=True)
    output_size = len(set(train_labels))
    model = QuestionClassifier(output_size)
    # save test data on the model so test() can reuse it after loading
    model.test_vecs = test_sentence_vectors
    model.test_label = test_labels
    model.label_to_ix = gv.label_to_ix
    for epoch in range(int(conf.get("param", "epoch"))):
        model.train_model(train_sentence_vectors, train_labels)
        # validate the model
        acc = model.test_model(dev_sentence_vectors, dev_labels)
        print('epoch:', epoch, ' dev_acc: ', acc)
    torch.save(model, conf.get("param", "path_model"))
def test():
    _, _, _, label2idx = process_train_set('../data/train.txt')
    idx2label = dict(zip(label2idx.values(), label2idx.keys()))
    model = torch.load(conf.get('param', 'path_model'))
    model.to('cpu')
    acc, pre_label = model.test_model(model.test_vecs, model.test_label)
    print('test_acc: ', acc)
    with open('../data/test.txt', 'r') as f:
        data = f.readlines()
    labels = []
    sentences = []
    for line in data:
        s = line.split(' ', maxsplit=1)
        labels.append(s[0])
        sentences.append(s[1][:-1])  # drop the trailing newline
    with open(gv.conf.get('param', 'path_eval_result'), "w") as f:
        lines = ['Question Correct Label Predict Label\n']
        for i in range(len(sentences)):
            line = [sentences[i], labels[i], idx2label[int(pre_label[i])]]
            lines.append(' '.join(line) + '\n')
        f.writelines(lines)
def train():
    bilstm_test.train_Bilstm()
    train_sentence_vectors, train_labels = readFile("../data/train_.txt")
    dev_sentence_vectors, dev_labels = readFile("../data/dev_.txt")
    test_sentence_vectors, test_labels = readFile("../data/test_.txt")
    output_size = len(set(train_labels))
    model = QuestionClassifier(output_size)
    model.test_vecs = test_sentence_vectors
    model.test_label = test_labels
    for epoch in range(int(conf.get("param", "epoch"))):
        model.train_model(train_sentence_vectors, train_labels)
        acc, labels = model.test_model(dev_sentence_vectors, dev_labels)
        print('epoch:', epoch, 'dev_acc: ', acc)
    torch.save(model, conf.get("param", "path_model"))
def test():
    model = torch.load(conf.get('param', 'path_model'))
    model.to('cpu')
    # test the model
    acc = model.test_model(model.test_vecs, model.test_label, output_predict=True)
    print('test_acc: ', acc)
def make_bow_vector(tokens, wordToIdx, wordVec):
    # average the embeddings of all in-vocabulary tokens (bag of words)
    vec = np.zeros(int(conf.get("param", "word_embedding_dim")))
    count = 0
    for word in tokens:
        if word in wordToIdx:
            vec += wordVec[wordToIdx[word]]
            count += 1
    if count > 0:  # guard against sentences with no in-vocabulary tokens
        vec = vec / count
    vec = torch.from_numpy(vec)
    return vec.view(1, -1)
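# A usage sketch for make_bow_vector. The vocabulary comes from
# randomly_initialised_vectors (defined later in this file); the token lists
# here are illustrative only.
wordVec, wordToIdx = randomly_initialised_vectors(['what', 'is', 'capital'], threshold=0)
bow = make_bow_vector(['what', 'is', 'the', 'capital'], wordToIdx, wordVec)
print(bow.shape)  # torch.Size([1, word_embedding_dim])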
def to_dataloader(feature, labels):
    labels = np.array(labels)
    train_data = TensorDataset(torch.from_numpy(feature), torch.from_numpy(labels))
    # dataloaders
    batch_size = int(conf.get('param', 'batch_size'))
    # shuffle the training data every epoch
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    return train_loader
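# Usage sketch for to_dataloader: `feature` must already be a numpy array,
# e.g. the padded id matrix produced by padding_feature; the shapes and labels
# below are illustrative only.
loader = to_dataloader(np.zeros((4, 10), dtype=np.int64), [0, 1, 2, 3])
for batch_features, batch_labels in loader:
    pass  # each iteration yields one shuffled mini-batch of size batch_size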
def get_pre_train_vector():
    '''
    To generate vectors of tokens from the pre_train model.
    return:
        word_vectors: the vectors of words
        wordToIx: a map whose key is the word and whose value is its corresponding index.
    '''
    print('Please wait, pre-train...')
    sentences = word2vec.preprocessing.get_preprocessed_sentences()
    sorted_words = word2vec.preprocessing.make_vocabulary(sentences)
    word_idx, idx_word = word2vec.create_dict(sorted_words)
    # replace each token with its vocabulary index before training
    sentences_in_idx = word2vec.replace_words_with_idx(sentences, word_idx)
    word_to_vec = word2vec.train(len(sorted_words),
                                 int(conf.get('param', 'word_embedding_dim')),
                                 sentences_in_idx, idx_word)
    return word_to_vec, word_idx
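# `word2vec.create_dict` is project code not shown in this excerpt. A minimal
# sketch of the two mappings it presumably builds (word -> index and
# index -> word); the body is an assumption:
def create_dict(sorted_words):
    word_idx = {w: i for i, w in enumerate(sorted_words)}
    idx_word = {i: w for w, i in word_idx.items()}
    return word_idx, idx_word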
def get_train_dev():
    # open the data file
    path = os.path.join(os.getcwd(), "..", "data", "train_5500.txt")
    with open(path) as f:
        lines = f.readlines()
    # split it 9:1 into train and dev
    train, dev = random_split(lines, shuffle=True, ratio=0.9)
    # write the train split into train.txt
    with open(conf.get('param', 'path_train'), 'w') as file:
        for line in train:
            file.write(line)
    # write the dev split into dev.txt
    with open(conf.get('param', 'path_dev'), 'w') as file:
        for line in dev:
            file.write(line)
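# `random_split` above is a project helper, not torch.utils.data.random_split.
# A minimal sketch consistent with the call site, assuming `ratio` is the
# train fraction (the body is an assumption):
import random

def random_split(lines, shuffle=True, ratio=0.9):
    lines = list(lines)
    if shuffle:
        random.shuffle(lines)
    cut = int(len(lines) * ratio)
    return lines[:cut], lines[cut:]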
def train_Bilstm():
    train_int_word, train_int_label, word2idx, label2idx = process_train_set(
        conf.get('param', 'path_train'))
    test_int_word, test_int_label = process_new_dataset(
        word2idx, label2idx, conf.get('param', 'path_dev'))
    seq_length = 10
    feature_matrix = padding_feature(train_int_word, seq_length)
    feature_matrix_dev = padding_feature(test_int_word, seq_length)
    train_loader = to_dataloader(feature_matrix, train_int_label)
    test_loader = to_dataloader(feature_matrix_dev, test_int_label)
    rnn_ = BiLSTMTagger(len(word2idx), int(conf.get("param", "word_embedding_dim")), 100)
    # bool() on a non-empty config string is always True, so parse the
    # "True"/"False" options with getboolean instead
    if conf.getboolean('param', 'pre_train'):
        word2vec = w2v.read_word2vec(conf.get("param", "path_pre_emb"))
        pretrained_embedding = torch.tensor(word2vec)
        print('pretrained_embedding:', pretrained_embedding.shape)
        # from_pretrained is a classmethod returning a new layer; assign it back,
        # otherwise the pretrained weights are silently discarded
        rnn_.embedding = nn.Embedding.from_pretrained(
            pretrained_embedding, freeze=conf.getboolean('param', 'freeze'))
    device = 'cpu'
    optimizer = optim.Adam(rnn_.parameters(), lr=float(conf.get("param", "lr_param")))
    criterion = nn.CrossEntropyLoss().to(device)
    rnn_.to(device)
    for epoch in range(int(conf.get("param", "epoch"))):
        cells = train_another_new(rnn_, train_loader, optimizer, criterion)
        # project-level eval function (shadows the builtin of the same name)
        eval(rnn_, test_loader, criterion, patience=int(conf.get('param', 'early_stopping')))
    torch.save(rnn_, '../data/word2vec_Bilstm_3.pkl')
    classifier.vector_file()
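# `padding_feature` is referenced above but not shown. A minimal sketch,
# assuming each id sequence is truncated/left-padded with 0 to seq_length
# (the padding side is an assumption):
def padding_feature(int_sentences, seq_length):
    features = np.zeros((len(int_sentences), seq_length), dtype=np.int64)
    for i, ids in enumerate(int_sentences):
        ids = ids[:seq_length]
        if len(ids) > 0:
            features[i, -len(ids):] = ids
    return features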
def randomly_initialised_vectors(tokens=None, threshold=None):
    wordCountDict = dict(zip(*np.unique(tokens, return_counts=True)))
    # iterate over a copy of the keys so entries can be deleted while looping
    for k in list(wordCountDict.keys()):
        if wordCountDict[k] < threshold:
            del wordCountDict[k]
    wordToIx = {'UNK': 0}
    i = 1
    for key in wordCountDict.keys():
        wordToIx[key] = i
        i = i + 1
    # one random vector per vocabulary entry (including UNK)
    word_vectors = []
    for _ in wordToIx:
        word_vectors.append(np.random.random(int(conf.get("param", "word_embedding_dim"))))
    word_vectors = np.array(word_vectors)
    return word_vectors, wordToIx
def bag_of_word_sentences(type='randomly', freeze=True):
    if type not in ['randomly', 'pre_train']:
        return
    train_labels, train_sentences = sentence_processing(conf.get('param', 'path_train'))
    dev_labels, dev_sentences = sentence_processing(conf.get('param', 'path_dev'))
    test_labels, test_sentences = sentence_processing(conf.get('param', 'path_test'))
    train_sentences = lower_first_letter(train_sentences, conf.get('param', 'lowercase'))
    test_sentences = lower_first_letter(test_sentences, conf.get('param', 'lowercase'))
    dev_sentences = lower_first_letter(dev_sentences, conf.get('param', 'lowercase'))
    read_stop = read_stoplist()
    train_tokens, train_token_of_sentences = tokenization(train_sentences, read_stop)
    dev_tokens, dev_token_of_sentences = tokenization(dev_sentences, read_stop)
    test_tokens, test_token_of_sentences = tokenization(test_sentences, read_stop)
    wordVec, wordToIdx = get_word_embedding(tokens=train_tokens, type=type, freeze=freeze,
                                            path='../to_be_merged/train_1000.txt')
    train_sentence_vectors = multi_sentences_to_vectors(train_token_of_sentences, wordToIdx, wordVec)
    test_sentence_vectors = multi_sentences_to_vectors(test_token_of_sentences, wordToIdx, wordVec)
    dev_sentence_vectors = multi_sentences_to_vectors(dev_token_of_sentences, wordToIdx, wordVec)
    train_labels, dev_labels, test_labels = get_label_number_to_idx(train_labels, dev_labels, test_labels)
    return (train_sentence_vectors, train_labels, dev_sentence_vectors, dev_labels,
            test_sentence_vectors, test_labels)
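# `multi_sentences_to_vectors` is used above but defined elsewhere. A minimal
# sketch, assuming it simply builds one averaged bag-of-words vector per
# sentence via make_bow_vector (the body is an assumption):
def multi_sentences_to_vectors(token_of_sentences, wordToIdx, wordVec):
    return [make_bow_vector(tokens, wordToIdx, wordVec) for tokens in token_of_sentences]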
if __name__ == '__main__':
    labels, sentences = sentence_processing(conf.get('param', 'path_train'))
    sentences = lower_first_letter(sentences, conf.get('param', 'lowercase'))
    stoplist = read_stoplist()  # avoid rebinding the read_stoplist function name
    tokens, token_of_sentences = tokenization(sentences, stoplist)
    # idx
    wordVec, wordToIdx = randomly_initialised_vectors(tokens, threshold=0)
    # print(wordVec)
if __name__ == '__main__':
    # choose 'randomly' or 'pre_train' here
    (train_sentence_vectors, train_labels, dev_sentence_vectors, dev_labels,
     test_sentence_vectors, test_labels) = sentence_vector.bag_of_word_sentences(
        type='pre_train', freeze=True)
    output_size = len(set(train_labels))
    model = QuestionClassifier(output_size)
    for epoch in range(int(conf.get("param", "epoch"))):
        model.train_model(train_sentence_vectors, train_labels)
        # accuracy on the validation set
        acc = model.test_model(dev_sentence_vectors, dev_labels)
        print('epoch:', epoch, ' dev_acc: ', acc)
    # accuracy on the test set
    acc = model.test_model(test_sentence_vectors, test_labels)
    print('test_acc: ', acc)