def part2(choose):
    train, test = get_text_classification_datasets()
    print(type(train))
    dictionary = build_dict(train, 10)
    trainset_onehot, t = data_preprocess(train, dictionary)
    testset_onehot, t_ = data_preprocess(test, dictionary)
    w = np.zeros((len(dictionary) + 1, 4))  # one weight column per class (plus bias row)
    w_ = w
    j, j_ = 100, 99                         # previous / current loss
    count = 0
    c, p = [], []                           # iteration indices and loss history

    # choose: 1 = SGD (batch 1), 2 = mini-batch 10, 3 = mini-batch 100,
    #         4 = full batch, run until the loss change falls below 1e-4
    settings = {1: (5000, 1), 2: (2000, 10), 3: (1000, 100)}
    if choose in settings:
        max_iter, batch = settings[choose]
        while count < max_iter:
            count += 1
            c.append(count)
            j, w = j_, w_
            j_, w_ = onecycle(trainset_onehot, t, w, batch)
            p.append(j_)
            print("After", count, "iterations, the loss is:", j_)
    elif choose == 4:
        while np.abs(j - j_) > 1e-4:
            count += 1
            c.append(count)
            j, w = j_, w_
            j_, w_ = onecycle(trainset_onehot, t, w, BATCH_SIZE)
            p.append(j_)
            print("After", count, "iterations, the loss is:", j_)

    # print(w)
    print('The train accuracy is', precision(w, trainset_onehot, train.target))
    print('The test accuracy is', precision(w, testset_onehot, test.target))
    plt.plot(c, p)
    plt.show()
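# `onecycle` and `precision` are defined elsewhere in this project. Below is a
# minimal sketch of what `onecycle` might look like, assuming it runs one sweep
# of mini-batch gradient descent on the softmax cross-entropy loss and returns
# (loss, updated weights); ALPHA is a hypothetical step size, and the one-hot
# inputs are assumed to carry a trailing bias column.
import numpy as np

ALPHA = 0.1  # assumed learning rate


def onecycle_sketch(X, t, w, batch_size):
    n = X.shape[0]
    for start in range(0, n, batch_size):
        xb, tb = X[start:start + batch_size], t[start:start + batch_size]
        scores = xb @ w
        scores -= scores.max(axis=1, keepdims=True)   # numerical stability
        probs = np.exp(scores)
        probs /= probs.sum(axis=1, keepdims=True)     # softmax probabilities
        w = w - ALPHA * xb.T @ (probs - tb) / xb.shape[0]
    # report the full-dataset loss after the sweep
    scores = X @ w
    scores -= scores.max(axis=1, keepdims=True)
    probs = np.exp(scores)
    probs /= probs.sum(axis=1, keepdims=True)
    loss = -np.mean(np.sum(t * np.log(probs + 1e-12), axis=1))
    return loss, w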
def part2():
    data_text_train, data_text_test = get_text_classification_datasets()

    def load_cached():
        # load the preprocessed matrices cached as JSON
        cached = []
        for name in ('train_text', 'train_res', 'test_text', 'test_res'):
            with open(name + '.json', 'r', encoding='utf-8') as fp:
                cached.append(json.load(fp))
        return cached

    try:
        train_text, train_res, test_text, test_res = load_cached()
    except Exception:
        # cache missing: preprocess once, then load the files it writes
        preprocess_get_test(
            data_text_test['data'], data_text_test['target'],
            preprocess_get_dic_train(data_text_train['data'],
                                     data_text_train['target']))
        train_text, train_res, test_text, test_res = load_cached()

    train_text, train_res = np.mat(train_text), np.mat(train_res)
    test_text, test_res = np.mat(test_text), np.mat(test_res)

    # select the optimizer: full-batch (FBGD), stochastic (SGD) or mini-batch (BGD)
    w, b = logistic_algorithm_FBGD(train_text, train_res, 0.1, 2000, 0.001, 1e-5)
    # w, b = logistic_algorithm_SGD(train_text, train_res, 0.1, 5000, 0.001, 1e-5)
    # w, b = logistic_algorithm_BGD(train_text, train_res, 0.1, 300, 0.001, 100, 1e-5)

    acc_train = get_accuracy(train_text, data_text_train.target, w, b)
    acc_test = get_accuracy(test_text, data_text_test.target, w, b)
    print('The training accuracy is:', acc_train)
    print('The test accuracy is:', acc_test)
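# `logistic_algorithm_FBGD` is implemented elsewhere; a minimal full-batch
# sketch, assuming the positional arguments are (X, T, alpha, max_iter, lam,
# tol): learning rate alpha, an L2 penalty lam, and a stopping threshold tol
# on the change in loss.
import numpy as np


def logistic_fbgd_sketch(X, T, alpha, max_iter, lam, tol):
    X, T = np.asarray(X), np.asarray(T)
    n, d = X.shape
    w, b = np.zeros((d, T.shape[1])), np.zeros(T.shape[1])
    prev_loss = np.inf
    for _ in range(max_iter):
        scores = X @ w + b
        scores -= scores.max(axis=1, keepdims=True)
        probs = np.exp(scores)
        probs /= probs.sum(axis=1, keepdims=True)
        loss = (-np.mean(np.sum(T * np.log(probs + 1e-12), axis=1))
                + lam * np.sum(w * w))
        if abs(prev_loss - loss) < tol:
            break
        prev_loss = loss
        w -= alpha * (X.T @ (probs - T) / n + 2 * lam * w)
        b -= alpha * (probs - T).mean(axis=0)
    return w, b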
def logistic_regression():
    text_train, text_test = get_text_classification_datasets()
    labels_train = creat_labels(text_train.target)
    my_dict = creat_dict(text_train.data)
    vectors_train = get_vectors(text_train.data, my_dict)
    w = np.zeros([vectors_train.shape[1], 4])
    lam = 1                  # L2 regularization strength
    iterations = 1000
    learningRate = 0.005
    losses = []
    for i in range(iterations):
        loss, grad = getLoss(w, vectors_train, labels_train, lam)
        losses.append(loss)
        w = w - learningRate * grad
    plt.plot(losses)
    plt.show()
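# `getLoss` is defined elsewhere; a minimal sketch, assuming it returns the
# L2-regularized softmax cross-entropy loss and its gradient with respect to
# w, where `labels` is a one-hot matrix of shape (n_samples, 4).
import numpy as np


def getLoss_sketch(w, X, labels, lam):
    n = X.shape[0]
    scores = X @ w
    scores -= scores.max(axis=1, keepdims=True)   # numerical stability
    probs = np.exp(scores)
    probs /= probs.sum(axis=1, keepdims=True)     # softmax probabilities
    loss = (-np.sum(labels * np.log(probs + 1e-12)) / n
            + (lam / 2) * np.sum(w * w))
    grad = X.T @ (probs - labels) / n + lam * w
    return loss, grad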
def data_preprocess():
    train, test = get_text_classification_datasets()
    # build the vocabulary from the training documents
    train_item = []
    word_count = {}
    vocab = {}
    regular = re.compile(r'[\s]+')
    for item in train.data:
        # strip punctuation, lower-case, then split on whitespace
        words = regular.split(
            item.translate(str.maketrans('', '', string.punctuation)).lower())
        train_item.append(words)
        for word in words:
            word_count[word] = word_count.get(word, 0) + 1
    for word, count in word_count.items():
        if count >= 10:  # keep only words that occur at least 10 times
            vocab[word] = len(vocab)
    return train, test, vocab
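# The vectorization step is not shown above. A minimal multi-hot sketch using
# the returned vocabulary, assuming the same punctuation-stripping and
# lower-casing as data_preprocess:
import re
import string
import numpy as np


def docs_to_multihot_sketch(docs, vocab):
    X = np.zeros((len(docs), len(vocab)))
    splitter = re.compile(r'\s+')
    for i, doc in enumerate(docs):
        words = splitter.split(
            doc.translate(str.maketrans('', '', string.punctuation)).lower())
        for word in words:
            if word in vocab:
                X[i, vocab[word]] = 1   # mark presence of each in-vocabulary word
    return X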
def program_parser():
    parser = argparse.ArgumentParser(description='Assignment 2')
    parser.add_argument('--algorithm',
                        choices=["least_square", "perceptron", "logistic"],
                        help='the algorithm to run')
    parser.add_argument('--n',
                        choices=["run", "batch", "lambda", "alpha", "check"],
                        default="run",
                        help='the sub-task for the logistic algorithm')
    args = parser.parse_args()
    linear_dataset = get_linear_seperatable_2d_2c_dataset()
    lsm = LSM(linear_dataset)
    perceptron = Perceptron(linear_dataset)
    algos = {"least_square": lsm.run, "perceptron": perceptron.run}
    if args.algorithm == "logistic":
        np.random.seed(2333)
        dataset_train, dataset_test = get_text_classification_datasets()
        logistic = Logistic(dataset_train, dataset_test)
        if args.n == "run":
            logistic.show()
        elif args.n == "check":
            logistic.check_gradient()
        elif args.n == "batch":
            logistic.show_batch_diff()
        elif args.n == "lambda":
            logistic.show_lamb_diff()
        elif args.n == "alpha":
            logistic.show_alpha_diff()
    elif args.algorithm in algos:
        algos[args.algorithm]()
    else:
        parser.print_help()
def get_dataset():
    raw_train, raw_test = get_text_classification_datasets()

    def transfer(st):
        # str.replace returns a new string, so the result must be reassigned
        # (the original calls discarded it); map punctuation and whitespace
        # characters to spaces so split() still finds word boundaries
        st = st.lower()
        for i in string.whitespace:
            st = st.replace(i, ' ')
        for i in string.punctuation:
            st = st.replace(i, ' ')
        return st

    def preprocess():
        train_set = DataSet()
        for i in range(len(raw_train['data'])):
            di = transfer(raw_train['data'][i])
            train_set.append(
                Instance(sentence=di, target=int(raw_train['target'][i])))
        train_set.apply(lambda x: x['sentence'].lower(), new_field_name='sentence')
        train_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
        train_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

        test_set = DataSet()
        for i in range(len(raw_test['data'])):
            di = transfer(raw_test['data'][i])
            test_set.append(
                Instance(sentence=di, target=int(raw_test['target'][i])))
        test_set.apply(lambda x: x['sentence'].lower(), new_field_name='sentence')
        test_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
        test_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

        word_dict = Vocabulary(min_freq=2)
        train_set.apply(lambda x: [word_dict.add(word) for word in x['words']])
        test_set.apply(lambda x: [word_dict.add(word) for word in x['words']])
        word_dict.build_vocab()
        word_dict.index_dataset(train_set, field_name='words', new_field_name='words')
        word_dict.index_dataset(test_set, field_name='words', new_field_name='words')
        return train_set, test_set, word_dict

    train_set, test_set, word_dict = preprocess()
    train_set.rename_field('words', Const.INPUT)
    train_set.rename_field('seq_len', Const.INPUT_LEN)
    train_set.rename_field('target', Const.TARGET)
    test_set.rename_field('words', Const.INPUT)
    test_set.rename_field('seq_len', Const.INPUT_LEN)
    test_set.rename_field('target', Const.TARGET)
    train_set.set_input(Const.INPUT, Const.INPUT_LEN)
    train_set.set_target(Const.TARGET)
    test_set.set_input(Const.INPUT, Const.INPUT_LEN)
    test_set.set_target(Const.TARGET)
    return train_set, test_set, word_dict
# Part 1: least squares
lsm = LSM()                      # avoid shadowing the class name
lsm.train(train_set.X, 2 * train_set.y - 1)   # map {0, 1} labels to {-1, +1}
y_pred = lsm.predict(test_set.X)
print("LSM accuracy:", test_set.acc(y_pred))
d.plot(plt)
lsm.plot(plt)

# Part 2: perceptron
model = perceptron()             # avoid shadowing the class name
model.train(train_set.X, train_set.y)
y_pred = model.predict(test_set.X)
print("perceptron accuracy:", test_set.acc(y_pred))
plt.figure(0)
model.plot(plt)
plt.legend(loc="upper right")
plt.show()

# Part 3: text classification
text_train, text_test = get_text_classification_datasets()
train_vector, test_vector = preprocess(text_train, text_test)
N = train_vector.shape[0]
plt.figure("text_classification")
# compare batch sizes: SGD (1), mini-batch, and full batch (N)
for j, i in enumerate([1, batch_size, N]):
    sm = softmax()
    sm.train(train_vector, text_train.target, batch_size=i)
    print(sm.accuracy(test_vector, text_test.target))
    plt.subplot(3, 1, j + 1)
    sm.plot(plt)
plt.show()
ax = plt.subplot(324)
perceptron(data_sample.X, data_sample.y, 30, 0.004)
ax = plt.subplot(325)
perceptron(data_sample.X, data_sample.y, 30, 0.005)
ax = plt.subplot(326)
perceptron(data_sample.X, data_sample.y, 30, 0.006)
# print("accuracy:" + accurate(data_sample.X, data_sample.y, w))
# data_sample.plot(plt).show()

# In[5]:

# part2
from handout import get_text_classification_datasets

trainData, testData = get_text_classification_datasets()

# In[6]:

import string

trainDataset = trainData['data']


def getListX(dataset):
    # build a word dictionary over the lower-cased, punctuation-stripped documents
    dic = {}
    for data in dataset:   # was `trainDataset`, which ignored the parameter
        data = data.lower()
        for i in data:
            if i in string.punctuation:
                data = data.replace(i, " ")
        data = data.split()
max_epoch_num = args.max_epoch_num
# argparse delivers strings, so compare against the literal "True"
auto_terminate = (args.auto_terminate == "True")
observe_dif_times = args.observe_loss_sequence_length
terminate_threshold = args.terminate_threshold
np.random.seed(2019)

if __name__ == '__main__':
    dataset_train, dataset_test = handout.get_text_classification_datasets()
    categories = dataset_train.target_names
    # training data and labels
    training_data = dataset_train.data
    training_labels = np.array(dataset_train.target)
    clean_training_data = utils.clean_dataset(training_data)
    mapping_dict = utils.build_mapping_dict(clean_training_data)
    feature_vector = utils.data2vec(clean_training_data, mapping_dict)
    print(len(feature_vector[0]))
    # build model
    softmax_model = model.Softmax_CrossEntropy_model(
        class_num=len(categories),
        feature_length=feature_vector.shape[1],
        learning_rate=learning_rate,
def run_cnn():
    dataset_train_p2, dataset_test_p2 = get_text_classification_datasets()
    # dump the datasets to tab-separated files so CSVLoader can read them
    with open("formalized_train_data.csv", "w") as file:
        for i in range(len(dataset_train_p2.data)):
            file.write(document2line(dataset_train_p2.data[i]) + "\t" +
                       str(dataset_train_p2.target[i]) + '\n')
    with open("formalized_test_data.csv", "w") as file2:
        for i in range(len(dataset_test_p2.data)):
            file2.write(document2line(dataset_test_p2.data[i]) + "\t" +
                        str(dataset_test_p2.target[i]) + '\n')

    loader = CSVLoader(headers=('raw_sentence', 'label'), sep='\t')
    train_dataset = loader.load("./formalized_train_data.csv")
    test_dataset = loader.load("./formalized_test_data.csv")
    os.remove("./formalized_train_data.csv")
    os.remove("./formalized_test_data.csv")

    train_dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
    train_dataset.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)
    test_dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
    test_dataset.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)

    from fastNLP import Vocabulary
    # use the Vocabulary class to count words and map word sequences to index sequences
    vocab = Vocabulary(min_freq=2).from_dataset(train_dataset, field_name='words')
    vocab.index_dataset(train_dataset, field_name='words', new_field_name='words')
    vocab.index_dataset(test_dataset, field_name='words', new_field_name='words')

    # convert labels to integers and mark them as the target field
    train_dataset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)
    test_dataset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)

    from fastNLP.models import CNNText
    embed_dim = 2048  # 50
    model = CNNText((len(vocab), embed_dim), num_classes=4, padding=2, dropout=0.1)

    from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric
    # define the trainer and run training
    trainer = Trainer(model=model, train_data=train_dataset,
                      dev_data=test_dataset, loss=CrossEntropyLoss(),
                      metrics=AccuracyMetric())
    trainer.train()
def run_rnn():
    dataset_train_p2, dataset_test_p2 = get_text_classification_datasets()
    # dump the datasets to tab-separated files so CSVLoader can read them
    with open("formalized_train_data.csv", "w") as file:
        for i in range(len(dataset_train_p2.data)):
            file.write(document2line(dataset_train_p2.data[i]) + "\t" +
                       str(dataset_train_p2.target[i]) + '\n')
    with open("formalized_test_data.csv", "w") as file2:
        for i in range(len(dataset_test_p2.data)):
            file2.write(document2line(dataset_test_p2.data[i]) + "\t" +
                        str(dataset_test_p2.target[i]) + '\n')

    loader = CSVLoader(headers=('raw_sentence', 'label'), sep='\t')
    train_dataset = loader.load("./formalized_train_data.csv")
    test_dataset = loader.load("./formalized_test_data.csv")

    train_dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
    train_dataset.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)
    test_dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
    test_dataset.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)

    from fastNLP import Vocabulary
    # use the Vocabulary class to count words and map word sequences to index sequences
    vocab = Vocabulary(min_freq=2).from_dataset(train_dataset, field_name='words')
    vocab.index_dataset(train_dataset, field_name='words', new_field_name='words')
    vocab.index_dataset(test_dataset, field_name='words', new_field_name='words')

    # convert labels to integers and mark them as the target field
    train_dataset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)
    test_dataset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)

    embed_dim = 1024
    hidden_dim = 128
    layer = 4
    model = Rnn(len(vocab), embed_dim, hidden_dim, layer, 4)
    if torch.cuda.is_available():  # move the model to GPU when available
        model = model.cuda()
    trainer = Trainer(model=model, train_data=train_dataset,
                      dev_data=test_dataset, loss=CrossEntropyLoss(),
                      n_epochs=100, metrics=AccuracyMetric())
    trainer.train()
    # one-hot encode the train and test targets
    for t in ts:
        temp = [0] * K
        temp[t] = 1
        ys.append(temp)
    for t in tts:
        temp = [0] * K
        temp[t] = 1
        tys.append(temp)
    return ys, tys


if __name__ == '__main__':
    x = get_text_classification_datasets()
    # x[0] is the training split, x[1] the test split:
    # x[0].data[i] is the i-th document, x[0].target[i] its label
    xs, txs = preprocess_data(x[0].data, x[1].data)
    ts, tts = preprocess_target(len(x[0].target_names), x[0].target, x[1].target)
    print('preprocessing finished')
    lc = logistic_classifier(xs, ts)
    # Test
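# `logistic_classifier` is defined elsewhere; a minimal sketch, assuming it
# fits softmax regression on the multi-hot features xs and one-hot targets ts
# by full-batch gradient descent (LR and EPOCHS are hypothetical constants):
import numpy as np

LR, EPOCHS = 0.1, 1000


def logistic_classifier_sketch(xs, ts):
    X, T = np.array(xs), np.array(ts)
    w = np.zeros((X.shape[1], T.shape[1]))
    for _ in range(EPOCHS):
        scores = X @ w
        scores -= scores.max(axis=1, keepdims=True)
        probs = np.exp(scores)
        probs /= probs.sum(axis=1, keepdims=True)
        w -= LR * X.T @ (probs - T) / X.shape[0]
    return w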
draw(plt, X_t, ds_test.y, W, "Picture 4 Test Data")
draw_taining_loss(plt, epoch, lossHistory)
Perceptron(d, 100, 10, 0.01, plt)

## PART 2
import string
from handout import get_linear_seperatable_2d_2c_dataset, get_text_classification_datasets
import numpy as np
import matplotlib.pyplot as plt
import math

categories, dataset_train_p2, dataset_test_p2 = get_text_classification_datasets()


def document_embeding(dataset_train):
    # build the token list from the training data
    list_of_string = []
    tokenized_sentences = []
    translator = str.maketrans(string.punctuation,
                               ' ' * len(string.punctuation))
    for ele in dataset_train.data:
        # deduplicate tokens within one document; split() already breaks on
        # every whitespace character, so no separate whitespace replacement
        # is needed
        sentence = list(set(ele.lower().translate(translator).split()))
        tokenized_sentences.append(sentence)
        list_of_string += sentence
def task2():
    dataset_train, dataset_test = handout.get_text_classification_datasets()
    N = len(dataset_train.data)
    global categories_size, dimension, lam, rate
    lam = 0.1
    rate = 0.1
    categories_size = len(dataset_train.target_names)

    # build the vocabulary from words that appear at least 10 times
    count = {}
    dataset_train.data = deal_text(dataset_train.data)
    dataset_test.data = deal_text(dataset_test.data)
    for text in dataset_train.data:
        for word in text.split(' '):
            if word != "":
                count[word] = count.get(word, 0) + 1
    vocabulary = {}
    dimension = 0
    for word in count:
        if count[word] >= 10:
            vocabulary[word] = dimension
            dimension += 1

    X, Y = deal_dateset(dataset_train, vocabulary)
    tX, tY = deal_dateset(dataset_test, vocabulary)
    batch_size = int(input("batch size ="))
    batches = [[X[k:k + batch_size], Y[k:k + batch_size]]
               for k in range(0, N, batch_size)]
    W = np.zeros((dimension, categories_size))
    b = np.zeros((categories_size, 1))
    lam = 0.001
    rate = 0.2
    epoch = 0
    his_loss = 100000
    min_loss = 100000
    max_test_accuracy = 0
    min_loss_epoch = 0
    train_loss_array = []
    test_loss_array = []
    rate_array = []
    while True:
        random.shuffle(batches)
        epoch += 1
        print("epoch:", epoch)
        for batch in batches:
            W, b = update_batch(batch, W, b, 0)
        rate_array.append(rate)
        loss, accuracy = test(X, Y, W, b)
        print("train dataset loss=", loss, ", accuracy=", accuracy)
        test_loss, test_accuracy = test(tX, tY, W, b)
        print("test dataset loss=", test_loss, ", accuracy=", test_accuracy)
        # "bold driver" style schedule: grow the rate by 5% while the test
        # loss keeps improving, reset it otherwise
        if test_loss < his_loss:
            rate = rate * 1.05
        else:
            rate = 0.1
        his_loss = test_loss
        train_loss_array.append(loss)
        test_loss_array.append(test_loss)
        # early stopping: quit once the test loss has not improved for 20 epochs
        if test_loss < min_loss:
            min_loss = test_loss
            min_loss_epoch = epoch
        elif epoch - min_loss_epoch > 20:
            break
        max_test_accuracy = max(max_test_accuracy, test_accuracy)
    print("max test dataset accuracy =", max_test_accuracy)

    time = np.arange(epoch)
    plt.plot(time, train_loss_array)
    plt.plot(time, test_loss_array)
    plt.legend(["train loss", "test loss"])
    plt.show()
    plt.plot(time, rate_array)
    plt.legend(["learning rate"])
    plt.show()
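# `update_batch` and `test` are defined elsewhere; a minimal sketch of the
# update step, assuming W has shape (dimension, categories_size), b has shape
# (categories_size, 1), each batch holds (features, one-hot labels), and the
# module-level globals `rate` and `lam` are the step size and L2 penalty (the
# last argument is unused here, mirroring the call above):
import numpy as np


def update_batch_sketch(batch, W, b, _unused):
    X, Y = np.asarray(batch[0]), np.asarray(batch[1])
    scores = X @ W + b.T                          # broadcast the bias row
    scores -= scores.max(axis=1, keepdims=True)   # numerical stability
    probs = np.exp(scores)
    probs /= probs.sum(axis=1, keepdims=True)     # softmax probabilities
    grad_W = X.T @ (probs - Y) / X.shape[0] + lam * W
    grad_b = (probs - Y).mean(axis=0).reshape(-1, 1)
    return W - rate * grad_W, b - rate * grad_b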
    Train = trainer(model=model, data=data, kwargs=kwargs.copy())
    model, info = Train.train()
    # plot the training curves
    plot_loss_acc(kwargs.copy(), info)
    pred_test_y, _ = model.loss(test_x)
    test_acc = np.sum(
        np.argmax(pred_test_y, axis=1) == np.argmax(test_y, axis=1)) / len(test_x)
    print("test acc:")
    print(test_acc)


if __name__ == "__main__":
    # load data
    data_set, test_set = get_text_classification_datasets()
    # initialize the data processor
    data = {}
    dp = data_processor()
    size_voca = dp.generate_vocabulary(data_set.data)
    # split data into training and validation parts
    raw_data, num_classes = split_data(split_point=2000, data_set=data_set)
    # process data
    data["train_x"], data["train_y"] = dp.process_data(
        raw_data["train_x"], raw_data["train_y"], num_classes)
    data["val_x"], data["val_y"] = dp.process_data(
        raw_data["val_x"], raw_data["val_y"], num_classes)
    # choose learning rate