def main():
    print("rnn algorithm")
    train_data, labels = loadDataSet("./data/train.tsv")
    test_data, _ = loadDataSet('./data/test.tsv', 1)
    train_x, test_x, train_y, test_y = data_split(train_data, labels, 0.1, 42)
    # Longest review length across all files
    # max_sent_len = 56
    # Words that appear in the training samples
    vocabListTrainData = createVocabList(train_data)
    # Words that appear in the test samples
    vocabListTestData = createVocabList(test_data)
    # Use every word in the vocabulary.
    # A serious mistake was made here originally: vocabList was built with a
    # single set union. Sets are backed by hash tables and are unordered, so
    # every rerun of the code produced a different word-to-index mapping and
    # therefore a different embedding. Sorting the union makes it deterministic.
    vocabList = vocabListTrainData | vocabListTestData
    vocabList = sorted(vocabList)

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(64)
    device = torch.device("cuda" if use_cuda else "cpu")

    batch = 64
    epoch = 8
    embed_size = 100
    hidden_size = 50
    model = RNN(embed_size, hidden_size, vocabList, device).to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

    flag = 0
    if flag == 0:
        s = time.time()
        train(model, device, train_x, train_y, optimizer, epoch, batch, 0.2)
        e = time.time()
        print("train time is : ", (e - s) / 60.)
    else:
        model.load_state_dict(torch.load('./data/rnn_params.pth'))
    test(model, device, train_x, train_y)
    test(model, device, test_x, test_y)
    kaggleTest(model, './data/kaggleData.csv')
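# A minimal sketch of the ordering issue described above. createVocabList is
# assumed (it is not shown in this section) to return a set of tokens; the
# helper and sample data below are purely illustrative.
def build_word2index(vocab_set):
    # Set iteration order depends on the hash table, so the same set can yield
    # different index assignments across runs; sorting fixes the order, which
    # keeps the word-to-embedding-row mapping stable between runs.
    return {word: idx for idx, word in enumerate(sorted(vocab_set))}

vocab = {"movie", "great", "boring"}
print(build_word2index(vocab))  # always {'boring': 0, 'great': 1, 'movie': 2}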
def kaggleTest(model, filePath):
    test_data, labels = loadDataSet('./data/test.tsv', 1)
    model.eval()
    with torch.no_grad():  # inference only; no gradients needed
        output = model(test_data)
    predict = torch.argmax(output, dim=1)
    tid = [156061 + i for i in range(len(predict))]
    # .cpu() is required before .numpy() when the model runs on the GPU
    kaggle_data = list(zip(tid, predict.cpu().numpy().tolist()))
    print('the test data count is : ', len(predict))
    # print(kaggle_data)
    # newline='' prevents blank lines between rows on Windows
    with open(filePath, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['PhraseId', 'Sentiment'])
        writer.writerows(kaggle_data)
def kaggleTest(model, filePath):
    # Batched version: scoring the whole test set in one forward pass can
    # exhaust memory, so predict batch_size phrases at a time and stream the
    # rows to disk as they are produced.
    test_data, labels = loadDataSet('./data/test.tsv', 1)
    batch_size = 200
    cnt = len(test_data)
    # newline='' prevents blank lines between rows on Windows
    with open(filePath, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['PhraseId', 'Sentiment'])
        number = 156061  # first PhraseId in the Kaggle test split
        model.eval()
        with torch.no_grad():
            for data in batch_iter_test(test_data, batch_size):
                output = model(data)
                predict = torch.argmax(output, dim=1)
                tid = [number + i for i in range(len(predict))]
                kaggle_data = list(zip(tid, predict.cpu().numpy().tolist()))
                number += len(predict)
                writer.writerows(kaggle_data)
    print("the amount of data is : ", cnt)
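# batch_iter_test is called above but not defined in this section. A minimal
# sketch of what it presumably does — yield consecutive, order-preserving
# slices so PhraseIds stay aligned with predictions — might look like this:
def batch_iter_test(data, batch_size):
    for start in range(0, len(data), batch_size):
        yield data[start:start + batch_size]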
if __name__ == "__main__":
    print("bayes algorithm")
    train_data, labels = loadDataSet("./data/train.tsv")
    maxLen = 0
    for it in train_data:
        maxLen = max(maxLen, len(it))
    print('the max len is : ', maxLen)
    train_x, test_x, train_y, test_y = data_split(train_data, labels, 0.1, 42)
    vocabList = createVocabList(train_x)

    train_x_vec = []
    print('change train data to vector.')
    for i, it in tqdm(enumerate(train_x)):
        train_x_vec.append(bagOfWord2Vec(vocabList, it))
    pw, pc = train(np.array(train_x_vec), np.array(train_y))

    test_x_vec = []
    print('change test data to vector')
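# bagOfWord2Vec is not shown in this section. A typical bag-of-words
# implementation counts how often each vocabulary word occurs in a document;
# this sketch assumes vocabList is a sorted list of words and inputSet is a
# list of tokens, which matches how it is called above.
def bagOfWord2Vec(vocabList, inputSet):
    word2index = {w: i for i, w in enumerate(vocabList)}
    vec = [0] * len(vocabList)
    for word in inputSet:
        if word in word2index:
            vec[word2index[word]] += 1  # count occurrences (bag, not set, of words)
    return vec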
""" parser = argparse.ArgumentParser(description='logsticRegression') parser.add_argument('--mode', type=str, default='SGD') parser.add_argument('--plot', action='store_true', default=False, help='whether only plot or not (default=False)') parser.add_argument('--pretrain', action='store_true', default=False, help='use pretrained weights or initial weights (default=False)') parser.add_argument('--save', action='store_true', default=False, help='save weights, weights_history and cost_history (default=False)') args = parser.parse_args() #files' path of weights, wlist and clist. w_file = 'weights_LR_GD.txt' if args.mode == 'GD' else 'weights_LR_SGD.txt' wlist_file = 'w_list_LR_GD.txt' if args.mode == 'GD' else 'w_list_LR_SGD.txt' clist_file = 'c_list_LR_GD.txt' if args.mode == 'GD' else 'c_list_LR_SGD.txt' dataList, labelList = loadDataSet() if not args.plot: if args.pretrain: weights = grab(w_file) if args.mode =='GD': weights,w_list, cost_list = gradAscent(dataList, labelList, weights=weights, pre_train=True) elif args.mode=='SGD': weights, w_list, cost_list = stocGradAscent(dataList, labelList, weights=weights, pre_train=True) else: if args.mode=='GD': weights, w_list, cost_list = gradAscent(dataList, labelList) elif args.mode=='SGD': weights, w_list, cost_list = stocGradAscent(dataList, labelList) if args.save: store(weights, w_file) store(w_list, wlist_file)
import numpy as np
import pandas as pd
import utils as utils
from scipy.io import loadmat

utils.clearConsole()
dataSet = utils.loadDataSet()

X = dataSet.values[:, :24]                        # feature columns
y = dataSet.values[:, np.newaxis, 24].squeeze()   # target column
attributeNames = list(dataSet)  # attribute titles, used for plotting
K = 10  # number of folds
# print(y)

# Simple cross-validation with 10 folds
# utils.crossValidation(X, y, attributeNames, K)
# utils.lambdaOptimalRegulation(X, y, attributeNames)
# utils.neuralNetwork(X, y)
utils.ANNFull()
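# utils.crossValidation is not shown here. A minimal sketch of 10-fold
# cross-validation over the same X and y, written against scikit-learn with a
# plain linear model — an assumption; the actual utils module may implement
# the folds and the model differently.
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression

def cross_validation(X, y, K=10):
    kf = KFold(n_splits=K, shuffle=True, random_state=0)
    errors = []
    for train_idx, test_idx in kf.split(X):
        model = LinearRegression().fit(X[train_idx], y[train_idx])
        pred = model.predict(X[test_idx])
        errors.append(np.mean((pred - y[test_idx]) ** 2))  # per-fold MSE
    return np.mean(errors)  # average generalization error estimate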