Example #1
def main():
    print("rnn algorithm")
    train_data, labels = loadDataSet("./data/train.tsv")
    test_data, _ = loadDataSet('./data/test.tsv', 1)

    train_x, test_x, train_y, test_y = data_split(train_data, labels, 0.1, 42)
    # the longest review length across all the files
    # max_sent_len = 56

    # words that appear in the training samples only
    vocabListTrainData = createVocabList(train_data)
    # words that appear in the test samples
    vocabListTestData = createVocabList(test_data)
    # use every word from both vocabularies
    # a big mistake was originally made here: only a set union was used to build vocabList.
    # sets are hash-based and unordered, so each rerun of the code produced a different word
    # order and therefore a different embedding; sorting below makes the mapping deterministic.
    vocabList = vocabListTrainData | vocabListTestData
    vocabList = sorted(vocabList)

    use_cuda = torch.cuda.is_available()

    torch.manual_seed(64)

    device = torch.device("cuda" if use_cuda else "cpu")

    batch = 64
    epoch = 8
    embed_size = 100
    hidden_size = 50

    model = RNN(embed_size, hidden_size, vocabList, device).to(device)

    optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

    flag = 0
    if flag == 0:
        s = time.time()
        train(model, device, train_x, train_y, optimizer, epoch, batch, 0.2)
        e = time.time()
        print("train time is : ", (e-s)/60.)
    else:
        model.load_state_dict(torch.load('./data/rnn_params.pth'))

    test(model, device, train_x, train_y)
    test(model, device, test_x, test_y)

    kaggleTest(model, './data/kaggleData.csv')
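createVocabList is not shown in this example. A minimal sketch of what it could look like, assuming each entry in the data is already a list of tokens and the function returns a set (which is why the two results can be combined with | above):

def createVocabList(dataSet):
    # hypothetical sketch: collect every distinct token across all documents into one set
    vocabSet = set()
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return vocabSet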
Example #2
def kaggleTest(model, filePath):
    test_data, labels = loadDataSet('./data/test.tsv', 1)
    model.eval()
    output = model(test_data)
    predict = torch.argmax(output, dim=1)

    tid = [156061 + i for i in range(len(predict))]
    kaggle_data = list(zip(tid, predict.numpy().tolist()))

    print('the test data count is : ', len(predict))
    # print(kaggle_data)
    # newline='' keeps csv.writer from inserting blank lines between rows
    with open(filePath, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['PhraseId', 'Sentiment'])
        writer.writerows(kaggle_data)
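Note that predict.numpy() only works for tensors on the CPU; if the model was moved to a CUDA device, the predictions have to be brought back first (Example #3 below does this). A one-line adjustment, assuming the rest of the function stays the same:

    # move predictions back to the CPU before converting to NumPy
    predict = torch.argmax(output, dim=1).cpu()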
Example #3
def kaggleTest(model, filePath):
    test_data, labels = loadDataSet('./data/test.tsv', 1)

    batch_size = 200
    cnt = len(test_data)
    # newline='' keeps csv.writer from inserting blank lines between rows
    with open(filePath, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['PhraseId', 'Sentiment'])

        number = 156061
        for data in batch_iter_test(test_data, batch_size):
            model.eval()
            output = model(data)
            predict = torch.argmax(output, dim=1)

            tid = [number + i for i in range(len(predict))]
            kaggle_data = list(zip(tid, predict.cpu().numpy().tolist()))

            number += len(predict)
            writer.writerows(kaggle_data)

    print("the amount of data is : ", cnt)
Example #4
    tid = [156061 + i for i in range(len(predict))]
    kaggle_data = list(zip(tid, predict.tolist()))

    print('the test data count is : ', len(predict))
    # print(kaggle_data)
    # newline='' keeps csv.writer from inserting blank lines between rows
    with open(filePath, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['PhraseId', 'Sentiment'])
        writer.writerows(kaggle_data)


if __name__ == "__main__":
    print("bayes algrithm")
    train_data, labels = loadDataSet("./data/train.tsv")
    maxLen = 0
    for it in train_data:
        maxLen = max(maxLen, len(it))
    print('the max len is : ', maxLen)

    train_x, test_x, train_y, test_y = data_split(train_data, labels, 0.1, 42)
    vocabList = createVocabList(train_x)
    train_x_vec = []
    print('change train data to vector.')
    for i, it in tqdm(enumerate(train_x)):
        train_x_vec.append(bagOfWord2Vec(vocabList, it))
    pw, pc = train(np.array(train_x_vec), np.array(train_y))

    test_x_vec = []
    print('change test data to vector')
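bagOfWord2Vec is not shown either. A minimal sketch of a bag-of-words vectorizer consistent with how it is called above (vocabulary list plus one tokenized phrase in, count vector out); a word-to-index dict would be faster than list.index for a large vocabulary:

def bagOfWord2Vec(vocabList, inputSet):
    # hypothetical sketch: count how often each vocabulary word occurs in the phrase
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec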
Example #5
	"""
	parser = argparse.ArgumentParser(description='logisticRegression')
	parser.add_argument('--mode', type=str, default='SGD')
	parser.add_argument('--plot', action='store_true', default=False,
						help='whether only plot or not (default=False)')
	parser.add_argument('--pretrain', action='store_true', default=False,
						help='use pretrained weights or initial weights (default=False)')
	parser.add_argument('--save', action='store_true', default=False,
						help='save weights, weights_history and cost_history (default=False)')
	args = parser.parse_args()
	# file paths for the weights, the weight history (w_list) and the cost history (c_list)
	w_file = 'weights_LR_GD.txt' if args.mode == 'GD' else 'weights_LR_SGD.txt'
	wlist_file = 'w_list_LR_GD.txt' if args.mode == 'GD' else 'w_list_LR_SGD.txt'
	clist_file = 'c_list_LR_GD.txt' if args.mode == 'GD' else 'c_list_LR_SGD.txt'
	
	dataList, labelList = loadDataSet()
	if not args.plot:
		if args.pretrain:
			weights = grab(w_file)
			if args.mode =='GD':
				weights,w_list, cost_list = gradAscent(dataList, labelList, weights=weights, pre_train=True)
			elif args.mode=='SGD':
				weights, w_list, cost_list = stocGradAscent(dataList, labelList, weights=weights, pre_train=True)
		else:
			if args.mode=='GD':
				weights, w_list, cost_list = gradAscent(dataList, labelList)
			elif args.mode=='SGD':
				weights, w_list, cost_list = stocGradAscent(dataList, labelList)
			if args.save:
				store(weights, w_file)
				store(w_list, wlist_file)
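store and grab are used above to persist and reload the learned weights but are not shown. A minimal sketch under the assumption that they are thin pickle wrappers:

import pickle

def store(obj, fileName):
    # hypothetical sketch: serialize an object to disk
    with open(fileName, 'wb') as fw:
        pickle.dump(obj, fw)

def grab(fileName):
    # hypothetical sketch: load a previously stored object
    with open(fileName, 'rb') as fr:
        return pickle.load(fr)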
Example #6
import numpy as np
import pandas as pd
import utils

from scipy.io import loadmat

utils.clearConsole()

dataSet = utils.loadDataSet()

X = dataSet.values[:, :24]  # first 24 columns are the feature matrix

y = dataSet.values[:, np.newaxis, 24].squeeze()  # column 24 is the target vector

attributeNames = list(dataSet)  # attribute titles, used for plotting

K = 10  # number of folds

#print(y)
#Simple crossValidation with 10 folds
# utils.crossValidation(X, y, attributeNames, K)
#utils.lambdaOptimalRegulation(X,y,attributeNames)
#utils.neuralNetwork(X, y)
utils.ANNFull()
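utils.crossValidation is left commented out above. As a rough stand-in (an assumption about what that helper does, not its actual implementation), a 10-fold split over the same X and y can be produced with scikit-learn's KFold:

from sklearn.model_selection import KFold

kf = KFold(n_splits=K, shuffle=True, random_state=0)
for train_idx, test_idx in kf.split(X):
    # one train/test partition per fold; fit and evaluate a model here
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]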