Example #1
def test() : 
	import LoadData
	trainMat, classLabelVector = LoadData.loadTrainDataFromCSV(TRAIN_FILE)
	testMat = LoadData.loadTestDataFromCSV(TEST_FILE)
	rfbenchmarkVector  = LoadData.loadRFBenchmarkFromCSV(RF_BENCHMARK_FILE)

	columnLabels = []
	for i in range(1, 785) : 
		columnLabels.append(i)
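	# sample sqrt(#features) candidate features at each split, a common random-forest heuristic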
	m = int(len(columnLabels) ** 0.5)
	# rf = createRandomForest(4, trainMat[50:], classLabelVector[50:], columnLabels, m)
	rf = createRandomForest(10, trainMat, classLabelVector, columnLabels, m)
	# testMat = trainMat[0:50]

	i = 0
	n = 0
	for testData in testMat : 
		classList = []
		for tree in rf : 
			label = classify(tree, columnLabels, testData)
			classList.append(label)
		voteLabel = majorityCnt(classList)
		if voteLabel == rfbenchmarkVector[i] : 
			n += 1
		# print "the real answer is ", classLabelVector[i], "the label is ", voteLabel
		i += 1
	print n
	accuracy = n / float(len(rfbenchmarkVector))
	print accuracy
def BPNet(file_name):
    rengong_filename = r'C:\Users\Administrator\Desktop\yanshan_rengong1.tif'
    P = []
    T = []
    butoushui_P = LoadData(1, file_name, rengong_filename)
    butoushui_P = RD.sample(butoushui_P, 2000)
    butoushui_P = sio.loadmat('../JadeLibSVM/' +
                              'butoushui_P.mat')['butoushui_P']
    M = len(butoushui_P)
    P = butoushui_P.tolist()
    T = [1] * M
    print M

    toushui_P = LoadData(0, file_name, rengong_filename)
    toushui_P = RD.sample(toushui_P, 2000)
    toushui_P = sio.loadmat('../JadeLibSVM/' + 'toushui_P.mat')['toushui_P']
    M = len(toushui_P)
    P.extend(toushui_P)
    toushui_P = [0] * M
    T.extend(toushui_P)
    print M

    nn = NeuralNetwork([3, 2, 1], 'tanh')
    nn.fit(P, T, 0.01, 5000)
    print('************** training finished ****************')
    p_test = extract_Yanshan('')
    predict_label = []
    for i in p_test:
        predict_label.append(nn.predict(i)[0])
    pic = array(Image.open(file_name))
    X = pic.shape[0]
    Y = pic.shape[1]
    P = pic.shape[2]
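    # flatten the image so each row of Test_data holds one pixel's R, G, B values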
    Test_data = np.zeros((X * Y, 3), dtype='double')
    k = 0
    for i in range(X):
        for j in range(Y):
            Test_data[k, 0] = pic[i, j, 0]
            Test_data[k, 1] = pic[i, j, 1]
            Test_data[k, 2] = pic[i, j, 2]
            k = k + 1
    result = np.zeros((X, Y, 3))  # RGB color image
    for k in range(X * Y):  # R, G and B components
        if (predict_label[k] >= 0.5):
            Test_data[k, 0] = 1
            Test_data[k, 1] = 1
            Test_data[k, 2] = 1  # white
        elif (predict_label[k] < 0.5):
            Test_data[k, 0] = 0
            Test_data[k, 1] = 0
            Test_data[k, 2] = 0  # black
    k = 0
    for i in range(X):
        for j in range(Y):
            result[i, j, 0] = Test_data[k, 0]
            result[i, j, 1] = Test_data[k, 1]
            result[i, j, 2] = Test_data[k, 2]
            k = k + 1
    return result
Example #3
def for_altair(stat_data):
    """

    :param stat_data:
    :param time_data:
    :return:
    """
    def country_search(name):
        country = pycountry.countries.get(name=name)
        if country is not None:
            return country.numeric
        else:
            country = pycountry.countries.search_fuzzy(name)
            return country[0].numeric

    print('country loop')
    stat_data['CountryCode'] = np.vectorize(country_search)(stat_data['country'])

    # print('melt')
    # data = pd.melt(stat_data, id_vars=['country',
    #                                    'date',
    #                                    'tournament',
    #                                    'total_games',
    #                                    'percent_wins',
    #                                    'CountryCode',
    #                                    'home_wins',
    #                                    'away_wins'],
    #                value_vars=['percent_home_wins',
    #                            'percent_away_wins'],
    #                var_name='statistics',
    #                value_name='values')

    DataLoader.save_to_sql(stat_data, "final_data")

    return stat_data
Example #4
def process_all_genre():
    tf_list = []
    tf_idf_list = []
    genres = __get_all_genre_list__()
    for genre in genres:
        tf_value = __get_tf_for_genre__(genre)
        for entry in tf_value:
            entry_dict = {'genre': genre}
            for key, value in entry.iteritems():
                entry_dict[key] = value
            tf_list.append(entry_dict)
        tf_idf_value = __get_tfidf_for_genre__(genre)
        for entry in tf_idf_value:
            entry_dict = {'genre': genre}
            for key, value in entry.iteritems():
                entry_dict[key] = value
            tf_idf_list.append(entry_dict)

    tf_data_frame = Data.pd.DataFrame(tf_list)
    tf_idf_data_frame = Data.pd.DataFrame(tf_idf_list)
    tf_data_frame['tfidfweight'] = tf_data_frame.apply(
        lambda new_row: tf_idf_data_frame[tf_idf_data_frame['tag'] == new_row[
            'tag']].iloc[0].tfidfweight,
        axis=1)
    Data.save_df(tf_data_frame, 'Genre-Model.csv')
Example #5
def process_user_model():
    tf_list = []
    tf_idf_list = []
    count = 0
    for user_id in Data.ml_ratings['userid'].unique():
        if count < 1000:
            tf_data = __get_tf_info__(user_id)
            for entry in tf_data:
                entry_dict = {}
                for key, value in entry.iteritems():
                    entry_dict[key] = value
                tf_list.append(entry_dict)

            tf_idf_data = __get_tfidf_info__(user_id)
            for entry in tf_idf_data:
                entry_dict = {}
                for key, value in entry.iteritems():
                    entry_dict[key] = value
                tf_idf_list.append(entry_dict)
        count += 1

    tf_data_frame = Data.pd.DataFrame(tf_list)
    tf_idf_data_frame = Data.pd.DataFrame(tf_idf_list)
    tf_data_frame['tfidfweight'] = tf_data_frame.apply(
        lambda new_row: tf_idf_data_frame[tf_idf_data_frame['tag'] == new_row[
            'tag']].iloc[0].tfidfweight,
        axis=1)

    Data.save_df(tf_data_frame, 'User-Model.csv')
Example #6
def Train_Model():
    model = NNModel(0.001)
    # 28 * 28 == 784
    model.add_layer(28 * 28, 10, Func.relu)
    model.add_layer(10, 10, Func.relu)
    model.add_layer(10, 10, Func.relu)
    model.add_layer(10, 2, Func.identiti)
    model.add_loss_function(Func.cross_entropy_loss)

    epoch = 1000
    examples = 1000

    batch = 10
    x, y = LoadData.load_next_batch(examples, 0)
    x_test, y_test = LoadData.load_next_batch(examples, examples)
    for i in range(epoch):
        train_loss = 0
        for j in range(0, examples - batch, batch):
            # print(x.shape)
            train_loss += model.train(x[j:j + batch], y[j:j + batch])
        print('Train loss is at:', train_loss)
        loss = 0
        correct_guesses = 0
        for j in range(0, examples, 1):
            x_, y_ = x_test[j], y_test[j]
            output = model.forward_pass(x_)
            output = Func.sigm.f(output)
            # print(output)
            # print(y_)
            # print()
            loss += -np.sum(y_ * np.log(output))
            if np.argmax(y_) == np.argmax(output):
                correct_guesses += 1
        print('Loss after', i + 1, ':', loss)
        print('Correct guesses after ', i + 1, ':', correct_guesses / examples)
Example #7
def main():
    global domain
    global domain_distance
    global distance_file

    if not len(sys.argv) == 4:
        print(
            "python testBetaIC.py <k> <beta> <domain number> \n Domain num: \n 0 : accident, 1: sanitation, 2: crime, 3: adult"
        )
        return
    k = int(sys.argv[1])  #50
    beta = float(sys.argv[2])
    domain_num = int(sys.argv[3])

    domain = domain_arr[domain_num]
    domain_distance = distance_arr[domain_num]
    print(domain + " " + domain_distance)
    Ld = LoadData(domain)
    G = Ld.readFile()
    distance_file = ""
    if (os.path.isfile(domain + "_distance.txt")):
        distance_file = domain + "_distance.txt"

    print("Dataset:", domain, "K = ", k, "Distance:", domain_distance, "beta=",
          beta)
    aff_array = test_Kcenter(G, k, domain, domain_distance)
    print("\n")
    print(
        "#######################################################################\n"
    )
    bs = betaStrong(domain, G, aff_array, k, beta, domain_distance,
                    distance_file)
    aff_array = bs.beta_IC()
    calculate_composition(G, k, aff_array, domain)
    del Ld
Example #8
    def ModelInit(self, filename):
        Docs = LoadData.LoadDataFromFile(os.getcwd() + "/" + filename)
        self.D = len(Docs)
        print "Load ", self.D, " docs from the file"
        StopWordList = LoadData.LoadStopWords()
        WordListSet = [
            Preprocess.PreprocessText(doc, StopWordList) for doc in Docs
            if type(doc) != unicode
        ]
        self.Dictionary = Preprocess.ConstructDictionary(WordListSet)
        self.W = len(self.Dictionary)
        print "Total number of words is: ", self.W
        print "Begin to save the dictionary..."
        self.SaveDictionary()
        print "Done!!"
        print "Begin to map the word to ID"
        self.IDListSet = []
        inv_dict = {v: k for k, v in self.Dictionary.iteritems()}
        for wdl in WordListSet:
            IdList = Preprocess.Word2Id(wdl, inv_dict)
            self.IDListSet.append(IdList)
        print "Done!!"
        self.ndsum = ListUtil.Initial(self.D)
        self.theta = ListUtil.InitialMat(self.D, self.K, 0.0)
        self.phi = ListUtil.InitialMat(self.K, self.W, 0.0)
        self.nd = ListUtil.InitialMat(self.D, self.K, 0)
        self.nw = ListUtil.InitialMat(self.W, self.K, 0)
        self.Z = []
        print "Begin to initialize the LDA model..."
        self.RandomAssignTopic()
        print "Topic assignment done!!"
def testDigits(kTup=('rbf', 10)):
    dataArr, labelArr = ld.loadImages('trainingDigits')
    b, alphas = fksmo.smoPK(dataArr, labelArr, 200, 0.0001, 10000, kTup)
    datMat = mat(dataArr)
    labelMat = mat(labelArr).transpose()
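    # support vectors are the training points with non-zero alphas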
    svInd = nonzero(alphas.A > 0)[0]
    sVs = datMat[svInd]
    labelSV = labelMat[svInd]
    print "there are %d Support Vectors" % shape(sVs)[0]
    m, n = shape(datMat)
    errorCount = 0
    for i in range(m):
        kernelEval = fksmo.kernelTrans(sVs, datMat[i, :], kTup)
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]): errorCount += 1
    print "the training error rate is: %f" % (float(errorCount) / m)
    dataArr, labelArr = ld.loadImages('testDigits')
    errorCount = 0
    datMat = mat(dataArr)
    labelMat = mat(labelArr).transpose()
    m, n = shape(datMat)
    for i in range(m):
        kernelEval = fksmo.kernelTrans(sVs, datMat[i, :], kTup)
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]): errorCount += 1
    print "the test error rate is: %f" % (float(errorCount) / m)
Example #10
def testRbf(k1=1.3):
    dataArr, labelArr = ld.loadDataSet('testSetRBF.txt')
    b, alphas = fksmo.smoPK(dataArr, labelArr, 200, 0.0001, 10000,
                            ('rbf', k1))  #C=200 important
    datMat = mat(dataArr)
    labelMat = mat(labelArr).transpose()
    svInd = nonzero(alphas.A > 0)[0]
    sVs = datMat[svInd]  #get matrix of only support vectors
    labelSV = labelMat[svInd]
    print "there are %d Support Vectors" % shape(sVs)[0]
    m, n = shape(datMat)
    errorCount = 0
    for i in range(m):
        kernelEval = fksmo.kernelTrans(sVs, datMat[i, :], ('rbf', k1))
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]): errorCount += 1
    print "the training error rate is: %f" % (float(errorCount) / m)
    dataArr, labelArr = ld.loadDataSet('testSetRBF2.txt')
    errorCount = 0
    datMat = mat(dataArr)
    labelMat = mat(labelArr).transpose()
    m, n = shape(datMat)
    for i in range(m):
        kernelEval = fksmo.kernelTrans(sVs, datMat[i, :], ('rbf', k1))
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]): errorCount += 1
    print "the test error rate is: %f" % (float(errorCount) / m)
Example #11
    def getLogisticRegression(self):
        """Fit the model to the training data."""

        ld = LoadData(self.argumentsDict)
        X, y = ld.loadTrainingDataSet()
        tm = TrainModels(X, y)
        model = tm.getModelLogistic()
        return model, ld
Example #12
    def getDummy(self):
        """Fit the model to the training data."""

        ld = LoadData(self.argumentsDict)
        X, y = ld.loadTrainingDataSet()
        tm = TrainModels(X, y)
        model = tm.getDummy()
        return model, ld
    def test(self):
        load_test = LoadData()
        self.folder_test = load_test.data_test
        self.train_generator, self.x_train, self.x_valid, self.y_train, self.y_valid = load_test.loadDataTrain(
        )
        self.test_generator, self.x_test = load_test.loadDataTest(
            self.folder_test)

        model = load_model('./model.h5')

        self.test_generator.reset()
        pred = model.predict_generator(self.test_generator,
                                       verbose=1,
                                       steps=600 / 1)

        predicted_class_indices = np.argmax(pred, axis=1)

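        # invert class_indices so numeric predictions map back to class names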
        labels = (self.train_generator.class_indices)
        labels = dict((v, k) for k, v in labels.items())
        prediksi = [labels[k] for k in predicted_class_indices]
        path = self.test_generator.filenames

        filenames = []
        for x in range(len(path)):
            filenames.append(path[x][12:len(path[x]) - 8])

        true_pred = 0
        compare = []
        for x in range(len(filenames)):
            if filenames[x] == prediksi[x]:
                # true_pred = true_pred + 1
                compare.append("False")
            else:
                true_pred = true_pred + 1
                compare.append("True")

        row = len(self.test_generator)

        list_prediksi = []
        for i in range(row):
            list_prediksi.append([filenames[i], prediksi[i], compare[i]])

        # print result to console
        # s = ""
        # for i in range(row):
        #     print(i, list_prediksi[i])
        s = ''.join(prediksi[0:row])
        # print(s)

        self.progressBar.setValue(100)

        self.txtLR.setText(s)

        persentase = (true_pred / len(filenames)) * 100
        # print(persentase)

        self.lblHasil.setText("Tingkat Akurasi : %.2f%%" % (persentase))
Example #14
def num_correct(net, test_set):
    with torch.no_grad():
        for i, data in enumerate(test_set, 0):
            # Data to device (gpu)
            images, points = data['image'].to(device), data['points'].to(device)
            output = net(images)
            LoadData.show_batch({'image': images.to('cpu'), 'points': points.to('cpu')}, output.to('cpu'))
            if i == 5:
                break
Example #15
def mainCurves():
    LoadData.warning()

    first_X, first_Y = LoadData.contraceptiveData()
    first_graph_data = analyzePerNeighbor(first_X, first_Y)

    second_X, second_Y = LoadData.wineData()
    second_graph_data = analyzePerNeighbor(second_X, second_Y)

    graphDataCurves(first_graph_data, second_graph_data)
Example #16
def getDataSet(pDict, resdir):
    """Load the data set"""

    ld = LoadData(pDict)
    X_train, y_train = ld.loadTrainingDataSet()
    X_test, dbkeys = ld.loadTestDataSet()
    dictML = {
        "X_test": X_test,
        "X_train": X_train,
        "y_train": y_train,
        "resultsDirectory": resdir
    }
    return dictML, dbkeys
Example #17
def main():
    LoadData.warning()

    # Building Phase
    first_X, first_Y = LoadData.contraceptiveData()
    clf_first, first_training_score, first_training_data, first_testing_data, first_graph_data = analyze(
        first_X, first_Y, 40)
    print("kNN Training Score (first) After Cross Validation: {0:.2f}%".format(
        first_training_score * 100))
    LoadData.calc_accuracy(first_training_data[1],
                           clf_first.predict(first_training_data[0]),
                           first_testing_data[1],
                           clf_first.predict(first_testing_data[0]))

    second_X, second_Y = LoadData.wineData()
    clf_second, second_training_score, second_training_data, second_testing_data, second_graph_data = analyze(
        second_X, second_Y)
    print(
        "kNN Training Score (second) After GridSearch Cross Validation: {0:.2f}%"
        .format(second_training_score * 100))
    LoadData.calc_accuracy(second_training_data[1],
                           clf_second.predict(second_training_data[0]),
                           second_testing_data[1],
                           clf_second.predict(second_testing_data[0]))

    graphData(first_graph_data, second_graph_data)
Example #18
def detect_burr(data,
                pv,
                left=None,
                right=None,
                method=0,
                minimum_peak_distance=100):
    titles = data.columns
    titleList = titles.values.tolist()
    if pv in titleList:
        pvn = titleList.index(pv)
        sta = DisplayData.showStatistic(data)
        print("statistic data:")
        print(sta)
        # use the boxplot rule (1.5 * IQR) to define the outlier thresholds
        iqr = sta.loc['75%'][titles[pvn]] - sta.loc['25%'][titles[pvn]]
        if left is None:
            left = sta.loc['25%'][titles[pvn]] - 1.5 * iqr
        if right is None:
            right = sta.loc['75%'][titles[pvn]] + 1.5 * iqr
        print('min edge:', left, 'max edge:', right)
        burrdata = data[((data[titles[pvn]]) < left) |
                        ((data[titles[pvn]]) > right)]
        LoadData.df2other(burrdata, 'csv', 'newfile.csv')
        y = data[titles[pvn]].values
        if method == 0:
            # find_peaks by scipy signal
            peaks, _ = signal.find_peaks(y, height=right)
            plt.plot(y, 'b', lw=1)
            plt.plot(peaks, y[peaks], "+", mec='r', mew=2, ms=8)
            plt.plot(np.zeros_like(y) + right, "--", color="gray")
            plt.title("find_peaks min_height:%7f" % right)
            plt.show()
        if method == 1:
            detect_peaks(y, mph=right, mpd=minimum_peak_distance, show=True)
        if method == 2:
            print('Detect peaks with minimum height and distance filters.')
            # thres=right/max(y)
            indexes = peakutils.peak.indexes(np.array(y),
                                             thres=right / max(y),
                                             min_dist=minimum_peak_distance)
            print('Peaks are: %s' % (indexes))
            plt.plot(y, 'b', lw=1)
            for i in indexes:
                plt.plot(i, y[i], "+", mec='r', mew=2, ms=8)
            plt.plot(np.zeros_like(y) + right, "--", color="gray")
            plt.title("peakutils.peak thres:%f ,minimum_peak_distance:%d" %
                      (right, minimum_peak_distance))
            plt.show()
    else:
        print("Wrong PV name, not in ", titleList)
Example #19
def main():
    #os.chdir('../') # Set working directory

    print("\nStarting program.\n")

    print("Loading data...\n")
    accidents_data = ld.AccidentsData()
    vehicles_data = ld.VehiclesData()
    merged_data = ld.MergedData(accidents_data, vehicles_data)
    X_test = merged_data.get_merged_test()
    y_test = merged_data.get_target_test()
    X_train = merged_data.get_merged_train()
    y_train = merged_data.get_target_train()

    print("Available Models:\n")
    print("1. K-nearest Neighbors")
    print("2. Stochastic Gradient Descent Classifier")
    print("3. Decision Tree Classifier")
    print("4. Random Forest Classifier")
    print("5. C-Support Vector Classification")
    print("6. Logistic Regression")
    print("7. Multi-Layer Perceptron Classifier")
    print("\n")

    mode = input("Choose Training Model: ")

    print('\nTraining model...\n')
    training = tr.Training(X_train, y_train)

    if mode == "1":
        training.knnTraining()
    elif mode == "2":
        training.sgdClassifierTraining()
    elif mode == "3":
        training.decisionTreeTraining()
    elif mode == "4":
        training.supportVectorMachinesTraining()
    elif mode == "5":
        training.supportVectorMachinesTraining()
    elif mode == "6":
        training.logisticRegressionTraining()
    elif mode == "7":
        training.mlpTraining()
    else:
        print("Bye!")
        quit()

    print('Calculating prediction...')
    y_pred = training.model.predict(X_test.drop('accident_id', axis=1))
    print('F1 score = ', f1_score(y_test,y_pred))
def main():
    title = "SVM Learning Curves (Contraceptive)"
    contracept_X, contracept_Y = LoadData.contraceptiveData()
    contraceptX_train, contraceptX_test, contraceptY_train, contraceptY_test = train_test_split(
        contracept_X, contracept_Y, test_size=0.30, random_state=100)
    cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=42)
    # change the kernel here
    estimator = SVC(gamma=.001, C=1000.0, kernel='poly')
    plt, contracept_elapsed_time = plot_learning_curve(estimator,
                                                       title,
                                                       contraceptX_train,
                                                       contraceptY_train,
                                                       (0.1, 0.5),
                                                       cv=cv,
                                                       n_jobs=4)
    print("It took SVM (Contraceptive) {0}s to train".format(
        contracept_elapsed_time))
    estimator.fit(contraceptX_train, contraceptY_train)
    print(estimator.score(contraceptX_train, contraceptY_train))
    t0 = time()
    y_pred = estimator.predict(contraceptX_test)
    print("SVM (Contraceptive) Took {0}s to test".format(time() - t0))
    print("SVM Accuracy Score (Contraceptive) was {0}%".format(
        accuracy_score(contraceptY_test, y_pred) * 100))
    plt.show()

    title = "SVM Learning Curves (Wine)"
    wine_X, wine_Y = LoadData.wineData()
    wineX_train, wineX_test, wineY_train, wineY_test = train_test_split(
        wine_X, wine_Y, test_size=0.30, random_state=100)
    cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=42)
    # change the kernel here
    estimator = SVC(gamma=.001, C=1000.0, kernel='rbf')
    plt, wine_elapsed_time = plot_learning_curve(estimator,
                                                 title,
                                                 wineX_train,
                                                 wineY_train, (0.1, 1.01),
                                                 cv=cv,
                                                 n_jobs=4)
    print("It took SVM (Wine) {0}s to train".format(wine_elapsed_time))
    estimator.fit(wineX_train, wineY_train)
    print(estimator.score(wineX_train, wineY_train))
    t0 = time()
    y_pred = estimator.predict(wineX_test)
    print("It took SVM (Wine) {0}s to test".format((time() - t0)))
    print("SVM Accuracy Score (Wine) was {0}%".format(
        accuracy_score(wineY_test, y_pred) * 100))
    plt.show()
    def __init__(self,
                 batch_size,
                 training_dataset_folder_name,
                 total_epochs,
                 epochs_with_same_data=5,
                 folders_at_the_same_time=20,
                 to_avoid=[],
                 enable_telegram_bot=True,
                 chat_id="undefined"):
        self.x, self.y, _ = LoadData.GetData(
            training_dataset_folder_name,
            limit_value=folders_at_the_same_time,
            to_avoid=to_avoid)
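        # one-hot encode the two classes and scale inputs to [0, 1] by the maximum value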
        self.y = np_utils.to_categorical(self.y, 2)
        self.x = self.x.astype('float32')
        self.x /= np.max(self.x)

        self.x_next_epoch, self.y_next_epoch = self.x, self.y
        self.epoch = 0
        self.batch_size = batch_size
        self.epochs_with_same_data = epochs_with_same_data
        self.training_dataset_folder_name = training_dataset_folder_name
        self.folders_at_the_same_time = folders_at_the_same_time
        self.to_avoid = to_avoid
        self.steps_per_epoch = 0
        self.t = None
        self.total_epochs = total_epochs
        self.enable_telegram_bot = enable_telegram_bot
        self.chat_id = chat_id
Example #22
def master(k):
    DataFile = './InputData/tmp.txt'
    x, y, m = LoadData.load("./InputData/origin.txt")
    
    LoadData.store(x, y, DataFile)
    _x, _y = k_means.getResult(DataFile, k = k, Flag = False)
    
    #Debug.show(_x[1], _y[1])
    Ans = []
    for master in _x:
        e = connector.transport(_x[master], _y[master], m)
        #print _x[master]
        #print _y[master]
        #print 'e =', e
        Ans.append([e, _x[master], _y[master]])
    return Ans
Example #23
def AgeHist():

    df = LoadData.readDataSet()
    df_age = df['Age']
    df_age_normal = df_age[df['label'] == 0]

    df_age_normal.hist(
        bins=40,
        grid=False).get_figure().savefig('D://tmsc_data/Age_distribution.png')

    Dict = {}
    for age in df_age_normal:
        if Dict.has_key(age):
            Dict[age] += 1
        else:
            Dict[age] = 1

    Dict = sorted(Dict.items(), key=lambda d: d[0], reverse=False)

    keylist = []
    vallist = []
    for key, val in Dict:
        keylist.append(key)
        vallist.append(val)
    print Dict
Example #24
def AgeSection():

    df = LoadData.readDataSet()

    df_features = df.drop(['label', 'id'], axis=1)
    df_features = df_features[df['label'] == 0]

    df_age56to59 = df_features[df['Age'] <= 59]  # 10 rows
    df_age56to59 = df_age56to59.drop('Age', axis=1)

    df_age60to70 = df_features[df['Age'] <= 70]  # 435 rows
    df_age60to70 = df_age60to70[df['Age'] >= 60]
    df_age60to70 = df_age60to70.drop('Age', axis=1)

    df_age71to82 = df_features[df['Age'] <= 82]  # 2092 rows
    df_age71to82 = df_age71to82[df['Age'] >= 71]
    df_age71to82 = df_age71to82.drop('Age', axis=1)

    df_age83to90 = df_features[df['Age'] >= 83]  # 515 rows
    df_age83to90 = df_age83to90[df['Age'] <= 90]
    df_age83to90 = df_age83to90.drop('Age', axis=1)

    df_age91to96 = df_features[df['Age'] >= 91]  # 24 rows
    df_age91to96 = df_age91to96.drop('Age', axis=1)

    return df_age56to59, df_age60to70, df_age71to82, df_age83to90, df_age91to96
def build_model(phi, restore=False):
    # pre-process
    Xinput, Xoutput, Phi, PhiT, Yinput = LD.pre_calculate(phi)

    # build model
    prediction, predictionSymmetric, transField = build_fista(Xinput,
                                                              Phi,
                                                              PhiT,
                                                              Yinput,
                                                              reuse=False)

    # loss function
    costMean, costSymmetric, costSparsity = compute_cost(
        prediction, predictionSymmetric, Xoutput, transField)
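    # total loss: data-fidelity term plus weighted symmetry and sparsity penalties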
    costAll = costMean + 0.01 * costSymmetric + 0.001 * costSparsity
    optmAll = tf.train.AdamOptimizer(
        learning_rate=learningRate).minimize(costAll)

    # set up
    init = tf.global_variables_initializer()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)
    sess = tf.Session(config=config)

    if restore is False:  # training
        sess.run(init)
        return sess, saver, Xinput, Xoutput, costAll, optmAll, Yinput, prediction
    else:  # reconstruction
        saver.restore(sess, '%s/%d.cpkt' % (modelDir, ncpkt))
        return sess, saver, Xinput, Xoutput, Yinput, prediction
def train(args):
    # Data loading
    data = DATA.LoadData(args.path, args.dataset)
    if args.verbose > 0:
        print("AFM: dataset=%s, factors=%s, attention=%d, freeze_fm=%d, #epoch=%d, batch=%d, lr=%.4f, lambda_attention=%.1e, keep=%s, optimizer=%s, batch_norm=%d, decay=%f, activation=%s"
              %(args.dataset, args.hidden_factor, args.attention, args.freeze_fm, args.epoch, args.batch_size, args.lr, args.lamda_attention, args.keep, args.optimizer, 
              args.batch_norm, args.decay, args.activation))
    activation_function = tf.nn.relu
    if args.activation == 'sigmoid':
        activation_function = tf.sigmoid
    elif args.activation == 'tanh':
        activation_function = tf.tanh
    elif args.activation == 'identity':
        activation_function = tf.identity
    
    save_file = make_save_file(args)
    # Training
    t1 = time()

    num_variable = data.truncate_features()
    if args.mla:
        args.freeze_fm = 1
    model = AFM(data.features_M, args.pretrain, save_file, args.attention, eval(args.hidden_factor), args.valid_dimen, 
        activation_function, num_variable, args.freeze_fm, args.epoch, args.batch_size, args.lr, args.lamda_attention, eval(args.keep), args.optimizer, 
        args.batch_norm, args.decay, args.verbose, args.mla)
    
    model.train(data.Train_data, data.Validation_data, data.Test_data)
    
    # Find the best validation result across iterations
    best_valid_score = 0
    best_valid_score = min(model.valid_rmse)
    best_epoch = model.valid_rmse.index(best_valid_score)
    print ("Best Iter(validation)= %d\t train = %.4f, valid = %.4f [%.1f s]" 
           %(best_epoch+1, model.train_rmse[best_epoch], model.valid_rmse[best_epoch], time()-t1))
Example #27
def main(students_file, rooms_file, out_format):
    loader = ld.LoadJSON()
    students = loader.load(filename=students_file)
    rooms = loader.load(filename=rooms_file)
    db = sql_functions.DBops()

    db.create_table()

    for query in sql_queries.INDEX_QUERY:
        db.select_query(query)

    db.insert_queries(rooms, students)

    db.commit()

    for select_num, query in enumerate(sql_queries.SELECT_QUERIES):
        result = db.select_query(query)
        try:
            if out_format.lower() == 'json':
                conversion_json = cd.JSONConversion()
                conversion_json.write(
                    result, 'select_' + query_name(select_num) + out_format)
            elif out_format.lower() == "xml":
                conversion_xml = cd.XMLConversion()
                conversion_xml.write(
                    result, 'select_' + query_name(select_num) + out_format)
            else:
                raise ex.FormatException('Please enter format json or xml')
        except ex.FormatException as fe:
            print(fe)
Example #28
def getPrecision(X_train, Y_train, X_val, Y_val):

    alg = RandomForestClassifier(n_estimators=50,
                                 min_samples_split=2,
                                 min_samples_leaf=1,
                                 oob_score=True)

    alg.fit(X_train, Y_train)

    # evaluate model accuracy with the out-of-bag (OOB) score
    print('model oob_score:', alg.oob_score_)

    Y_predict = alg.predict_proba(X_val)[:, 1]  # column 1 is the predicted probability of the positive class

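    # binarize the positive-class probabilities with a 0.44 decision threshold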
    Y_predict[Y_predict <= 0.44] = 0
    Y_predict[Y_predict > 0.44] = 1

    precision = np.count_nonzero(Y_predict == Y_val) / len(X_val)

    print('model precision:', precision)

    #     return precision
    # get the feature importances
    featureList = LoadData.getFeatureName('D://tmsc_data/nameListFile.txt')

    feature_importances = alg.feature_importances_
    # map each feature name to its importance score
    Dict = {}
    for (predictor, score) in zip(featureList, feature_importances):
        Dict[predictor] = score

    # sort the features by importance
    Dict = sorted(Dict.items(), key=lambda d: d[1], reverse=True)
def train(FLAGS):
    # Data loading
    import pickle as pk
    data = DATA.LoadData(FLAGS.path, FLAGS.dataset)

    if FLAGS.verbose > 0:
        print(
            "FM: dataset=%s, embedding_size=%d,#epoch=%d, batch=%d, lr=%.4f, lambda=%.1e, keep=%s, metric=%s, optimizer=%s, batch_norm=%d"
            % (FLAGS.dataset, FLAGS.embedding_size, FLAGS.epoch,
               FLAGS.batch_size, FLAGS.lr, FLAGS.lamda, FLAGS.keep,
               FLAGS.metric, FLAGS.optimizer, FLAGS.batch_norm))

    # Training
    t1 = time()
    model = FM(data.features_M, FLAGS.pretrain, make_save_file(FLAGS),
               FLAGS.embedding_size, FLAGS.valid_dimen, FLAGS.epoch,
               FLAGS.metric, FLAGS.batch_size, FLAGS.lr, FLAGS.lamda,
               FLAGS.keep, FLAGS.optimizer, FLAGS.batch_norm, FLAGS.verbose)
    model.train(data.Train_data, data.Validation_data, data.Test_data)

    # Find the best validation result across iterations
    best_valid_score = 0
    best_valid_score = min(model.valid_rmse)
    best_epoch = model.valid_rmse.index(best_valid_score)
    print("Best Iter(validation)= %d\t train = %.4f, valid = %.4f [%.1f s]" %
          (best_epoch + 1, model.train_rmse[best_epoch],
           model.valid_rmse[best_epoch], time() - t1))
Example #30
def prepData():

    # load up files from disk
    training_data, kaggle_data = LoadData.load_data()
    features_in = [
        'Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict',
        'Resolution', 'Address', 'X', 'Y'
    ]

    # break dates into month, day, year, day of week, hour
    # categorize category, month, day, year, dow, hour, district
    # scale lat (y), long(x)
    training_data['Year'] = (pd.DatetimeIndex(training_data['Dates']).year)
    training_data['Month'] = (pd.DatetimeIndex(training_data['Dates']).month)
    training_data['Day'] = (pd.DatetimeIndex(training_data['Dates']).day)
    training_data['Hour'] = (pd.DatetimeIndex(training_data['Dates']).hour)
    training_data['Minute'] = (pd.DatetimeIndex(training_data['Dates']).minute)

    # cast date as unix time
    training_data['UnixTime'] = (pd.DatetimeIndex(
        training_data['Dates'])).astype(np.int64) / 10000000000

    # day of week to number
    sorted_days = ('Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday',
                   'Friday', 'Saturday')

    def dayOfWeekNumber(d):
        return sorted_days.index(d)

    training_data['DayNumber'] = training_data['DayOfWeek'].apply(
        dayOfWeekNumber)

    # set up an id number for each category from alphabetical list
    # add to training_data
    categories = pd.unique(training_data['Category'])
    sorted_categories = (np.sort(categories)).tolist()

    def categoryNumber(category):
        return sorted_categories.index(category)

    training_data['CategoryNumber'] = training_data['Category'].apply(
        categoryNumber)

    districts = pd.unique(training_data['PdDistrict'])
    sorted_districts = (np.sort(districts)).tolist()

    def districtNumber(district):
        return sorted_districts.index(district)

    training_data['DistrictNumber'] = training_data['PdDistrict'].apply(
        districtNumber)

    # X is longitude, Y is latitude set ones outside city to median values
    training_data.loc[training_data.X > -122.0, 'X'] = training_data.X.median()
    training_data.loc[training_data.X < -123.0, 'X'] = training_data.X.median()
    training_data.loc[training_data.Y < 37.0, 'Y'] = training_data.Y.median()
    training_data.loc[training_data.Y > 38.0, 'Y'] = training_data.Y.median()

    return (training_data)
Example #31
def train(args):
    # Data loading
    data = DATA.LoadData(args.path, args.dataset)
    if args.verbose > 0:
        print(
            "FM: dataset=%s, factors=%d, #epoch=%d, batch=%d, lr=%.4f, lambda=%.1e, keep=%.2f, optimizer=%s, batch_norm=%d"
            %
            (args.dataset, args.hidden_factor, args.epoch, args.batch_size,
             args.lr, args.lamda, args.keep, args.optimizer, args.batch_norm))

    # Training
    t1 = time()
    model = FM(data.features_M, args.pretrain, make_save_file(args),
               args.hidden_factor, args.epoch, args.batch_size, args.lr,
               args.lamda, args.keep, args.optimizer, args.batch_norm,
               args.verbose, args.mla)
    model.train(data.Train_data, data.Validation_data, data.Test_data)

    # Find the best validation result across iterations
    best_valid_score = 0
    best_valid_score = min(model.valid_rmse)
    best_epoch = model.valid_rmse.index(best_valid_score)
    print(
        "Best Iter(validation)= %d\t train = %.4f, valid = %.4f, test = %.4f [%.1f s]"
        % (best_epoch + 1, model.train_rmse[best_epoch],
           model.valid_rmse[best_epoch], model.test_rmse[best_epoch],
           time() - t1))
Example #32
def w2v_ic(word, buckets=20):
    global ic_dict
    if not ic_dict:
        ic_dict = LoadData.load_ic()

    im = [0] * buckets
    cn = [0] * buckets

    if word in ic_dict:
        if ic_dict[word].IMAGEABILITY != None:
            im = bucket(ic_dict[word].IMAGEABILITY, 7., buckets)
        elif word in model:
            for w2 in model.most_similar(word, topn=20):
                if w2[0] in ic_dict and ic_dict[w2[0]].IMAGEABILITY != None:
                    im = bucket(ic_dict[w2[0]].IMAGEABILITY, 7., buckets)
                    break

        if ic_dict[word].CONCRETENESS != None:
            cn = bucket(ic_dict[word].CONCRETENESS, 5., buckets)
        elif word in model:
            for w2 in model.most_similar(word, topn=20):
                if w2[0] in ic_dict and ic_dict[w2[0]].CONCRETENESS != None:
                    cn = bucket(ic_dict[w2[0]].CONCRETENESS, 5., buckets)
                    break
    return cn + im
Example #33
    def ElaborateImagesAndMakePredition(self, inp_img):
        # crop a good percentage of the image in order to gain performances. found a good tradeoff with those values

        start_time = time.time()

        img_data_pipelined = FaceExtractionPipeline.SingletonPipeline(
        ).FaceExtractionPipelineImage(
            inp_img, math.ceil(np.shape(inp_img)[0] * 20 / 100),
            math.ceil(np.shape(inp_img)[0] * 40 / 100))

        if img_data_pipelined is not None:
            # plt.imshow(img_data_pipelined, 'gray')
            # plt.show()

            inp = LoadData.MergeImages(self.ref_img, img_data_pipelined)
            inp = np.expand_dims(inp, axis=0)

            #with self.graph.as_default():
            predicted_label = self.model.predict(inp)

            print(('same' if predicted_label[0, 1] > 0.975 else 'wrong') +
                  str(predicted_label))
            result = True, predicted_label[0, 1]

        else:
            result = False, 0

        print("--- %s seconds for a frame ---" % (time.time() - start_time))
        return result
Example #35
def test() : 
	import LoadData
	trainMat, classLabelVector = LoadData.loadTrainDataFromCSV(TRAIN_FILE)
	trainMat = array(trainMat)
	testMat = LoadData.loadTestDataFromCSV(TEST_FILE)

	k = 3

	# testMat = trainMat[0:50]
	# i = 0
	# for testData in testMat : 
	# 	label = classify_kNN(testData, trainMat[50:], classLabelVector[50:], k)
	# 	print "the real answer is ", classLabelVector[i], "the label is ", label
	# 	i += 1

	for testData in testMat : 
		label = classify_kNN(testData, trainMat, classLabelVector, k)
def theanoScatterPCA(path, dataset):
    if dataset == 'mnist':
        print('Loading Mnist Data')
        (imageData, imageLabels) = LoadData.loadMNISTUnSplit(path, shared=False)
        print(imageData.shape)
    elif dataset == 'cifar':
        print('Loading Cifar Data')
        (imageData, imageLabels) = LoadData.loadCIFAR10UnSplit(path, shared=False)
        imageData = imageData / 255.
    print('Loaded')
    
    print("Computing Scatter Plot")
    labelIds = dict()
    for idx in range(len(imageLabels)):
        if str(imageLabels[idx]) not in labelIds:
            labelIds[str(imageLabels[idx])] = []
        labelIds[str(imageLabels[idx])].append(idx)

    fig, plots = plt.subplots(10, 10)
    fig.set_size_inches(50, 50)
    plt.prism()
    for i, j in product(xrange(10), repeat=2):
        if i > j:
            continue

        idx = labelIds[str(i)] + labelIds[str(j)]
        print('\tCalculating PCA For Classes %d And %d' %(i,j))
        X_transformed = runPCA(data=imageData, elems=idx, components=2)
        Y_ = imageLabels[labelIds[str(i)] + labelIds[str(j)]]
        plots[i, j].scatter(X_transformed[:, 0], X_transformed[:, 1], c=Y_)
        plots[i, j].set_xticks(())
        plots[i, j].set_yticks(())
      
        plots[j, i].scatter(X_transformed[:, 0], X_transformed[:, 1], c=Y_)
        plots[j, i].set_xticks(())
        plots[j, i].set_yticks(())
        if i == 0:
            plots[i, j].set_title(j)
            plots[j, i].set_ylabel(j)
    plt.tight_layout()
    plt.savefig('scatter/' + dataset + ".png")
    print("Computing Scatter Plot Finished")
Example #37
def UpdateStocks(getSyms=False, getQuotes=False):
        global SYMBOLS
        
        if getSyms is True:
                LoadData.downloadSymbols()        
        if getQuotes is True:
                LoadData._downloadStocks('nasdaq')
                LoadData._downloadStocks('nyse')

        SYMBOLS = LoadData._getSymbols()
Example #38
def AS(sym=None):
        global SYMBOLS
        
        UpdateStocks()
        if len(SYMBOLS) < 1 and sym is None:
                print("Not enough stocks loaded.")
                return
        
        # If a single stock symbol has been passed, only test that symbol.
        if isinstance(sym, str):
                print("Will test: " + sym)
                tester = PatProcess.Test(LoadData.historicalData(sym))
                tester.parse()

        # Otherwise, test them all.
        else:
                for s in SYMBOLS[1:20]:
                        print("Will test:" + s[0])
Example #39
def ic(word, buckets=5):
    if type(word) == tuple:
        word = word[0]

    global ic_dict
    if not ic_dict:
        ic_dict = LoadData.load_ic()

    im = [0] * buckets
    cn = [0] * buckets

    if word in ic_dict:
        if ic_dict[word].IMAGEABILITY != None:
            im = bucket(ic_dict[word].IMAGEABILITY, 7., buckets)
        if ic_dict[word].CONCRETENESS != None:
            cn = bucket(ic_dict[word].CONCRETENESS, 5., buckets)

    return cn + im
def predict():
    parser = argparse.ArgumentParser(prog='Logistic Regression', conflict_handler='resolve',description = '''\
        This scripts predicts the classes according to a previously saved model of the provided dataset and saves it to the given output folder
        ''')
    parser.add_argument('-o', '--output', type=str, default="out", required=False, help='Path To The Output Folder')

    requiredNamed = parser.add_argument_group('Required Arguments')
    requiredNamed.add_argument('-m', '--model', type=str, required=True, help='The Previously Trained Model')
    requiredNamed.add_argument('-d', '--dataset', type=str, required=False, help='Path To The Dataset [MNIST]')

    parsed = parser.parse_args()
    if not os.path.exists(parsed.output):
        os.makedirs(parsed.output)

    params = loadParams(parsed.model)

    (train_images, train_labels), (validation_images, validation_labels), \
                (test_images, test_labels) = LoadData.loadMNIST(parsed.dataset)

    regressor = LogisticRegressor(input=test_images,labels=None, weights=params[0], bias=params[1])

    predict = theano.function(
        inputs=[],
        outputs=regressor.predictions
    )

    predictions = predict()
    hits = (predictions == test_labels.eval()).sum()
    accuracy = float(hits) / len(predictions)
    print('Num Predictions:\t%d' %(len(predictions)))
    print('Num Hits:\t\t%d' %(hits))
    print('Accuracy:\t\t%f' %(accuracy))
    out=''
    for idx in range(len(predictions)):
        out = out + str(predictions[idx]) + '\t'
        if (idx % 10) == 0:
            out = out[:-1] + '\n'
    with open(parsed.output + '/predictions.txt', 'w') as outf:
        outf.write(out)
    outf.close()
Example #41
def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
             dataset='mnist', batch_size=20, n_hidden=500):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
    gradient

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
    regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
    regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz


   """

    datasets = Ld.load_share(dataset)
    if dataset=="mnist":
       im_sz = [28,28]
       num_label = 10
       patience = 5000  # look as this many examples regardless
       patience_increase = 2  # wait this much longer when a new best is
                                      # found
       improvement_threshold = 0.995  # a relative improvement of this much is
                                      # considered significant
    elif dataset=="emotion":
        im_sz = [48,48]
        num_label = 10
        patience = 5000  
        patience_increase = 2 
        improvement_threshold = 0.995

    test_set_x, test_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    train_set_x, train_set_y = datasets[2]
    
    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels

    rng = numpy.random.RandomState(1234)

    # construct the MLP class
    classifier = MLP(rng=rng, input=x, n_in=im_sz[0] * im_sz[1],
                     n_hidden=n_hidden, n_out=num_label)

    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = classifier.negative_log_likelihood(y) \
         + L1_reg * classifier.L1 \
         + L2_reg * classifier.L2_sqr

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(inputs=[index],
            outputs=classifier.errors(y),
            givens={
                x: test_set_x[index * batch_size:(index + 1) * batch_size],
                y: test_set_y[index * batch_size:(index + 1) * batch_size]})

    validate_model = theano.function(inputs=[index],
            outputs=classifier.errors(y),
            givens={
                x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                y: valid_set_y[index * batch_size:(index + 1) * batch_size]})

    # compute the gradient of cost with respect to theta (stored in params)
    # the resulting gradients will be stored in a list gparams
    gparams = []
    for param in classifier.params:
        gparam = T.grad(cost, param)
        gparams.append(gparam)

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs
    updates = []
    # given two list the zip A = [a1, a2, a3, a4] and B = [b1, b2, b3, b4] of
    # same length, zip generates a list C of same size, where each element
    # is a pair formed from the two lists :
    #    C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    for param, gparam in zip(classifier.params, gparams):
        updates.append((param, param - learning_rate * gparam))

    # compiling a Theano function `train_model` that returns the cost, but
    # in the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(inputs=[index], outputs=cost,
            updates=updates,
            givens={
                x: train_set_x[index * batch_size:(index + 1) * batch_size],
                y: train_set_y[index * batch_size:(index + 1) * batch_size]})

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                     (epoch, minibatch_index + 1, n_train_batches,
                      this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                           improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i) for i
                                   in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                    done_looping = True
                    break

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def train_autoencoder():

    ## parses the provided parameters according to the command line input
    parser = argparse.ArgumentParser(prog='AutoEncoder', conflict_handler='resolve',description = '''\
        This script should enable the user to train his AutoEncoder according to the input parameters
        ''')
    parser.add_argument('-l', '--learningrate', type=float, default=0.025, required=False, help='The Learning Rate')
    parser.add_argument('-b', '--batchsize', type=int, default=20, required=False, help='Batch Size For Training')
    parser.add_argument('-h', '--reducedUnits', type=int, default=30,  required=False, help='Number of Reduced Layer Units')
    parser.add_argument('-o', '--output', type=str, default="out", required=False, help='Path To The Output Folder')
    parser.add_argument('-1', '--l1reg', type=float, default=0.1, required=False, help='Value For L1 Regularisaion')
    parser.add_argument('-k', '--kul_leib_penalty', type=float, default=0.04, required=False, help='Value For Kullback Leiber Divergence Penalty')
    parser.add_argument('-k', '--kul_leib_beta', type=float, default=1.0, required=False, help='Controls The Weight Of The Sparsity Penalty Term')
    parser.add_argument('-s', '--sparsity', type=str, default='l1reg', choices=['l1reg', 'kul_leib'], required=False, help='Choose Which Penalty Should Be Used')
    parser.add_argument('-e', '--epochs', type=int, default=500, required=False, help='Number Of Epochs')
    parser.add_argument('-m', '--momentum', type=float, default=0.9, required=False, help='The Momentum Rate')


    requiredNamed = parser.add_argument_group('Required Arguments')
    requiredNamed.add_argument('-d', '--dataset', type=str, required=True, help='Path To The Training Set (MNIST)')
   
    parsed = parser.parse_args()

    if parsed.sparsity == 'kul_leib':
        assert parsed.kul_leib_penalty < 0.05
        outpath_raw = parsed.output + "/kul_leib"
    else:
        outpath_raw = parsed.output + "/l1reg"

    if not os.path.exists(outpath_raw):
        os.makedirs(outpath_raw)

    (train_images, train_labels), (validation_images, validation_labels), \
         (test_images, test_labels) = LoadData.loadMNIST(parsed.dataset)#, shuffle=True)

    number_train_images_batches = train_images.get_value(borrow=True).shape[0] // parsed.batchsize
    number_test_images_batches = test_images.get_value(borrow=True).shape[0] // parsed.batchsize
    number_validation_images_batches = validation_images.get_value(borrow=True).shape[0] // parsed.batchsize

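    # symbolic minibatch index and image matrix for the Theano computation graph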
    index = T.lscalar() 
    imageData = T.matrix('imageData')

    rng = np.random.RandomState(1234)##numpy random range generator

    autoencoder = AutoEncoder(
        input=imageData,
        rng=rng,
        n_input=28*28, ##image 28x28
        n_reduced=parsed.reducedUnits,
        sparsity_param=parsed.kul_leib_penalty,
        beta=parsed.kul_leib_beta,
        n_reconstructed=28*28
    )

    if parsed.sparsity == 'l1reg':
        cost_sparse = (
            autoencoder.cost
            + parsed.l1reg * abs(autoencoder.reducedLayer.weights).sum()
        )
    else:
        cost_sparse = (
            autoencoder.cost + autoencoder.kul_leib
        )



    updates = (
        gradient_updates_momentum(cost_sparse, autoencoder.params, parsed.learningrate, parsed.momentum)
    )


    trainBatchGivenIndex = theano.function(
        inputs=[index],
        outputs= cost_sparse,
        updates= updates,
        givens={
            imageData: train_images[index * parsed.batchsize: (index + 1) * parsed.batchsize]
        }
    )

    validateBatchGivenIndex = theano.function(
        inputs=[index],
        outputs= cost_sparse,
        givens={
            imageData: validation_images[index * parsed.batchsize: (index + 1) * parsed.batchsize]
        }
    )

    patience = 5000
    patience_increase = 2
    improvement_threshold = 0.995 
    best_validation_loss = np.inf
    best_validation_epoch = 0

    val_freq = min(number_train_images_batches, patience // 2)
    epoch = 0

    # initialise the bookkeeping used in the summary print after the loop
    lowest_cost = np.inf
    best_minibatch = -1
    best_epoch = -1
    encoder_name = None
    if parsed.sparsity == 'l1reg':
        encoder_name = 'encoder_' + str(parsed.l1reg) + '_l1'
    else:
        encoder_name = 'encoder_' + str(parsed.kul_leib_beta) + '_kul_leib'
    
    done_looping = False
    while (epoch < parsed.epochs) and not (done_looping):
        epoch = epoch + 1
        for minibatch_index in range(number_train_images_batches):
            minibatch_squared_error_loss = trainBatchGivenIndex(minibatch_index)
            idx = (epoch - 1) * number_train_images_batches + minibatch_index

            if (idx + 1) % val_freq == 0:
                validation_losses = [validateBatchGivenIndex(currentValidationBatch)
                                     for currentValidationBatch in range(number_validation_images_batches)]
                this_validation_loss = np.mean(validation_losses)
                print("Epoch %d, Batch Index: %d / %d, Accuracy On Validation Samples: %f" \
                    % (epoch, minibatch_index,  number_train_images_batches, this_validation_loss))       
                
                if this_validation_loss < best_validation_loss:
                    if this_validation_loss < best_validation_loss *  \
                           improvement_threshold:
                        patience = max(patience, idx * patience_increase)
                        best_validation_epoch = epoch

                    autoencoder.save(outpath_raw, encoder_name)
                    lowest_cost = this_validation_loss
                    best_validation_loss = this_validation_loss
                    best_epoch = epoch
                    best_minibatch = minibatch_index
            if patience <= idx:
                done_looping = True
                break

    print('Saved Model With Respect To Epoch %d , Minibatch %d And Cost Of %f' % \
           (best_epoch, best_minibatch, lowest_cost))

    reconstruct_images = theano.function(
        inputs=[],
        outputs=autoencoder.reconstruction,
        givens={
            imageData: test_images[:100]
        }
    )

    reconstructed_images = reconstruct_images()
    reconstructed_images = reconstructed_images.reshape(100, 28, 28)# * 255


    outpath = None
    if parsed.sparsity == 'l1reg':
        outpath = outpath_raw + '/reconstruct_' + str(parsed.l1reg) + '_l1.png'
    else:
        outpath = outpath_raw + '/reconstruct_' + str(parsed.kul_leib_beta) + '_kul_leib.png'

    arraysToImgs(rows=10,colums=10,arr=reconstructed_images,path=outpath,out_shape=(28,28))
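
# --- Hedged sketches of two helpers assumed by train_autoencoder above ---
# Neither definition appears in this excerpt; these are minimal guesses at
# what they might look like, not the original implementations.
import theano
import theano.tensor as T

def gradient_updates_momentum(cost, params, learning_rate, momentum):
    # Classical momentum: keep one shared "velocity" per parameter,
    # blend in the new gradient, and step the parameter along it.
    assert 0. <= momentum < 1.
    updates = []
    for param in params:
        velocity = theano.shared(param.get_value() * 0.,
                                 broadcastable=param.broadcastable)
        updates.append((param, param - learning_rate * velocity))
        updates.append((velocity,
                        momentum * velocity + (1. - momentum) * T.grad(cost, param)))
    return updates

def kl_sparsity_penalty(rho, rho_hat, beta):
    # One plausible form of the Kullback-Leibler sparsity term used as
    # autoencoder.kul_leib: rho is the target activation (kul_leib_penalty)
    # and rho_hat the mean activation of the reduced layer over the batch.
    kl = rho * T.log(rho / rho_hat) + (1. - rho) * T.log((1. - rho) / (1. - rho_hat))
    return beta * T.sum(kl)
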
def prepData():
    
    # load up files from disk
    training_data, kaggle_data = LoadData.load_data()    
    features_in = ['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']
    

    # break dates into month, day, year, day of week, hour 
    # categorize category, month, day, year, dow, hour, district
    # scale lat (y), long(x)
    training_data['Year'] = (pd.DatetimeIndex(training_data['Dates']).year - 2000) 
    training_data['Month'] = (pd.DatetimeIndex(training_data['Dates']).month) 
    training_data['Day'] = (pd.DatetimeIndex(training_data['Dates']).day) 
    training_data['Hour'] = (pd.DatetimeIndex(training_data['Dates']).hour) 
    training_data['Minute'] = (pd.DatetimeIndex(training_data['Dates']).minute) 

    kaggle_data['Year'] = (pd.DatetimeIndex(kaggle_data['Dates']).year - 2000) 
    kaggle_data['Month'] = (pd.DatetimeIndex(kaggle_data['Dates']).month) 
    kaggle_data['Day'] = (pd.DatetimeIndex(kaggle_data['Dates']).day) 
    kaggle_data['Hour'] = (pd.DatetimeIndex(kaggle_data['Dates']).hour)
    kaggle_data['Minute'] = (pd.DatetimeIndex(kaggle_data['Dates']).minute) 
    
    

    # cast date as unix time
    training_data['UnixTime'] = (pd.DatetimeIndex(training_data['Dates'])).astype(np.int64) / 10000000000
    kaggle_data['UnixTime'] = (pd.DatetimeIndex(kaggle_data['Dates'])).astype(np.int64) / 10000000000

   
    # day of week to number
    sorted_days = ('Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday')
    def dayOfWeekNumber(d):
        return sorted_days.index(d)
    training_data['DayNumber'] = (training_data['DayOfWeek'].apply(dayOfWeekNumber)) 
    kaggle_data['DayNumber'] = (kaggle_data['DayOfWeek'].apply(dayOfWeekNumber)) 
    
    
    # set up an id number for each category from alphabetical list
    # add to training_data
    categories = pd.unique(training_data['Category'])
    sorted_categories = (np.sort(categories)).tolist()

    def categoryNumber(category):
        return sorted_categories.index(category)
    training_data['CategoryNumber'] = training_data['Category'].apply(categoryNumber)
    
    # no categories for validation data, that's what we're trying to figure out
    # add output array for validation set just for convience 
    kaggle_data['CategoryNumber'] = 0
    
    
    # scale lat and long
    def scaleLat(lat):
        return lat - 37.0
    training_data['ScaledLatitude'] = training_data['Y'].apply(scaleLat) 
    kaggle_data['ScaledLatitude'] = kaggle_data['Y'].apply(scaleLat)
    
    def scaleLong(long):
        return long + 122.0
    training_data['ScaledLongitude'] = training_data['X'].apply(scaleLong)
    kaggle_data['ScaledLongitude'] = kaggle_data['X'].apply(scaleLong)
    
    
    districts = pd.unique(training_data['PdDistrict'])
    sorted_districts = (np.sort(districts)).tolist()
    
    def districtNumber(district):
        return sorted_districts.index(district)
    training_data['DistrictNumber'] = (training_data['PdDistrict'].apply(districtNumber)) / 9.
    kaggle_data['DistrictNumber'] = (kaggle_data['PdDistrict'].apply(districtNumber)) / 9.
    

  

    # split inputs from outputs
    features = ['Year', 'Month', 'Day', 'Hour', 'DayNumber', 'DistrictNumber', 'ScaledLatitude', 'ScaledLongitude']
    training_x = training_data[features]
    training_y = training_data['CategoryNumber']
    
    kaggle_x = kaggle_data[features]
    kaggle_y = kaggle_data['CategoryNumber']
    

    # create a testing and validation set from the training_data
    x_train, x_split, y_train, y_split = cross_validation.train_test_split(training_x, training_y, test_size=0.2)
    x_test, x_validate, y_test, y_validate = cross_validation.train_test_split(x_split, y_split, test_size=0.5)
    
    
    # convert from dataframe to arrays of arrays
    train_x = x_train.as_matrix()
    test_x = x_test.as_matrix()
    validate_x = x_validate.as_matrix()
    x_kaggle = kaggle_x.as_matrix()
    
    y_train = y_train.as_matrix()
    y_test = y_test.as_matrix()
    y_validate = y_validate.as_matrix()
    kaggle_y = kaggle_y.as_matrix()
    
    
    # package them up
    training_set = (train_x, y_train)
    validation_set = (validate_x, y_validate)
    test_set = (test_x, y_test)
    kaggle_set = (x_kaggle, kaggle_y) 
    
    print (training_x.head())
    print(len(kaggle_y))
    
   
    return training_set, validation_set, test_set, kaggle_set
def test_cnn(dataset_matrix_r, label_vector_r, learning_rate=0.1, n_epochs=120, nkerns=[30, 90], batch_size=250):

    # Load dataset
    datasets = LoadData.load_data_multi(dataset_matrix_r, label_vector_r)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    # Construct the model
    print "... building the model"

    index = T.lscalar()
    x = T.matrix("x")
    y = T.ivector("y")

    rng = np.random.RandomState(1234)

    layer0_input = x.reshape((batch_size, 5, 5, 10))
    layer0_input = layer0_input.dimshuffle(0, 3, 1, 2)

    layer0 = ConvPoolLayer.ConvPoolLayer(
        rng, layer0_input, filter_shape=(nkerns[0], 10, 3, 3), image_shape=(batch_size, 10, 5, 5)
    )

    layer1 = ConvPoolLayer.ConvPoolLayer(
        rng, layer0.output, filter_shape=(nkerns[1], nkerns[0], 3, 3), image_shape=(batch_size, nkerns[0], 3, 3)
    )

    layer3 = MultiLayerPerceptron.HiddenLayer(rng, layer1.output.flatten(2), nkerns[1], 120, activation=T.tanh)

    layer5 = LogisticLayer.LogisticLayer(layer3.output, 120, 9)

    cost = layer5.negative_log_likelihood(y)

    # Function to train the model
    params = layer5.params + layer3.params + layer1.params + layer0.params
    gparams = T.grad(cost, params)
    updates = [(param, param - learning_rate * gparam) for param, gparam in zip(params, gparams)]
    train_model = theano.function(
        inputs=[index],
        outputs=[cost],
        updates=updates,
        givens={
            x: train_set_x[index * batch_size : (index + 1) * batch_size],
            y: train_set_y[index * batch_size : (index + 1) * batch_size],
        },
    )

    # Functions to test and validate the model
    valid_model = theano.function(
        inputs=[index],
        outputs=[layer5.errors(y)],
        givens={
            x: valid_set_x[index * batch_size : (index + 1) * batch_size],
            y: valid_set_y[index * batch_size : (index + 1) * batch_size],
        },
    )

    test_model = theano.function(
        inputs=[index],
        outputs=[layer5.errors(y)],
        givens={
            x: test_set_x[index * batch_size : (index + 1) * batch_size],
            y: test_set_y[index * batch_size : (index + 1) * batch_size],
        },
    )

    print "... training the model"
    patience = 10000
    patience_increase = 2
    improvement_threshold = 0.995
    validation_frequency = min(n_train_batches, patience / 2)

    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.0
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print "training @ iter = ", iter

            train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                validation_losses = [valid_model(i) for i in xrange(n_valid_batches)]
                this_validation_loss = np.mean(validation_losses)
                print (
                    "epoch %i, minibatch %i/%i, validation error %f %%"
                    % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.0)
                )

                if this_validation_loss < best_validation_loss:

                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
                    test_score = np.mean(test_losses)
                    print (
                        ("     epoch %i, minibatch %i/%i, test error of " "best model %f %%")
                        % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.0)
                    )

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print ("Optimization complete.")
    print (
        "Best validation score of %f %% obtained at iteration %i, "
        "with test performance %f %%" % (best_validation_loss * 100.0, best_iter + 1, test_score * 100.0)
    )
    print >> sys.stderr, (
        "The code for file " + os.path.split(__file__)[1] + " ran for %.2fm" % ((end_time - start_time) / 60.0)
    )

    return params


if __name__ == "__main__":
    dataset_matrix, label_vector, dataset_matrix_r, label_vector_r = LoadData.preprocess_data()
    params = test_cnn(dataset_matrix_r, label_vector_r)
def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
             dataset='mnist.pkl.gz', batch_size=20, n_hidden=500):
    
    # Load dataset
    datasets = LoadData.load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    # Construct the model
    print '... building the model'

    index = T.lscalar()  
    x = T.matrix('x')  
    y = T.ivector('y')  

    rng = np.random.RandomState(1234)
    
    classifier = MultiLayerPerceptron.MLP(rng, x, 28*28, n_hidden, 10)
    cost = classifier.negative_log_likelihood(y) + L1_reg*classifier.L1 + L2_reg*classifier.L2
    
    # Function to train the model
    gparams = [T.grad(cost, param) for param in classifier.params]
    updates = [(param, param - learning_rate*gparam) for param, gparam in zip(classifier.params, gparams)]
    train_model = theano.function(inputs=[index],
                                  outputs=[cost],
                                  updates=updates,
                                  givens={x:train_set_x[index*batch_size: (index+1) * batch_size],
                                          y:train_set_y[index*batch_size: (index+1) * batch_size]})
                                          
    # Functions to test and validate the model
    valid_model = theano.function(inputs=[index],
                                  outputs=[classifier.errors(y)],
                                  givens={x:valid_set_x[index * batch_size: (index+1) * batch_size],
                                          y:valid_set_y[index * batch_size: (index+1) * batch_size]})
                                          
    test_model = theano.function(inputs=[index],
                                 outputs=[classifier.errors(y)],
                                 givens={x:test_set_x[index * batch_size: (index+1) * batch_size],
                                         y:test_set_y[index * batch_size: (index+1) * batch_size]})
                                         
    # Train the model
    print 'Training the model ...'
    
    patience = 10000  
    patience_increase = 2  
    improvement_threshold = 0.995  
    validation_frequency = min(n_train_batches, patience / 2)
                                
    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            train_model(minibatch_index)
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                validation_losses = [valid_model(i) for i in xrange(n_valid_batches)]
                this_validation_loss = np.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch,
                                                                             minibatch_index + 1,
                                                                             n_train_batches,
                                                                             this_validation_loss * 100.) )

                if this_validation_loss < best_validation_loss:
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
                    test_score = np.mean(test_losses)

                    print(('     epoch %i, minibatch %i/%i, test error of best model %f %%') % (epoch, 
                                                                                                minibatch_index + 1, 
                                                                                                n_train_batches,
                                                                                                test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Exemple #47
0
__author__ = 'computer'
import sys
import numpy as np
import SGD
import LoadData

dataDic, labelDic = LoadData.loadData("horseColicTraining.txt")
w = SGD.gd(dataDic, labelDic, alpha = 0.001, epochs = 1000)
#w = SGD.sgd(dataDic, labelDic, alpha = 0.001, epochs = 500)
dataDic, labelDic = LoadData.loadData("horseColicTest.txt")
h = np.mat(dataDic).dot(w)
cnt = 0
for i in range(len(labelDic)):
    if h[i] >= 0.5 and labelDic[i] >= 0.5:
        cnt += 1
    elif h[i] < 0.5 and labelDic[i] <= 0.5 :
        cnt += 1
print(float(cnt) / len(labelDic))
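
# A hedged alternative scoring pass (not from the original): if SGD.gd
# returns weights for a logistic model, the linear scores are usually
# squashed through a sigmoid before thresholding at 0.5.
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

scores = sigmoid(np.asarray(np.mat(dataDic).dot(w)).ravel())
predictions = (scores >= 0.5).astype(int)
labels = np.asarray(labelDic, dtype=float)
print(float(np.sum(predictions == (labels >= 0.5))) / len(labelDic))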
# Kaggle MNIST submissions
# started with .... 
# http://neuralnetworksanddeeplearning.com/chap1.html source code from "Neural Networks and Deep Learning" Nielsen

##################################################################################
##################################################################################
# to do
# improve network
# maybe better stats and some graphical output




# read in the data files and format as needed
import LoadData
training_data, validation_data, test_data, kaggle_data = LoadData.load_data_wrapper()



########## multi layer network ######################################################
#~ 96.8% accurate first pass
import MultiLayer

# create the network
net = MultiLayer.Network([784, 120, 60, 10])  # layer sizes ( input, hidden, output )

epochs = 30         # number of passes through full data set
batch_size = 5     # size of batches, network updated once per batch
alpha = 1.2         # learning step
lmbda =  0.00005        # regularization 
net.sgd(training_data, epochs, batch_size, alpha, lmbda, test_data=test_data) # train epochs, batch size, alpha
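
# Hypothetical follow-up for the "Kaggle MNIST submissions" note above:
# write a submission CSV, assuming MultiLayer.Network keeps Nielsen's
# feedforward(x) method and kaggle_data is a list of 784x1 input vectors
# in Kaggle row order (both of these are assumptions, not shown above).
import csv
import numpy as np

def write_submission(network, inputs, path='submission.csv'):
    with open(path, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['ImageId', 'Label'])
        for image_id, x in enumerate(inputs, start=1):
            writer.writerow([image_id, int(np.argmax(network.feedforward(x)))])

# write_submission(net, kaggle_data)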
Exemple #49
0
"""
Created on Dec 5, 2015

@author: Joe
"""
import LoadData as ld
import numpy as np
import pandas as pd
import HFModel as hf
from sklearn.cross_validation import train_test_split
from sklearn import metrics

if __name__ == "__main__":
    # H3_Test = ld.loadData('data/H3/Testing_01_21_1358755201.mat')
    H3 = ld.loadData("data/H3/Tagged_Training_07_30_1343631601.mat")
    hf.dataPrep(H3.HF, np.array(H3.tagInfo))

    X = H3.HF.drop(["Timestamp", "Back Porch Lights"], axis=1)
    Y = H3.HF["Back Porch Lights"]

    # Set randomness so that we all get the same answer
    np.random.seed(841)

    # Split the data into train and test pieces for both X and Y
    X_train, X_test, Y_train, Y_test = train_test_split(X.head(2000), Y.head(2000), train_size=0.80)
    model = hf.HFModel(X_train, Y_train)
    print "Accuracy on test = %.3f" % metrics.accuracy_score(model.predict(X_test), Y_test)

    # print(H3.L1.head(5))
    # print(ld.getApplianceData(H3.HF, H3.tagInfo).head(1))
# -*- coding: utf-8 -*-
import LoadData


dataset_matrix, label_vector, dataset_matrix_r, label_vector_r = LoadData.preprocess_data()

datasets = LoadData.load_data_multi(dataset_matrix_r, label_vector_r)


def trainRegressor():
    parser = argparse.ArgumentParser(prog='Logistic Regression', conflict_handler='resolve',description = '''\
        This script should enable the user to train his Logistic Regression Model according to the input parameters
        ''')
    parser.add_argument('-l', '--learningrate', type=float, default=0.01, required=False, help='The Learning Rate')
    parser.add_argument('-b', '--batchsize', type=int, default=20, required=False, help='The Batch Size')
    parser.add_argument('-o', '--output', type=str, default="out", required=False, help='Path To The Output Folder')
    parser.add_argument('-e', '--epochs', type=int, default=200, required=False, help='Maximum Number Of Epochs')
    parser.add_argument('-p', '--plot', action='store_true', required=False, help='Set This Flag In Order To Plot Error Curves')


    requiredNamed = parser.add_argument_group('Required Arguments')
    requiredNamed.add_argument('-d', '--dataset', type=str, required=True, help='Path To The Training Set')
   
    parsed = parser.parse_args()

    if not os.path.exists(parsed.output):
        os.makedirs(parsed.output)

    (train_images, train_labels), (validation_images, validation_labels), \
         (test_images, test_labels) = LoadData.loadMNIST(parsed.dataset)

    number_train_images_batches = train_images.get_value(borrow=True).shape[0] // parsed.batchsize
    number_validation_images_batches = validation_images.get_value(borrow=True).shape[0] // parsed.batchsize
    number_test_images_batches = test_images.get_value(borrow=True).shape[0] // parsed.batchsize

    index = T.lscalar() 
    imageData = T.matrix('imageData')
    imageLabels = T.ivector('imageLabels')

    regressor = LogisticRegressor(input=imageData, labels=imageLabels, n_in=28 * 28, n_out= 10)

    trainBatchGivenIndex = theano.function(
        inputs=[index],
        outputs= [regressor.cost],
        updates= [(regressor.weights, regressor.weights - parsed.learningrate * T.grad(cost=regressor.cost, wrt=regressor.weights)),
               (regressor.bias, regressor.bias - parsed.learningrate * T.grad(cost=regressor.cost, wrt=regressor.bias))],
        givens={
            imageData: train_images[index * parsed.batchsize: (index + 1) * parsed.batchsize],
            imageLabels: train_labels[index * parsed.batchsize: (index + 1) * parsed.batchsize]
        }
    )

    trainAccuracyGivenIndex = theano.function(
        inputs=[index],
        outputs=regressor.missclassified,
        givens={
            imageData: train_images[index * parsed.batchsize: (index + 1) * parsed.batchsize],
            imageLabels: train_labels[index * parsed.batchsize: (index + 1) * parsed.batchsize]
        }
    )

    validationAccuracyGivenIndex = theano.function(
        inputs=[index],
        outputs=regressor.missclassified,
        givens={
            imageData: validation_images[index * parsed.batchsize: (index + 1) * parsed.batchsize],
            imageLabels: validation_labels[index * parsed.batchsize: (index + 1) * parsed.batchsize]
        }
    )

    testAccuracyGivenIndex = theano.function(
        inputs=[index],
        outputs=regressor.missclassified,
        givens={
            imageData: test_images[index * parsed.batchsize: (index + 1) * parsed.batchsize],
            imageLabels: test_labels[index * parsed.batchsize: (index + 1) * parsed.batchsize]
        }
    )

    patience = 5000
    patience_increase = 2
    improvement_threshold = 0.995 
    best_validation_loss = np.inf
    best_validation_epoch = 0
    best_testing_loss = np.inf
    best_testing_epoch = 0
    test_score = 0.
 
    if parsed.plot:
        trainRes = [[],[]]
        valRes = [[],[]]
        testRes = [[],[]]

    done_looping = False
    val_freq = min(number_train_images_batches, patience // 2)
    epoch = 0

    while epoch < parsed.epochs and (not done_looping):
        epoch = epoch + 1

        for minibatch_index in range(number_train_images_batches):

            minibatch_avg_cost = trainBatchGivenIndex(minibatch_index)
            idx = (epoch - 1) * number_train_images_batches + minibatch_index

            if (idx + 1) % val_freq == 0:
                validation_losses = [validationAccuracyGivenIndex(currentValidationBatch)
                                     for currentValidationBatch in range(number_validation_images_batches)]
                this_validation_loss = np.mean(validation_losses)
                print("Epoch %d, Batch Index: %d / %d, Accuracy On Validation Samples: %f" \
                    % (epoch, minibatch_index,  number_train_images_batches, (100 - this_validation_loss * 100)))       

                if this_validation_loss < best_validation_loss:
                    if this_validation_loss < best_validation_loss *  \
                           improvement_threshold:
                        patience = max(patience, idx * patience_increase)
                        best_validation_epoch = epoch

                    best_validation_loss = this_validation_loss
                    test_losses = [testAccuracyGivenIndex(currentTestBatch)
                                   for currentTestBatch in range(number_test_images_batches)]
                    test_score = np.mean(test_losses)
                    print("\tEpoch %d, Batch Index: %d / %d, Accuracy On Test Samples: %f" \
                        % (epoch, minibatch_index,  number_train_images_batches, (100 - test_score * 100)))
                    if test_score < best_testing_loss:
                        print('\t\tNew Best Test Score\n\t\tSaving Network')
                        best_testing_loss = test_score
                        best_testing_epoch = epoch
                        regressor.saveRegressor(parsed.output)

            if patience <= idx:
                done_looping = True
                break

        if parsed.plot:
            print("Collecting Accuracy After Epoch %d" % (epoch))
            trainRes[1].append(np.mean([trainAccuracyGivenIndex(currentTrainBatch) \
                     for currentTrainBatch in range(number_train_images_batches)]) *100)
            valRes[1].append(np.mean([validationAccuracyGivenIndex(currentValidationBatch) \
                     for currentValidationBatch in range(number_validation_images_batches)]) *100)
            testRes[1].append(np.mean([testAccuracyGivenIndex(currentTestBatch) \
                     for currentTestBatch in range(number_test_images_batches)]) *100)
            trainRes[0].append(epoch)
            valRes[0].append(epoch)
            testRes[0].append(epoch)
   

    print('Optimization complete with best test score of %f %%,'
             % (100 - best_testing_loss * 100.))
    if parsed.plot:
        plotError(trainRes, valRes, testRes, parsed.output, 'error.png')
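
# plotError is called above but not defined in this excerpt; a minimal
# sketch under the assumption that each *Res argument is a pair of lists
# [[epochs], [error percentages]]:
import os
import matplotlib.pyplot as plt

def plotError(trainRes, valRes, testRes, outdir, filename):
    plt.figure()
    plt.plot(trainRes[0], trainRes[1], label='train')
    plt.plot(valRes[0], valRes[1], label='validation')
    plt.plot(testRes[0], testRes[1], label='test')
    plt.xlabel('epoch')
    plt.ylabel('error (%)')
    plt.legend()
    plt.savefig(os.path.join(outdir, filename))
    plt.close()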
import numpy as np
import pandas as pd
import pickle

from datetime import datetime
from sklearn import cross_validation
from sklearn.cluster import Birch, KMeans



############################################################################
# read in csv files
import LoadData

# load up files from disk
training_data, kaggle_data = LoadData.load_data()    

############################################################################
"""
# cluster locations
def clusterLocations():

    t_location = training_data[['X', 'Y']]
    k_location = kaggle_data[['X', 'Y']]
    
    clf = KMeans(n_clusters=640)    # 23,104 unique addresses in training set
    clf.fit(t_location)
   
    
    training_data['Location'] = clf.predict(t_location)
    kaggle_data['Location'] = clf.predict(k_location)
Exemple #53
0
def main():
    
    #---------------------------PUT THE SEGMENT YOU WANT TO START PREDICTING FROM----------------------
    CURRENT_SEGMENT=1
    inning=1
    
    #-----------------------------------------------------
    
    train_data=LoadData.getTraindata(inning)
    test_data=LoadData.getTestData(CURRENT_SEGMENT, inning)
    
    HRMods= HRModel.getHRModels(train_data)
    NHRMods= NHRModel.getNHRModels(train_data)
    
    
    #Start predicting for each segment
    fileFolder="../Predict EOI/Results/Final PPT/Inn"+str(inning)+"-StartSeg"+str(CURRENT_SEGMENT)+"-HR_NHR-"
    maeFile=fileFolder+"MAE.csv"
    fwriter=open(maeFile,"w")
    fwriter.write("Segment,HR MAE in Segment,NHR MAE in Segment,HR MAE till Segment,NHR MAE till Segment,Total MAE")
    fwriter.write("\n")
    
    df = pd.DataFrame(0, index=np.arange(len(test_data)), columns=['Predicted Total HR till Segment'])
    hr_runs = gl.SFrame(data=df)
    df = pd.DataFrame(0, index=np.arange(len(test_data)), columns=['Predicted Total NHR till Segment'])
    nhr_runs = gl.SFrame(data=df)
    
    test_data['Predicted Total HR till Segment']=hr_runs['Predicted Total HR till Segment']
    test_data['Predicted Total NHR till Segment']=nhr_runs['Predicted Total NHR till Segment']
        
    while CURRENT_SEGMENT<=10:
        print "--------------Segment "+ str(CURRENT_SEGMENT)+" Started-----------------"  
        predict_HR= HRModel.getPredictedHomeRun(HRMods, train_data, CURRENT_SEGMENT, test_data)
        predict_NHR= NHRModel.getPredictedNHR(NHRMods, CURRENT_SEGMENT, test_data)
        
       
        
        test_data=updateHRFeatures(test_data, predict_HR)
        test_data=updateNHRFeatures(test_data, predict_NHR)
        
        #write to FILE
        filename=fileFolder+str(CURRENT_SEGMENT)+".csv"
        
        keylist=['Team','Total Matches Played','Runs Scored','Wickets Lost','Got All Out','Runs Conceded','Opponent Wickets Taken','Opponent All Out','Match Index','Batsman','Player Total Runs','Player Home Runs','Player Non Home Runs','Balls Faced','R0','W0','R1','W1','Current_HR','Current_NHR','Target','Final Runs Made','Extras','Home','Segment','Home Run Hitting Ability','Milestone Reaching Ability','Batting Average','Strike Rate','Matches Played','ClusterID','Predicted Total HR till Segment','Predicted Total NHR till Segment','Actual HR in Segment','Predicted HR in Segment','Actual NHR in Segment','Predicted NHR in Segment','Predicted Total runs till Segment']
        test_data=test_data.select_columns(keylist)
        test_data.save(filename, format='csv')
        
        
        mae_string=getStatistics(train_data, test_data, CURRENT_SEGMENT)
        
        print ""
        print "MAE = ", mae_string
        fwriter.write(str(CURRENT_SEGMENT)+","+mae_string)
        fwriter.write("\n")
        
        
        CURRENT_SEGMENT=CURRENT_SEGMENT+1
    
    print ""    
    print "-----------Prediction Done!---------------"   
    return 
#!/usr/bin/env python


import numpy
import pandas as pd

import matplotlib.pyplot as plt
import random

import LoadData
    
training_data, validation_data = LoadData.load_data()    
    
print("Training examples", len(training_data))
print("Validation examples", len(validation_data))   
    
print(training_data.columns)  
features = ['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']


###################################################################################################################
# map data
####################################################################################################################
# http://stackoverflow.com/questions/14329691/covert-latitude-longitude-point-to-a-pixels-x-y-on-mercator-projection
def mercator_projection(latitude, longitude):
    scale = 100.0
   
    x = (longitude + 180.0) * (scale / 360.0)
    latitude_radians = latitude * numpy.pi/180.0
   
    y3 = numpy.log(numpy.tan(numpy.pi/4.0 + latitude_radians/2.0))
    # finish the projection using the Stack Overflow formula cited above
    y = (scale / 2.0) - (scale * y3 / (2.0 * numpy.pi))
    return x, y
def predict_cnn(nkerns=[20, 40, 60], batch_size=200): 
    
    # Load dataset
    datasets = LoadData.load_predict('VisionFeatures/dct12')

    #train_set_x, train_set_y = datasets[0]
    #valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    #n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    #n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size
    
    weights = sio.loadmat('weights20')
    layer0_W = weights['layer0_W']
    layer0_b = weights['layer0_b']
    layer0_b = np.reshape(layer0_b, (layer0_b.shape[1],))
    layer1_W = weights['layer1_W']
    layer1_b = weights['layer1_b']
    layer1_b = np.reshape(layer1_b, (layer1_b.shape[1],))
    layer2_W = weights['layer2_W']
    layer2_b = weights['layer2_b']
    layer2_b = np.reshape(layer2_b, (layer2_b.shape[1],))
    layer3_W = weights['layer3_W']
    layer3_b = weights['layer3_b']
    layer3_b = np.reshape(layer3_b, (layer3_b.shape[1],))
    layer5_W = weights['layer5_W']
    layer5_b = weights['layer5_b']
    layer5_b = np.reshape(layer5_b, (layer5_b.shape[1],))


    # Construct the model
    print '... building the model'

    index = T.lscalar()  
    x = T.matrix('x')  
    y = T.ivector('y')  

    rng = np.random.RandomState(1234)
    
    layer0_input = x.reshape((batch_size, 72, 88, 1))
    layer0_input = layer0_input.dimshuffle(0, 3, 1, 2)

    layer0 = ConvPoolLayer.ConvPoolLayer(rng, 
                                         layer0_input, 
                                         filter_shape=(nkerns[0], 1, 9, 9),
                                         image_shape=(batch_size, 1, 72, 88),
                                         W=layer0_W,
                                         b=layer0_b)
                                        
    layer1 = ConvPoolLayer.ConvPoolLayer(rng,
                                         layer0.output,
                                         filter_shape=(nkerns[1], nkerns[0], 9, 9),
                                         image_shape=(batch_size, nkerns[0], 32, 40),
                                         W=layer1_W,
                                         b=layer1_b)
                                         
    layer2 = ConvPoolLayer.ConvPoolLayer(rng,
                                         layer1.output,
                                         filter_shape=(nkerns[2], nkerns[1], 5, 5),
                                         image_shape=(batch_size, nkerns[1], 12, 16),
                                         W=layer2_W,
                                         b=layer2_b)

                                         
    layer3 = MultiLayerPerceptron.HiddenLayer(rng, 
                                              layer2.output.flatten(2),
                                              nkerns[2] * 4 * 6, 
                                              600,
                                              W=layer3_W,
                                              b=layer3_b,
                                              activation=T.tanh)
                                              
                                      
    layer5 = LogisticLayer.LogisticLayer(layer3.output, 600, 6, W=layer5_W, b=layer5_b)
                                          
    predict_model = theano.function(inputs=[index],
                                    outputs=[layer5.errors(y)],
                                    givens={x:test_set_x[index * batch_size: (index+1) * batch_size],
                                            y:test_set_y[index * batch_size: (index+1) * batch_size]})
                                            
    prediction_losses = [predict_model(i) for i in xrange(n_test_batches)]
    this_prediction_loss = np.mean(prediction_losses)
    
    print this_prediction_loss
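
# The 'weights20' file loaded above is not produced in this excerpt; a
# hedged sketch of how such a .mat file could be written from the trained
# layers, assuming each layer exposes its parameters as shared variables
# W and b (key names chosen to match the loads above; the helper itself
# is an assumption, not the original code):
import scipy.io as sio

def save_cnn_weights(layer0, layer1, layer2, layer3, layer5, path='weights20'):
    sio.savemat(path, {
        'layer0_W': layer0.W.get_value(), 'layer0_b': layer0.b.get_value(),
        'layer1_W': layer1.W.get_value(), 'layer1_b': layer1.b.get_value(),
        'layer2_W': layer2.W.get_value(), 'layer2_b': layer2.b.get_value(),
        'layer3_W': layer3.W.get_value(), 'layer3_b': layer3.b.get_value(),
        'layer5_W': layer5.W.get_value(), 'layer5_b': layer5.b.get_value(),
    })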
def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, dataset='mnist.pkl.gz', batch_size=600):
    """ Stochastic Gradient Descent for logistic Regression """
    
    # Load dataset and create batches
    datasets = LoadData.load_data(dataset)
    
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size


    # Construct the model
    print 'Building the model ... '
        
    index = T.iscalar('index')    
    x = T.dmatrix('x')
    y = T.ivector('y')
    
    classifier = LogisticLayer.LogisticLayer(x, 28*28, 10)
    cost = classifier.negative_log_likelihood(y)
    
    # Function to train the model
    gW = T.grad(cost, classifier.W)
    gb = T.grad(cost, classifier.b)
    updates = [(classifier.W, classifier.W - learning_rate * gW),
               (classifier.b, classifier.b - learning_rate * gb)]
    train_model = theano.function(inputs=[index], 
                                  outputs=[cost],
                                  updates=updates,
                                  givens={x:train_set_x[index * batch_size: (index + 1) * batch_size],
                                          y: train_set_y[index * batch_size: (index + 1) * batch_size]})
                                          
    # Functions to test and validate the model
    valid_model = theano.function(inputs=[index],
                                  outputs=[classifier.errors(y)],
                                  givens={x:valid_set_x[index * batch_size: (index+1) * batch_size],
                                          y:valid_set_y[index * batch_size: (index+1) * batch_size]})
                                          
    test_model = theano.function(inputs=[index],
                                 outputs=[classifier.errors(y)],
                                 givens={x:test_set_x[index * batch_size: (index+1) * batch_size],
                                         y:test_set_y[index * batch_size: (index+1) * batch_size]})
                                         
    # Train the model
    print 'Training the model ...'
    
    patience = 5000  
    patience_increase = 2  
    improvement_threshold = 0.995  
    validation_frequency = min(n_train_batches, patience / 2)
                                  
    best_validation_loss = np.inf
    test_score = 0.
    start_time = time.clock()

    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            train_model(minibatch_index)
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                validation_losses = [valid_model(i) for i in xrange(n_valid_batches)]
                this_validation_loss = np.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch,
                                                                             minibatch_index + 1,
                                                                             n_train_batches,
                                                                             this_validation_loss * 100.) )

                if this_validation_loss < best_validation_loss:
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss

                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
                    test_score = np.mean(test_losses)

                    print( ('     epoch %i, minibatch %i/%i, test error of best model %f %%') % (epoch,
                                                                                                 minibatch_index + 1,
                                                                                                 n_train_batches,
                                                                                                 test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print(
        (
            'Optimization complete with best validation score of %f %%,'
            'with test performance %f %%'
        )
        % (best_validation_loss * 100., test_score * 100.)
    )
    print 'The code run for %d epochs, with %f epochs/sec' % (
        epoch, 1. * epoch / (end_time - start_time))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.1fs' % ((end_time - start_time)))
    # Fit the training data to the Survived labels and create the decision trees
    forest = forest.fit(train_x,train_y)
    #find training and cv error
    trainpred = forest.predict(train_x).astype(int)
    cvpred = forest.predict(cv_x).astype(int)
    terr = 1 - np.sum(trainpred == train_y) / float(trainpred.shape[0])
    cverr = 1 - np.sum(cvpred == cv_y) / float(cvpred.shape[0])
    
    # Take the same decision trees and run it on the test data
    output = forest.predict(test_x).astype(int)

    return terr,cverr,output


#load data and seperate into train and cv
data_x, data_y, test_x, headings, submission = LoadData.loadcleandata()
fraction = 0.66


###MAKE PREDICTIONS FOR SUBMISSION###############
##nummodels = 100
##predictions = np.zeros((test_x.shape[0],nummodels))
##for i in range(nummodels):
##    rseed = np.random.randint(1)
##    train_x,cv_x,train_y,cv_y = sklearn.cross_validation.train_test_split(data_x,data_y,train_size=int(fraction*data_x.shape[0]),random_state=rseed)
##    #select important features using randomized logreg
####    rlrtrain_x,rlrcv_x,rlrtest_x = randomlr(train_x,train_y,cv_x,test_x,regp=1,alpha=0.5)
####    terr,cverr,testpred = forestit(rlrtrain_x,train_y,rlrcv_x,cv_y,rlrtest_x,n_est=50)
##    #train and predict
##    terr,cverr,testpred = forestit(train_x,train_y,cv_x,cv_y,test_x,n_est=100)
##    predictions[:,i] = testpred
Exemple #58
0
def evaluate_lenet5(learning_rate=0.1, n_epochs=200, dataset="mnist.pkl.gz", nkerns=[20, 50], batch_size=500):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = numpy.random.RandomState(23455)

    datasets = LoadData.load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix("x")  # the data is presented as rasterized images
    y = T.ivector("y")  # the labels are presented as 1D vector of
    # [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print "... building the model"

    # Reshape matrix of rasterized images of shape (batch_size, 28 * 28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    # (28, 28) is the size of MNIST images.
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
    layer0 = LeNetConvPoolLayer(
        rng, input=layer0_input, image_shape=(batch_size, 1, 28, 28), filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2)
    )

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (nkerns[0], nkerns[1], 4, 4)
    layer1 = LeNetConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], 12, 12),
        filter_shape=(nkerns[1], nkerns[0], 5, 5),
        poolsize=(2, 2),
    )

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
    # or (500, 50 * 4 * 4) = (500, 800) with the default values.
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 4 * 4, n_out=500, activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticLayer.LogisticLayer(input=layer2.output, n_in=500, n_out=10)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size : (index + 1) * batch_size],
            y: test_set_y[index * batch_size : (index + 1) * batch_size],
        },
    )

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size : (index + 1) * batch_size],
            y: valid_set_y[index * batch_size : (index + 1) * batch_size],
        },
    )

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = [(param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads)]

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size : (index + 1) * batch_size],
            y: train_set_y[index * batch_size : (index + 1) * batch_size],
        },
    )
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print "... training"
    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatche before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.0
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print "training @ iter = ", iter
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print (
                    "epoch %i, minibatch %i/%i, validation error %f %%"
                    % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.0)
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)
                    print (
                        ("     epoch %i, minibatch %i/%i, test error of " "best model %f %%")
                        % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.0)
                    )

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print ("Optimization complete.")
    print (
        "Best validation score of %f %% obtained at iteration %i, "
        "with test performance %f %%" % (best_validation_loss * 100.0, best_iter + 1, test_score * 100.0)
    )
    print >> sys.stderr, (
        "The code for file " + os.path.split(__file__)[1] + " ran for %.2fm" % ((end_time - start_time) / 60.0)
    )
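
# A small helper restating the shape arithmetic from the comments in
# evaluate_lenet5 above: a "valid" convolution shrinks each side by
# (filter_size - 1), and 2x2 max-pooling then halves it.
def conv_pool_output_size(in_size, filter_size, pool_size=2):
    return (in_size - filter_size + 1) // pool_size

# 28 -> 12 -> 4, matching the (batch_size, nkerns[1], 4, 4) output above.
assert conv_pool_output_size(28, 5) == 12
assert conv_pool_output_size(12, 5) == 4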
Exemple #59
0
import FormatData as FD
import LoadData as dataLoader
from random import shuffle

#Get the data files
print("Getting data..")
positiveTweets = dataLoader.readFile('tweets/pos_tweets.txt')
negativeTweets = dataLoader.readFile('tweets/neg_tweets.txt')
#Make tweetsets the same size
positiveSameL, negativeSameL = FD.sameSize(positiveTweets,negativeTweets)

#Apply feature reduction so things like stopwords and emoticons are removed

print("Reducing features..")
reducedPos = []
for sentence in positiveSameL:
    reducedPos.append(FD.featureReduction(sentence))

reducedNeg = []
for sentence in negativeSameL:
    reducedNeg.append(FD.featureReduction(sentence))

trainData = []
for tweet in reducedPos:
    polarity = []
    polarity.append(tweet)
    polarity.append('1')
    trainData.append(polarity)

for tweet in reducedNeg:
    polarity = []
def prepData():
    
    # load up files from disk
    training_data, kaggle_data = LoadData.load_data()    
    features_in = ['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']
    

    # break dates into month, day, year, day of week, hour 
    # categorize category, month, day, year, dow, hour, district
    # scale lat (y), long(x)
    training_data['Year'] = (pd.DatetimeIndex(training_data['Dates']).year) 
    training_data['Month'] = (pd.DatetimeIndex(training_data['Dates']).month)
    training_data['Day'] = (pd.DatetimeIndex(training_data['Dates']).day)
    training_data['Hour'] = (pd.DatetimeIndex(training_data['Dates']).hour)
    training_data['Minute'] = (pd.DatetimeIndex(training_data['Dates']).minute)

    kaggle_data['Year'] = (pd.DatetimeIndex(kaggle_data['Dates']).year) 
    kaggle_data['Month'] = (pd.DatetimeIndex(kaggle_data['Dates']).month)
    kaggle_data['Day'] = (pd.DatetimeIndex(kaggle_data['Dates']).day)
    kaggle_data['Hour'] = (pd.DatetimeIndex(kaggle_data['Dates']).hour)
    kaggle_data['Minute'] = (pd.DatetimeIndex(kaggle_data['Dates']).minute)
    


    # cast date as unix time
    training_data['UnixTime'] = (pd.DatetimeIndex(training_data['Dates'])).astype(np.int64) / 10000000000
    kaggle_data['UnixTime'] = (pd.DatetimeIndex(kaggle_data['Dates'])).astype(np.int64) / 10000000000

   
    # day of week to number
    sorted_days = ('Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday')
    def dayOfWeekNumber(d):
        return sorted_days.index(d)
    training_data['DayNumber'] = (training_data['DayOfWeek'].apply(dayOfWeekNumber))
    kaggle_data['DayNumber'] = (kaggle_data['DayOfWeek'].apply(dayOfWeekNumber))
    
    
    # set up an id number for each category from alphabetical list
    # add to training_data
    categories = pd.unique(training_data['Category'])
    sorted_categories = (np.sort(categories)).tolist()

    def categoryNumber(category):
        return sorted_categories.index(category)
    training_data['CategoryNumber'] = training_data['Category'].apply(categoryNumber)
    
    # no categories for validation data, that's what we're trying to figure out
    # add output array for validation set just for convience 
    kaggle_data['CategoryNumber'] = 0
  
    print("min/max category", min(training_data['CategoryNumber']), max(training_data['CategoryNumber']))
    
    
    
    districts = pd.unique(training_data['PdDistrict'])
    sorted_districts = (np.sort(districts)).tolist()
    
    def districtNumber(district):
        return sorted_districts.index(district)
    training_data['DistrictNumber'] = (training_data['PdDistrict'].apply(districtNumber))
    kaggle_data['DistrictNumber'] = (kaggle_data['PdDistrict'].apply(districtNumber))
    
    
    
  

    # split inputs from outputs
    features = ['Year', 'Month', 'Day', 'Hour', 'X', 'Y', 'DayNumber', 'DistrictNumber', 'CategoryNumber']
    
    training_data = training_data[features]
    print("pre split ", len(training_data))
    
    # split training and testing: sample the test rows first, then drop
    # them from the training frame so the two sets cannot overlap
    testing_data = training_data.sample(frac=0.2, replace=False)
    training_data = training_data.drop(testing_data.index)
    
    print("post split", len(training_data))
    print("test", len(testing_data))
    
    
    
    data = np.array(training_data)
    x = data[:, 0:8]
    y = data[:, 8]
    dump_svmlight_file(x, y, 'train.svm')
    
    
    data = np.array(testing_data)
    x = data[:, 0:8]
    y = data[:, 8]
    dump_svmlight_file(x, y, 'test.svm')
    
    
    kaggle_data = kaggle_data[features]
    data = np.array(kaggle_data)
    x = data[:, 0:8]
    y = data[:, 8]
    dump_svmlight_file(x, y, 'kaggle.svm')
    
    # sanity check data
    print(training_data.head())
    print(x[0])
    print(y[0])
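
# Hedged usage sketch (not in the original): the train.svm / test.svm /
# kaggle.svm files written above can be read back with scikit-learn's
# load_svmlight_file before training a classifier on them.
from sklearn.datasets import load_svmlight_file

x_train, y_train = load_svmlight_file('train.svm')
x_test, y_test = load_svmlight_file('test.svm')
print(x_train.shape, x_test.shape)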