Example #1
from ay_hw_3.util_data import load_data_and_label, get_all_datasets_path
from ay_hw_3.util_statistic import gen_statistic_result
from ay_hw_3._global import ROOT_PATH

import pandas as pd
import numpy as np
import pprint

if __name__ == "__main__":
    allFilePaths = get_all_datasets_path(rootPath=ROOT_PATH)

    # Compute summary statistics for every dataset and stack the per-file
    # results row-wise. DataFrame.append was removed in pandas 2.0, so the
    # rows are collected in a list and concatenated once.
    statisticRows = []
    for index, path in enumerate(allFilePaths):
        fileItem, fileLabel = load_data_and_label(path)
        staticResultItem = gen_statistic_result(fileItem, index + 1)
        statisticRows.append(staticResultItem)
    statisticResult = pd.concat(statisticRows)

    # ---- Same as in main_c_ii.py ----
    # Bootstrap a 90% confidence interval for the standard deviation of each
    # statistic column: resample with replacement, recompute the statistic,
    # and take the empirical 5th and 95th percentiles.
    confidence_interval = {}
    for column in statisticResult.columns:
        itemCIRange = []
        for _ in range(1000):  # 1,000 bootstrap replicates
            # Draw a random sample of rows (with replacement) from this column.
            ran_sample = statisticResult[column].sample(n=10, replace=True)
            itemCIRange.append(ran_sample.std())
        # np.percentile takes percentages in [0, 100], so 5 and 95 give the
        # bounds of the 90% interval.
        lowerValue = np.percentile(itemCIRange, 5)
        upperValue = np.percentile(itemCIRange, 95)
        confidence_interval[column] = [lowerValue, upperValue]
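
    # Illustrative addition using the pprint import above: display the
    # per-column confidence-interval bounds.
    pprint.pprint(confidence_interval)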
Example #2
    trainStaticResult = pd.DataFrame()
    testStaticResult = pd.DataFrame()
    gaussianTestErrorRateList = list()

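    # For each candidate segmentation level (1 to 9 parts), rebuild the
    # statistic features for every training and test recording.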
    for parts in range(1, 10):
        for index, path in enumerate(allTrainFilePaths):
            trainFileItem, trainFileLabel = load_data_and_label(path,
                                                                hasTime=False)
            gluedTrainFile = split_DF_in_parts(trainFileItem,
                                               parts=parts,
                                               needConcat=True)
            gluedTrainFile.columns = gen_multiple_column_name(parts=parts,
                                                              hasTime=False)
            trainStaticResultItem = gen_statistic_result(gluedTrainFile,
                                                         index + 1,
                                                         hasTime=False)
            trainStaticResultItem["label"] = convert_label_2_num(
                trainFileLabel)
            # pd.concat is the drop-in replacement for the removed
            # DataFrame.append.
            trainStaticResult = pd.concat([trainStaticResult,
                                           trainStaticResultItem],
                                          sort=False)

        for index, path in enumerate(allTestFilePaths):
            testFileItem, testFileLabel = load_data_and_label(path,
                                                              hasTime=False)
            gluedTestFile = split_DF_in_parts(testFileItem,
                                              parts=parts,
                                              needConcat=True)
            gluedTestFile.columns = gen_multiple_column_name(parts=parts,
                                                             hasTime=False)
            testStaticResultItem = gen_statistic_result(gluedTestFile,
                                                        index + 1,
                                                        hasTime=False)
Example #3
import sys
import warnings

import pandas as pd
import statsmodels.api as sm

from ay_hw_3.util_data import load_data_and_label
from ay_hw_3.util_statistic import gen_statistic_result
# assumed to live in util_generate, matching gen_test_data_file_paths in
# Example #5; imports for split_DF_in_parts, is_bending and
# gen_multiple_label are not shown in this listing
from ay_hw_3.util_generate import gen_train_data_file_paths

if __name__ == "__main__":

	if not sys.warnoptions:
		warnings.simplefilter("ignore")

	allTrainFilePaths = gen_train_data_file_paths()
	# per the assignment PDF, all of the training data is used
	trainStaticResult = pd.DataFrame()
	for parts in range(1, 21):
		for index, path in enumerate(allTrainFilePaths):
			fileItem, fileLabel = load_data_and_label(path, hasTime=False)
			splitDFs = split_DF_in_parts(fileItem, parts=parts, needConcat=False)
			# DataFrame.append was removed in pandas 2.0; collect the
			# per-segment statistics and concatenate them once.
			tempRows = [gen_statistic_result(DFItem, index + 1, hasTime=False)
						for DFItem in splitDFs]
			statisticResultTemp = pd.concat(tempRows, sort=False)

			statisticResultTemp["label"] = is_bending(fileLabel)
			trainStaticResult = pd.concat([trainStaticResult, statisticResultTemp],
										  sort=False)

		# Regress the bending label on the pooled statistic features; the
		# p-values below are used to flag significant predictors.
		logitModel = sm.Logit(trainStaticResult['label'],
							  trainStaticResult[gen_multiple_label(parts=1)])
		logitModelResults = logitModel.fit(method="bfgs", disp=0)
		# ['median(1)'] ['max(5)']
		significantVars = \
			[key for key, p_value in logitModelResults.pvalues.items() if p_value <= 0.05]
		if len(significantVars) > 0:
			print("When split all training data sets in {} times, "
				  "I got significant variables : ".format(parts), end=" ")
			print(' '.join(significantVars))
Example #4
from warnings import simplefilter

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc

from ay_hw_3.util_data import load_data_and_label
from ay_hw_3.util_statistic import gen_statistic_result
# imports for gen_train_data_file_paths, split_DF_in_parts, is_bending and
# gen_multiple_column_name are not shown in this listing

if __name__ == "__main__":
    simplefilter(action='ignore', category=FutureWarning)
    # the best L found in the earlier experiments is 3
    bestL = 3
    selectedFeatures = ['min(5)', '3rd quart(5)', '3rd quart(7)', 'max(18)']
    allTrainFilePaths = gen_train_data_file_paths()
    # per the assignment PDF, all of the training data is used
    statisticRows = []
    for index, path in enumerate(allTrainFilePaths):
        fileItem, fileLabel = load_data_and_label(path, hasTime=False)
        gluedFile = split_DF_in_parts(fileItem, parts=bestL, needConcat=True)
        gluedFile.columns = gen_multiple_column_name(parts=bestL,
                                                     hasTime=False)
        staticResultItem = gen_statistic_result(gluedFile,
                                                index + 1,
                                                hasTime=False)
        staticResultItem["label"] = is_bending(fileLabel)
        statisticRows.append(staticResultItem)
    # DataFrame.append was removed in pandas 2.0, so concatenate once here.
    statisticResult = pd.concat(statisticRows, sort=False)

    X_trainData = statisticResult[selectedFeatures]
    y_trainData = statisticResult['label']

    skLogitModel = LogisticRegression()
    skLogitModel.fit(X_trainData, y_trainData)

    # Use predicted probabilities rather than hard 0/1 class predictions, so
    # the ROC curve is traced across all thresholds instead of a single point.
    skYScores = skLogitModel.predict_proba(X_trainData)[:, 1]
    falsePositiveRate, truePositiveRate, thresholds = roc_curve(
        y_trainData, skYScores)
    # compute the Area Under the Curve (AUC) using the trapezoidal rule
    area = auc(falsePositiveRate, truePositiveRate)
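
    # Illustrative addition: report the resulting training-set AUC.
    print("Training AUC: {:.3f}".format(area))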
Example #5
#
__author__ = 'Aaron Yang'
__email__ = '*****@*****.**'
__date__ = '9/27/2019 11:13 PM'
from ay_hw_3.util_data import convert_label_2_num, load_data_and_label
from ay_hw_3.util_generate import gen_test_data_file_paths
from ay_hw_3.util_statistic import gen_statistic_result
from ay_hw_3._global import FULL_COLUMNS, ROOT_PATH

if __name__ == "__main__":
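    # Smoke-test the label-to-number conversion for each activity class.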
    print(convert_label_2_num("bending1"))
    print(convert_label_2_num("bending2"))

    print(convert_label_2_num("cycling"))
    print(convert_label_2_num("sitting"))
    print(convert_label_2_num("walking"))
    print(type(convert_label_2_num("lying")))

    # print(gen_test_data_file_paths('.\\assets'))

    dataframe, label = load_data_and_label('.\\assets\\cycling\\dataset2.csv')
    staticResultItem = gen_statistic_result(dataframe, 1)
    print(staticResultItem.to_string())