# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles/'
dataset = pd.read_csv(dataset_path + 'chapter3_final_outliers.csv', index_col=0)
# FIX: Index.to_datetime() has been removed from pandas; pd.to_datetime() is the
# supported way to parse an index and works on both old and current versions.
dataset.index = pd.to_datetime(dataset.index)

# print(dataset.isnull().sum())
# print(dataset.shape)

# Compute the number of milliseconds covered by an instance based on the first two rows.
# NOTE(review): timedelta.microseconds only holds the sub-second component — this
# assumes consecutive instances are less than one second apart; confirm for this data.
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

# Step 2: Let us impute the missing values.
MisVal = ImputationMissingValues()
# imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset), 'hr_Heart Rate')
# imputed_median_dataset = MisVal.impute_median(copy.deepcopy(dataset), 'hr_Heart Rate')
# imputed_interpolation_dataset = MisVal.impute_interpolate(copy.deepcopy(dataset), 'hr_Heart Rate')
# DataViz.plot_imputed_values(dataset, ['original', 'mean', 'interpolation'], 'hr_Heart Rate', imputed_mean_dataset['hr_Heart Rate'], imputed_interpolation_dataset['hr_Heart Rate'])

# And we impute for all columns except for the label in the selected way (interpolation).
for col in [c for c in dataset.columns if 'label' not in c]:
    dataset = MisVal.impute_interpolate(dataset, col)

# Let us try the Kalman filter on the light_phone_lux attribute and study the result.
# original_dataset = pd.read_csv(dataset_path + 'chapter3_final_outliers.csv', index_col=0)
# original_dataset.index = original_dataset.index.to_datetime()
# KalFilter = KalmanFilters()
# Let us create our visualization class again.
DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles/'
dataset = pd.read_csv(dataset_path + 'chapter3_result_outliers.csv', index_col=0)
# FIX: Index.to_datetime() has been removed from pandas; use pd.to_datetime().
dataset.index = pd.to_datetime(dataset.index)

# Compute the number of milliseconds covered by an instance based on the first two rows.
# NOTE(review): .microseconds only covers the sub-second part of the delta — assumes
# consecutive rows are less than one second apart.
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

# Step 2: Let us impute the missing values.
MisVal = ImputationMissingValues()
imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset), 'hr_watch_rate')
imputed_median_dataset = MisVal.impute_median(copy.deepcopy(dataset), 'hr_watch_rate')
imputed_interpolation_dataset = MisVal.impute_interpolate(
    copy.deepcopy(dataset), 'hr_watch_rate')
# DataViz.plot_imputed_values(dataset, ['original', 'mean', 'interpolation'], 'hr_watch_rate', imputed_mean_dataset['hr_watch_rate'], imputed_interpolation_dataset['hr_watch_rate'])

# FIX: `print x` is Python 2 statement syntax and a SyntaxError on Python 3;
# use the function form.
print(imputed_interpolation_dataset['hr_watch_rate'])
DataViz.plot_imputed_values(dataset, ['original', 'interpolation'],
                            'hr_watch_rate',
                            imputed_interpolation_dataset['hr_watch_rate'])

# And we impute for all columns except for the label in the selected way (interpolation)
raise e # We'll create an instance of our visualization class to plot the results. DataViz = VisualizeDataset(__file__) # Compute the number of milliseconds covered by an instance based on the first two rows milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000 # impute_columns = ['acc_mobile_x', 'acc_mobile_y', 'acc_mobile_z', 'gyr_mobile_x','gyr_mobile_y','gyr_mobile_z','mag_mobile_x','mag_mobile_y','mag_mobile_z','prox_mobile_distance', # 'loc_mobile_latitude','loc_mobile_longitude','loc_mobile_height','loc_mobile_velocity','loc_mobile_direction','loc_mobile_horizontalAccuracy','loc_mobile_verticalAccuracy'] # # Let us impute the missing values and plot an example. MisVal = ImputationMissingValues() imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset), 'acc_mobile_x') imputed_median_dataset = MisVal.impute_median(copy.deepcopy(dataset), 'acc_mobile_x') imputed_interpolation_dataset = MisVal.impute_interpolate( copy.deepcopy(dataset), 'acc_mobile_x') DataViz.plot_imputed_values(dataset, ['original', 'mean', 'interpolation'], 'acc_mobile_x', imputed_mean_dataset['acc_mobile_x'], imputed_interpolation_dataset['acc_mobile_x']) # Now, let us carry out that operation over all columns except for the label. for col in [c for c in dataset.columns if not 'label' in c]: dataset = MisVal.impute_interpolate(dataset, col)
# Let us create our visualization class again.
DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles-own/'
dataset = pd.read_csv(dataset_path + 'chapter3_result_outliers-own.csv', index_col=0)
# FIX: Index.to_datetime() has been removed from pandas; use pd.to_datetime().
dataset.index = pd.to_datetime(dataset.index)

# Compute the number of milliseconds covered by an instance based on the first two rows.
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

# Step 2: Let us impute the missing values.
MisVal = ImputationMissingValues()
imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset), 'acc_phone_x')
imputed_median_dataset = MisVal.impute_median(copy.deepcopy(dataset), 'acc_phone_x')
imputed_interpolation_dataset = MisVal.impute_interpolate(
    copy.deepcopy(dataset), 'acc_phone_x')
#DataViz.plot_imputed_values(dataset, ['original', 'mean', 'median', 'interpolation'], 'acc_phone_x', imputed_mean_dataset['acc_phone_x'], imputed_median_dataset['acc_phone_x'], imputed_interpolation_dataset['acc_phone_x'])

# And we impute for all columns except for the label and the light sensor (interpolation).
# FIX: the original filter read `not 'label' and not 'light_phone_lux' in c`;
# `not 'label'` evaluates to False for a non-empty string, so the whole condition
# was always False and no column was ever imputed. Test substring membership of
# each exclusion against the column name instead.
for col in [c for c in dataset.columns
            if 'label' not in c and 'light_phone_lux' not in c]:
    dataset = MisVal.impute_interpolate(dataset, col)
# Load the outlier-cleaned results for both the CrowdSignals and our own dataset.
dataset_path = './intermediate_datafiles/'
dataset_own = pd.read_csv(dataset_path + 'chapter3_result_outliers_own.csv', index_col=0)
dataset_cs = pd.read_csv(dataset_path + 'chapter3_result_outliers_cs.csv', index_col=0)

# Create datetime indices.
# FIX: Index.to_datetime() has been removed from pandas; use pd.to_datetime().
dataset_own.index = pd.to_datetime(dataset_own.index)
dataset_cs.index = pd.to_datetime(dataset_cs.index)

# Compute the number of milliseconds covered by an instance based on the first two rows.
# NOTE(review): .microseconds only covers the sub-second part of the delta — assumes
# consecutive rows are less than one second apart.
milliseconds_per_instance = (dataset_own.index[1] - dataset_own.index[0]).microseconds / 1000

# Step 2: Let us impute the missing values for CS (plot only).
MisVal = ImputationMissingValues()
imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset_cs), 'hr_watch_rate')
imputed_median_dataset = MisVal.impute_median(copy.deepcopy(dataset_cs), 'hr_watch_rate')
imputed_interpolation_dataset = MisVal.impute_interpolate(
    copy.deepcopy(dataset_cs), 'hr_watch_rate')
# NOTE(review): DataViz is not created in this fragment — assumed to be defined
# earlier in the file; confirm.
DataViz.plot_imputed_values(dataset_cs, ['original', 'mean', 'interpolation'],
                            'hr_watch_rate',
                            imputed_mean_dataset['hr_watch_rate'],
                            imputed_interpolation_dataset['hr_watch_rate'])

# Make the corresponding plot for our own dataset.
MisVal = ImputationMissingValues()
imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset_own), 'press_phone_pressure')
def main():
    """Run the Chapter-4 transformation pipeline selected by FLAGS.mode.

    Modes: 'imputation' (compare imputation strategies on the heart rate),
    'kalman' (Kalman filter on accelerometer data), 'lowpass' (Butterworth
    low-pass filter demo), 'PCA' (explained-variance study), and 'final'
    (impute + filter + PCA, then persist the result).
    """
    # Import the data from the specified location and parse the date index.
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print('File not found, try to run previous crowdsignals scripts first!')
        raise e

    # Create an instance of our visualization class to plot the results.
    DataViz = VisualizeDataset(__file__)

    # Compute the number of milliseconds covered by an instance based on the first two rows.
    # NOTE(review): .microseconds only holds the sub-second component of the delta —
    # assumes consecutive instances are less than one second apart.
    milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

    # Create objects for value imputation, low pass filter and PCA.
    MisVal = ImputationMissingValues()
    LowPass = LowPassFilter()
    PCA = PrincipalComponentAnalysis()

    if FLAGS.mode == 'imputation':
        # Impute the missing values with three strategies and plot an example.
        imputed_mean_dataset = MisVal.impute_mean(
            dataset=copy.deepcopy(dataset), col='hr_watch_rate')
        imputed_median_dataset = MisVal.impute_median(
            dataset=copy.deepcopy(dataset), col='hr_watch_rate')
        imputed_interpolation_dataset = MisVal.impute_interpolate(
            dataset=copy.deepcopy(dataset), col='hr_watch_rate')
        DataViz.plot_imputed_values(
            dataset, ['original', 'mean', 'median', 'interpolation'],
            'hr_watch_rate',
            imputed_mean_dataset['hr_watch_rate'],
            imputed_median_dataset['hr_watch_rate'],
            imputed_interpolation_dataset['hr_watch_rate'])

    elif FLAGS.mode == 'kalman':
        # Using the result from Chapter 2, try the Kalman filter on acc_phone_x
        # and study the result.
        try:
            original_dataset = pd.read_csv(DATA_PATH / ORIG_DATASET_FNAME,
                                           index_col=0)
            original_dataset.index = pd.to_datetime(original_dataset.index)
        except IOError as e:
            print('File not found, try to run previous crowdsignals scripts first!')
            raise e
        KalFilter = KalmanFilters()
        kalman_dataset = KalFilter.apply_kalman_filter(
            data_table=original_dataset, col='acc_phone_x')
        DataViz.plot_imputed_values(kalman_dataset, ['original', 'kalman'],
                                    'acc_phone_x',
                                    kalman_dataset['acc_phone_x_kalman'])
        DataViz.plot_dataset(data_table=kalman_dataset,
                             columns=['acc_phone_x', 'acc_phone_x_kalman'],
                             match=['exact', 'exact'],
                             display=['line', 'line'])

    elif FLAGS.mode == 'lowpass':
        # Apply a lowpass filter and reduce the importance of the data above the cutoff.
        # Determine the sampling frequency (Hz) from the instance granularity.
        fs = float(1000) / milliseconds_per_instance
        # Study acc_phone_x on a small window (40%-43% of the series) for visibility.
        new_dataset = LowPass.low_pass_filter(
            data_table=copy.deepcopy(dataset), col='acc_phone_x',
            sampling_frequency=fs, cutoff_frequency=FLAGS.cutoff, order=10)
        DataViz.plot_dataset(
            new_dataset.iloc[int(0.4 * len(new_dataset.index)):
                             int(0.43 * len(new_dataset.index)), :],
            ['acc_phone_x', 'acc_phone_x_lowpass'],
            ['exact', 'exact'], ['line', 'line'])

    elif FLAGS.mode == 'PCA':
        # First impute again, as PCA can not deal with missing values.
        for col in [c for c in dataset.columns if 'label' not in c]:
            dataset = MisVal.impute_interpolate(dataset, col)

        # Determine the PC's for all but the target columns (the labels and the heart rate).
        selected_predictor_cols = [
            c for c in dataset.columns
            if (not ('label' in c)) and (not (c == 'hr_watch_rate'))
        ]
        pc_values = PCA.determine_pc_explained_variance(
            data_table=dataset, cols=selected_predictor_cols)
        cumulated_variance = np.cumsum(pc_values)

        # Plot the explained variance and cumulated variance.
        comp_numbers = np.arange(1, len(pc_values) + 1)
        DataViz.plot_xy(x=[comp_numbers, comp_numbers],
                        y=[pc_values, cumulated_variance],
                        xlabel='principal component number',
                        ylabel='explained variance',
                        ylim=[0, 1],
                        line_styles=['b-', 'r-'],
                        names=['Variance', 'Cumulated variance'])

        # Select 7 as the best number of PC's as this explains most of the variance.
        n_pcs = 7
        dataset = PCA.apply_pca(data_table=copy.deepcopy(dataset),
                                cols=selected_predictor_cols,
                                number_comp=n_pcs)

        # Visualize the result of the PC's and the overall final dataset.
        DataViz.plot_dataset(dataset, ['pca_', 'label'], ['like', 'like'],
                             ['line', 'points'])
        # FIX: the match/display lists had 9 entries for 8 column patterns;
        # keep them the same length (7 line series plus the label points).
        DataViz.plot_dataset(
            dataset,
            ['acc_', 'gyr_', 'hr_watch_rate', 'light_phone_lux', 'mag_',
             'press_phone_', 'pca_', 'label'],
            ['like'] * 8,
            ['line'] * 7 + ['points'])

    elif FLAGS.mode == 'final':
        # Carry out the imputation over all columns except for the label.
        print('Imputing missing values.')
        for col in tqdm([c for c in dataset.columns if 'label' not in c]):
            dataset = MisVal.impute_interpolate(dataset=dataset, col=col)

        # Include all measurements that have a form of periodicity and filter them.
        periodic_measurements = [
            'acc_phone_x', 'acc_phone_y', 'acc_phone_z', 'acc_watch_x',
            'acc_watch_y', 'acc_watch_z', 'gyr_phone_x', 'gyr_phone_y',
            'gyr_phone_z', 'gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z',
            'mag_phone_x', 'mag_phone_y', 'mag_phone_z', 'mag_watch_x',
            'mag_watch_y', 'mag_watch_z'
        ]
        print('Applying low pass filter on periodic measurements.')
        # Determine the sampling frequency (Hz) from the instance granularity.
        fs = float(1000) / milliseconds_per_instance
        for col in tqdm(periodic_measurements):
            dataset = LowPass.low_pass_filter(data_table=dataset, col=col,
                                              sampling_frequency=fs,
                                              cutoff_frequency=FLAGS.cutoff,
                                              order=10)
            # Replace the raw column with its filtered version.
            dataset[col] = dataset[col + '_lowpass']
            del dataset[col + '_lowpass']

        # Use the optimal found parameter n_pcs = 7 to apply PCA to the final dataset.
        selected_predictor_cols = [
            c for c in dataset.columns
            if (not ('label' in c)) and (not (c == 'hr_watch_rate'))
        ]
        n_pcs = 7
        dataset = PCA.apply_pca(data_table=copy.deepcopy(dataset),
                                cols=selected_predictor_cols,
                                number_comp=n_pcs)

        # Visualize the final overall dataset.
        # FIX: match/display trimmed to 8 entries to line up with the 8 column patterns.
        DataViz.plot_dataset(
            dataset,
            ['acc_', 'gyr_', 'hr_watch_rate', 'light_phone_lux', 'mag_',
             'press_phone_', 'pca_', 'label'],
            ['like'] * 8,
            ['line'] * 7 + ['points'])

        # Store the final outcome.
        dataset.to_csv(DATA_PATH / RESULT_FNAME)