Example no. 1
def main():
    # datasets-related info
    task_path_list = glob.glob(os.path.join(datasets_path, 'raw/*'))
    task_name_list = [task_path.split('/')[-1] for task_path in task_path_list]

    # load raw datasets
    datasets_raw = []
    for task_path in task_path_list:
        task_csv_path = os.path.join(task_path, 'csv')
        print('Loading data from: ' + task_csv_path)
        demo_path_list = glob.glob(os.path.join(
            task_csv_path, '201*'))  # the prefix of dataset file
        demo_temp = []
        for demo_path in demo_path_list:
            data_csv = pd.read_csv(
                os.path.join(demo_path,
                             'multiModal_states.csv'))  # the file name of csv
            demo_temp.append({
                'stamp':
                (data_csv.values[:, 2].astype(int) - data_csv.values[0, 2]) *
                1e-9,
                'left_hand':
                np.hstack([
                    data_csv.values[:, 207:210].astype(
                        float),  # human left hand position
                    data_csv.values[:, 7:15].astype(float),  # emg
                ]),
                'left_joints':
                data_csv.values[:, 317:324].astype(float)  # robot ee actually
            })
        datasets_raw.append(demo_temp)

    # filter the datasets: gaussian_filter1d
    datasets_filtered = []
    for task_idx, task_data in enumerate(datasets_raw):
        print('Filtering data of task: ' + task_name_list[task_idx])
        demo_norm_temp = []
        for demo_data in task_data:
            time_stamp = demo_data['stamp']
            # filter the datasets
            left_hand_filtered = gaussian_filter1d(demo_data['left_hand'].T,
                                                   sigma=sigma).T
            left_joints_filtered = gaussian_filter1d(
                demo_data['left_joints'].T, sigma=sigma).T
            # append them to list
            demo_norm_temp.append({
                'alpha': time_stamp[-1],
                'left_hand': left_hand_filtered,
                'left_joints': left_joints_filtered
            })
        datasets_filtered.append(demo_norm_temp)

    # resample the datasets
    datasets_norm = []
    for task_idx, task_data in enumerate(datasets_raw):
        print('Resampling data of task: ' + task_name_list[task_idx])
        demo_norm_temp = []
        for demo_data in task_data:
            time_stamp = demo_data['stamp']
            grid = np.linspace(0, time_stamp[-1], len_norm)
            # filter the datasets
            left_hand_filtered = gaussian_filter1d(demo_data['left_hand'].T,
                                                   sigma=sigma).T
            left_joints_filtered = gaussian_filter1d(
                demo_data['left_joints'].T, sigma=sigma).T
            # normalize the datasets
            left_hand_norm = griddata(time_stamp,
                                      left_hand_filtered,
                                      grid,
                                      method='linear')
            left_joints_norm = griddata(time_stamp,
                                        left_joints_filtered,
                                        grid,
                                        method='linear')
            # append them to list
            demo_norm_temp.append({
                'alpha': time_stamp[-1],
                'left_hand': left_hand_norm,
                'left_joints': left_joints_norm
            })
        datasets_norm.append(demo_norm_temp)

    # preprocessing for the norm data
    datasets4train = []
    for task_idx, demo_list in enumerate(data_index):
        data = [datasets_norm[task_idx][i] for i in demo_list]
        datasets4train.append(data)
    y_full = np.array([]).reshape(0, num_joints)
    for task_idx, task_data in enumerate(datasets4train):
        print('Preprocessing data for task: ' + task_name_list[task_idx])
        for demo_data in task_data:
            h = np.hstack([demo_data['left_hand'], demo_data['left_joints']])
            y_full = np.vstack([y_full, h])
    min_max_scaler = preprocessing.MinMaxScaler()
    datasets_norm_full = min_max_scaler.fit_transform(y_full)
    # construct a data structure to train the model
    datasets_norm_preproc = []
    for task_idx in range(len(datasets4train)):
        datasets_temp = []
        for demo_idx in range(num_demo):
            temp = datasets_norm_full[
                (task_idx * num_demo + demo_idx) *
                len_norm:(task_idx * num_demo + demo_idx) * len_norm +
                len_norm, :]
            datasets_temp.append({
                'left_hand':
                temp[:, 0:11],
                'left_joints':
                temp[:, 11:18],
                'alpha':
                datasets4train[task_idx][demo_idx]['alpha']
            })
        datasets_norm_preproc.append(datasets_temp)

    # save all the datasets
    print('Saving the datasets as pkl ...')
    joblib.dump(task_name_list,
                os.path.join(datasets_path, 'pkl/task_name_list.pkl'))
    joblib.dump(datasets_raw,
                os.path.join(datasets_path, 'pkl/datasets_raw.pkl'))
    joblib.dump(datasets_filtered,
                os.path.join(datasets_path, 'pkl/datasets_filtered.pkl'))
    joblib.dump(datasets_norm,
                os.path.join(datasets_path, 'pkl/datasets_norm.pkl'))
    joblib.dump(datasets_norm_preproc,
                os.path.join(datasets_path, 'pkl/datasets_norm_preproc.pkl'))
    joblib.dump(min_max_scaler,
                os.path.join(datasets_path, 'pkl/min_max_scaler.pkl'))

    # completion reminder
    print(
        'Loaded, filtered, normalized, preprocessed and saved the datasets successfully!!!'
    )
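
# A minimal illustrative sketch (not part of the original snippet) of reading the
# artifacts saved above back from disk for later training; it assumes the same
# `datasets_path` pkl layout and that joblib is importable as a standalone package:
import os
import joblib

def load_preprocessed(datasets_path):
    pkl_dir = os.path.join(datasets_path, 'pkl')
    task_name_list = joblib.load(os.path.join(pkl_dir, 'task_name_list.pkl'))
    datasets_norm_preproc = joblib.load(
        os.path.join(pkl_dir, 'datasets_norm_preproc.pkl'))
    min_max_scaler = joblib.load(os.path.join(pkl_dir, 'min_max_scaler.pkl'))
    return task_name_list, datasets_norm_preproc, min_max_scaler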
Example no. 2
                             header=0)
# put each unscaled dataset in a dataframe

array2 = dataframe2.values
array3 = dataframe3.values

# set the x-values for the training set and validation set
X1 = array2[:, 0:5]
X2 = array3[:, 0:5]

# set the y-values for the training set and validation set
Y1 = array2[:, -1]
Y2 = array3[:, -1]

# scale each dataset with its own MinMaxScaler (a single scaler refit on X2
# would overwrite the parameters learned from X1)
scaler1 = preprocessing.MinMaxScaler().fit(X1)
scaler2 = preprocessing.MinMaxScaler().fit(X2)

# the scaled X values (assigned to arbitrarily named variables)
rescaledX1 = scaler1.transform(X1)
rescaledX2 = scaler2.transform(X2)

# We merge the y values with their respective scaled X values
Z1 = numpy.append(rescaledX1, Y1[:, None], axis=1)
Z2 = numpy.append(rescaledX2, Y2[:, None], axis=1)

# we save the scaled datasets to a desired location
numpy.savetxt("C:/Users/Kanverse/Documents/train1_scaled.csv",
              Z1,
              delimiter=",")
numpy.savetxt("C:/Users/Kanverse/Documents/validation1_scaled.csv",
Example no. 3
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

dataset = load_iris()
# load the data
x_data, y_data = dataset.data, dataset.target.reshape(-1, 1)

print(x_data.shape)
print(y_data.shape)

# split into training and test sets
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x_data, y_data, random_state=0, test_size=0.25)
scaler = preprocessing.MinMaxScaler()
# rescale the data to speed up convergence
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)  # reuse the min/max fitted on the training data
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
model = KNeighborsClassifier()
model.fit(x_train, y_train)
y_predict = model.predict(x_test)

# print the model's score
print(r2_score(y_test, y_predict))
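
# For comparison, a minimal sketch of the same split/scale/classify flow where the
# scaler is fitted on the training fold only and reused for the test fold via a
# scikit-learn Pipeline (the KNN defaults below are assumptions, not the original's):
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.25)
pipe = make_pipeline(MinMaxScaler(), KNeighborsClassifier())
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))  # mean accuracy on the held-out test set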
Example no. 4
    for it in range(n_kernels):
        name = methods_name[it]
        print(name)
        f1 = np.loadtxt('D:/Study/Bioinformatics/补实验/AFP/feature_matrix/' +
                        name_ds + '/' + name + '/train_' + name + '.csv',
                        delimiter=',',
                        skiprows=1)
        f3 = np.loadtxt('D:/Study/Bioinformatics/补实验/AFP/feature_matrix/' +
                        name_ds + '/' + name + '/test_' + name + '.csv',
                        delimiter=',',
                        skiprows=1)

        X_train = get_feature(f1)
        X_test = get_feature(f3)

        scaler = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        gram_train = metrics.pairwise.rbf_kernel(X_train,
                                                 X_train,
                                                 gamma=G_list[it])
        gram_test = metrics.pairwise.rbf_kernel(X_test,
                                                X_train,
                                                gamma=G_list[it])
        kernel_train_list.append(gram_train)
        kernel_test_list.append(gram_test)

    # start from zero so the weighted kernels are summed, not added onto the last Gram matrix
    gram_train = np.zeros_like(kernel_train_list[0])
    gram_test = np.zeros_like(kernel_test_list[0])
    for i in range(n_kernels):
        gram_train += kernel_train_list[i] * weight_v[i]
        gram_test += kernel_test_list[i] * weight_v[i]
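
# The accumulated weighted Gram matrix can be fed straight to an SVM with a
# precomputed kernel; a small self-contained sketch with toy data (the labels,
# gammas and weights below are illustrative, not from the original snippet):
import numpy as np
from sklearn import metrics, svm

rng = np.random.RandomState(0)
X_tr, X_te = rng.rand(40, 5), rng.rand(10, 5)
y_tr = rng.randint(0, 2, 40)
gammas, weights = [0.5, 1.0], [0.3, 0.7]

gram_tr = sum(w * metrics.pairwise.rbf_kernel(X_tr, X_tr, gamma=g)
              for g, w in zip(gammas, weights))
gram_te = sum(w * metrics.pairwise.rbf_kernel(X_te, X_tr, gamma=g)
              for g, w in zip(gammas, weights))

clf = svm.SVC(kernel='precomputed').fit(gram_tr, y_tr)
print(clf.predict(gram_te))  # rows of gram_te are test-vs-train similarities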
Example no. 5
import numpy as np
import time
from sklearn import preprocessing
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from Test3.S_Dbw import S_Dbw

# Experiment 3, step (3)-1: clustering with KMeans

mobilePath = "../实验数据/移动客户数据表.tsv"
np.set_printoptions(precision=2, suppress=True)
min_max_scaler = preprocessing.MinMaxScaler()
x_feature = min_max_scaler.fit_transform(
    np.genfromtxt(mobilePath, skip_header=1, delimiter='\t')[:, 4:])
selector = VarianceThreshold(0)
selector.fit(x_feature)
arr = np.argsort(-selector.variances_)
row_tag = np.genfromtxt(mobilePath,
                        max_rows=1,
                        dtype=str,
                        delimiter='\t',
                        usecols=arr[:20])
x_feature = min_max_scaler.fit_transform(
    np.genfromtxt(mobilePath, skip_header=1, delimiter='\t', usecols=arr[:20]))
time_start = time.time()
clf = KMeans(n_clusters=10)
clf.fit(x_feature)
print('Clustering quality (SSE):', clf.inertia_)
time_end = time.time()
print('Clustering time: {:.2f}'.format(time_end - time_start), 's')
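
# silhouette_score (and the project-local S_Dbw) are imported above but unused in
# this excerpt; a minimal sketch of scoring the fitted KMeans labels with the
# scikit-learn metric (S_Dbw's own call signature is not assumed here):
labels = clf.labels_
print('silhouette:', silhouette_score(x_feature, labels))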
Example no. 6
class TrainingInstance:
    scaler = preprocessing.MinMaxScaler()

    def __init__(self, label, emg, acc, gyr, ori, emgts=None, accts=None,
                 gyrts=None, orits=None):

        self.m_label = label
        # raw data
        self.emg = emg
        self.acc = acc
        self.gyr = gyr
        self.ori = ori

        # time stamps
        self.emgts = emgts
        self.accts = accts
        self.gyrts = gyrts
        self.orits = orits

        self.sr_emg = 200
        self.sr_other = 50

        # splitted flag
        self.splitted = False
        self.consolidated = False
        self.consolidatedFeatures = False

    def separateRawData(self):
        if self.emg is not None:
            self.emgList = np.array(
                [np.array(self.emg[:, 0]), np.array(self.emg[:, 1]),
                 np.array(self.emg[:, 2]), np.array(self.emg[:, 3]),
                 np.array(self.emg[:, 4]), np.array(self.emg[:, 5]),
                 np.array(self.emg[:, 6]), np.array(self.emg[:, 7])])

        if self.acc is not None:
            self.accList = np.array(
                [np.array(self.acc[:, 0]), np.array(self.acc[:, 1]),
                 np.array(self.acc[:, 2])])

        if self.gyr is not None:
            self.gyrList = np.array(
                [np.array(self.gyr[:, 0]), np.array(self.gyr[:, 1]),
                 np.array(self.gyr[:, 2])])

        if self.ori is not None:
            self.oriList = np.array(
                [np.array(self.ori[:, 0]), np.array(self.ori[:, 1]),
                 np.array(self.ori[:, 2]), np.array(self.ori[:, 3])])

        self.splitted = True

    # scale data
    def scaleData(self, scaler):
        if self.splitted == True:
            norm_emgs = []
            norm_accs = []
            norm_gyrs = []
            norm_oris = []

            for x in self.emgList:
                x = x.reshape(-1, 1)
                x = scaler.fit_transform(x)
                reshaped = x.reshape(x.shape[0])
                norm_emgs.append(reshaped)

            for a, b in zip(self.accList, self.gyrList):
                a = a.reshape(-1, 1)
                a = scaler.fit_transform(a)
                reshaped_a = a.reshape(a.shape[0])
                norm_accs.append(reshaped_a)
                b = b.reshape(-1, 1)
                b = scaler.fit_transform(b)
                reshaped_b = b.reshape(b.shape[0])
                norm_gyrs.append(reshaped_b)

            for x in self.oriList:
                x = x.reshape(-1, 1)
                x = scaler.fit_transform(x)
                reshaped = x.reshape(x.shape[0])
                norm_oris.append(reshaped)

            self.emgList = np.array(norm_emgs)
            self.accList = np.array(norm_accs)
            self.gyrList = np.array(norm_gyrs)
            self.oriList = np.array(norm_oris)
        return self

    # normalize data to common length
    def normalizeData(self, max_len_emg, max_len_others):
        if self.splitted == True:
            norm_emgs = []
            norm_accs = []
            norm_gyrs = []
            norm_oris = []

            for x in self.emgList:
                if (x.shape[0] == max_len_emg):
                    norm_emgs.append(x)
                    continue
                if (x.shape[0] < max_len_emg):
                    half = (float(max_len_emg - x.shape[0])) / 2
                    back = ceil(half)
                    front = floor(half)
                    norm_emgs.append(util.padVector(x, front, back, True))

            for a, b in zip(self.accList, self.gyrList):
                if (a.shape[0] == max_len_others):
                    norm_accs.append(a)
                    norm_gyrs.append(b)
                    continue
                if (a.shape[0] < max_len_others):
                    half_a = (float(max_len_others - a.shape[0])) / 2
                    back_a = ceil(half_a)
                    front_a = floor(half_a)

                    half_b = (float(max_len_others - b.shape[0])) / 2
                    back_b = ceil(half_b)
                    front_b = floor(half_b)

                    norm_accs.append(util.padVector(a, front_a, back_a))
                    norm_gyrs.append(util.padVector(b, front_b, back_b))

            for x in self.oriList:
                if (x.shape[0] == max_len_others):
                    norm_oris.append(x)
                    continue
                if (x.shape[0] < max_len_others):
                    half = (float(max_len_others - x.shape[0])) / 2
                    back = ceil(half)
                    front = floor(half)
                    norm_oris.append(util.padVector(x, front, back))

            '''
            # Four axes, returned as a 2-d array
            f, axarr = plt.subplots(2, 2)
            axarr[0, 0].plot(np.arange(len(self.emgList[0])),self.emgList[0])
            axarr[0, 0].set_title('Raw EMG')
            axarr[0, 1].plot(np.arange(len(norm_emgs[0])),norm_emgs[0])
            axarr[0, 1].set_title('Normalized Emg')
            axarr[1, 0].plot(np.arange(len(self.accList[1])),self.accList[1])
            axarr[1, 0].set_title('Raw ACC X')
            axarr[1, 1].plot(np.arange(len(norm_accs[1])),norm_accs[1])
            axarr[1, 1].set_title('Normalized ACC X')
            plt.show()
            '''
            self.emgList = np.array(norm_emgs)
            self.accList = np.array(norm_accs)
            self.gyrList = np.array(norm_gyrs)
            self.oriList = np.array(norm_oris)
        return self

    def resampleData(self, sr, avg_len, emg=True, imu=True):
        '''
        Method for resampling all the signals and bringing them to the
        same sampling rate
        :param sr: (int): sampling rate
        :return: self with all resampled data
        '''
        if self.splitted == True:

            # Calculate the new length of vectors given the new sampling
            # frequency/rate

            sample_len_emg = int((sr * self.emgList[0].size) / self.sr_emg)
            sample_len_emg_others = int(
                (sr * self.accList[0].size) / self.sr_other)

            self.sr_emg = sr
            self.sr_other = sr
            # keep pre-resampling copies; consolidateData() below reads the *_r lists
            self.emgList_r = self.emgList
            self.accList_r = self.accList
            self.gyrList_r = self.gyrList
            self.oriList_r = self.oriList
            # resampling the normalized data
            self.emgList = np.array(
                [signal.resample(x, sample_len_emg) for x in self.emgList])
            self.accList = np.array(
                [signal.resample(x, sample_len_emg_others) for x in
                 self.accList])
            self.gyrList = np.array(
                [signal.resample(x, sample_len_emg_others) for x in
                 self.gyrList])
            self.oriList = np.array(
                [signal.resample(x, sample_len_emg_others) for x in
                 self.oriList])

            self.consolidateData(avg_len, emg, imu)
        return self

    def extractFeatures(self, window=True, scaler=None, rms=False, f_mfcc=False,
                        emg=True, imu=True):
        '''
        This method extracts features from the training instance and
        consolidates them into one feature matrix according to the parameters
        provided
        :param window: (Boolean)                            : To get
        overlapping windowed features
        :param scaler: (Scaler Object as in scikit-learn)   : Scalar object
        to scale the features
        :param rms: (Boolean)                               : To extract
        features from the Root Mean Square of the signals in all dimensions
        :param f_mfcc: (Boolean)                            : To extract MFCC
        features
        :param emg: (Boolean)                               : To extract
        features from EMG signals
        :param imu: (Boolean)                               : To extract
        features from IMU signals
        :return: self
        '''
        # print(self.m_label)
        if self.splitted == True:
            # For RMS
            if rms:
                all_emg = zip(self.emgList[0], self.emgList[1], self.emgList[2],
                              self.emgList[3], self.emgList[4], self.emgList[5],
                              self.emgList[6], self.emgList[7])
                all_acc = zip(self.accList[0], self.accList[1], self.accList[2])
                all_gyr = zip(self.gyrList[0], self.gyrList[1], self.gyrList[2])
                all_ori = zip(self.oriList[0], self.oriList[1], self.oriList[2],
                              self.oriList[3])

                rms_emg = []
                rms_acc = []
                rms_gyr = []
                rms_ori = []

                # calculating RMS for all the signals
                for _0, _1, _2, _3, _4, _5, _6, _7 in all_emg:
                    vec = [_0, _1, _2, _3, _4, _5, _6, _7]
                    rms_val = sqrt(sum(n * n for n in vec) / len(vec))
                    rms_emg.append(rms_val)
                for _0, _1, _2 in all_acc:
                    vec = [_0, _1, _2]
                    rms_val = sqrt(sum(n * n for n in vec) / len(vec))
                    rms_acc.append(rms_val)
                for _0, _1, _2 in all_gyr:
                    vec = [_0, _1, _2]
                    rms_val = sqrt(sum(n * n for n in vec) / len(vec))
                    rms_gyr.append(rms_val)
                for _0, _1, _2, _3 in all_ori:
                    vec = [_0, _1, _2, _3]
                    rms_val = sqrt(sum(n * n for n in vec) / len(vec))
                    rms_ori.append(rms_val)

                # Extracting features for all the signals
                self.emgRmsFeatures = fe.getFeatures(rms_emg, self.sr_emg,
                                                     window, f_mfcc)
                self.accRmsFeatures = fe.getFeatures(rms_acc, self.sr_other,
                                                     window, f_mfcc)
                self.gyrRmsFeatures = fe.getFeatures(rms_gyr, self.sr_other,
                                                     window, f_mfcc)
                self.oriRmsFeatures = fe.getFeatures(rms_ori, self.sr_other,
                                                     window, f_mfcc)

            # for extracting features from raw data
            else:
                self.emgFeatures = np.array(
                    [fe.getFeatures(x, self.sr_emg, window, f_mfcc) for x in
                     self.emgList])
                self.accFeatures = np.array(
                    [fe.getFeatures(x, self.sr_other, window, f_mfcc) for x in
                     self.accList])
                self.gyrFeatures = np.array(
                    [fe.getFeatures(x, self.sr_other, window, f_mfcc) for x in
                     self.gyrList])
                self.oriFeatures = np.array(
                    [fe.getFeatures(x, self.sr_other, window, f_mfcc) for x in
                     self.oriList])

            self.consolidateFeatures(scaler, rms, emg, imu)
        return self

    def consolidateFeatures(self, scaler=None, rms=False, emg=True, imu=True):
        '''
        Method to consolidate the features of all the sensor data in all
        dimensions to a single feature matrix
        :param scaler: (Scaler Object)      : A scaler object to scale the
        features
        :param rms: (Boolean)               : Flag for consolidating RMS
        features
        :param emg: (Boolean)               : Flag to consider features from
        EMG signals
        :param imu: (Boolean)               : Flag to consider IMU Signals
        :return: consolidated_feature_Matrix (ndarray) : with columns as
        features and rows as overlapping window frames. If window was false
        then it just has one row.
        '''
        if self.splitted == True:
            con_emg_feat = None
            con_acc_feat = None
            con_gyr_feat = None
            con_ori_feat = None
            consolidatedFeatureMatrix = None
            if rms:
                if emg:
                    con_emg_feat = self.emgRmsFeatures
                if imu:
                    con_acc_feat = self.accRmsFeatures
                    con_gyr_feat = self.gyrRmsFeatures
                    con_ori_feat = self.oriRmsFeatures
            else:
                if emg:
                    n_emg_rows = self.emgFeatures[0].shape[0]
                    n_emg_columns = self.emgFeatures[0].shape[1]
                    new_n_emg_columns = self.emgFeatures.shape[
                                            0] * n_emg_columns
                if imu:
                    n_acc_rows = self.accFeatures[0].shape[0]
                    n_acc_columns = self.accFeatures[0].shape[1]
                    new_n_acc_columns = self.accFeatures.shape[
                                            0] * n_acc_columns

                    n_gyr_rows = self.gyrFeatures[0].shape[0]
                    n_gyr_columns = self.gyrFeatures[0].shape[1]
                    new_n_gyr_columns = self.gyrFeatures.shape[
                                            0] * n_gyr_columns

                    n_ori_rows = self.oriFeatures[0].shape[0]
                    n_ori_columns = self.oriFeatures[0].shape[1]
                    new_n_ori_columns = self.oriFeatures.shape[
                                            0] * n_ori_columns

                if emg:
                    con_emg_feat = np.reshape(self.emgFeatures,
                                              (n_emg_rows, new_n_emg_columns))
                if imu:
                    con_acc_feat = np.reshape(self.accFeatures,
                                              (n_acc_rows, new_n_acc_columns))
                    con_gyr_feat = np.reshape(self.gyrFeatures,
                                              (n_gyr_rows, new_n_gyr_columns))
                    con_ori_feat = np.reshape(self.oriFeatures,
                                              (n_ori_rows, new_n_ori_columns))
            if emg and imu:
                consolidatedFeatureMatrix = np.concatenate(
                    (con_emg_feat, con_acc_feat), axis=1)
                consolidatedFeatureMatrix = np.concatenate(
                    (consolidatedFeatureMatrix, con_gyr_feat), axis=1)
                consolidatedFeatureMatrix = np.concatenate(
                    (consolidatedFeatureMatrix, con_ori_feat), axis=1)
            elif emg and (not imu):
                consolidatedFeatureMatrix = con_emg_feat
            elif (not emg) and imu:
                consolidatedFeatureMatrix = con_acc_feat
                consolidatedFeatureMatrix = np.concatenate(
                    (consolidatedFeatureMatrix, con_gyr_feat), axis=1)
                consolidatedFeatureMatrix = np.concatenate(
                    (consolidatedFeatureMatrix, con_ori_feat), axis=1)
            else:
                return None
            '''
            consolidatedFeatureMatrix = np.concatenate((con_emg_feat,
            con_acc_feat), axis=1)
            consolidatedFeatureMatrix = np.concatenate((
            consolidatedFeatureMatrix, con_gyr_feat), axis=1)
            consolidatedFeatureMatrix = np.concatenate((
            consolidatedFeatureMatrix, con_ori_feat), axis=1)
            '''
            self.consolidatedFeatureMatrix = consolidatedFeatureMatrix
            self.consolidatedFeatures = True
            if scaler is not None:
                consolidatedFeatureMatrix = scaler.fit_transform(
                    consolidatedFeatureMatrix)
            return consolidatedFeatureMatrix
        else:
            return None

    def consolidateData(self, avg_len, emg, imu):
        consolidatedDataMatrix = None
        if self.splitted == True:
            if emg and imu:
                emg_r = np.array(
                    [signal.resample(x, avg_len) for x in self.emgList_r])
                acc_r = np.array(
                    [signal.resample(x, avg_len) for x in self.accList_r])
                gyr_r = np.array(
                    [signal.resample(x, avg_len) for x in self.gyrList_r])
                ori_r = np.array(
                    [signal.resample(x, avg_len) for x in self.oriList_r])
                consolidatedDataMatrix = np.concatenate(
                    (emg_r, acc_r, gyr_r, ori_r), axis=0)
            elif emg and (not imu):
                consolidatedDataMatrix = self.emgList
            elif (not emg) and imu:
                consolidatedDataMatrix = np.concatenate(
                    (self.accList, self.gyrList, self.oriList), axis=0)
            else:
                emg_r = np.array(
                    [signal.resample(x, avg_len) for x in self.emgList_r])
                acc_r = np.array(
                    [signal.resample(x, avg_len) for x in self.accList_r])
                gyr_r = np.array(
                    [signal.resample(x, avg_len) for x in self.gyrList_r])
                ori_r = np.array(
                    [signal.resample(x, avg_len) for x in self.oriList_r])
                consolidatedDataMatrix = np.concatenate(
                    (emg_r, acc_r, gyr_r, ori_r), axis=0)
            self.consolidatedDataMatrix = consolidatedDataMatrix.transpose()
            self.consolidated = True
            return consolidatedDataMatrix
        else:
            return None

    def getConsolidatedFeatureMatrix(self):
        if self.consolidatedFeatures:
            return self.consolidatedFeatureMatrix

    def getConsolidatedDataMatrix(self):
        if self.consolidated:
            return self.consolidatedDataMatrix

    def getRawData(self):
        return self.emg, self.acc, self.gyr, self.ori

    def getData(self):
        if self.splitted is True:
            return self.emg, self.acc, self.gyr, self.ori, self.emgList, \
                   self.accList, self.gyrList, self.oriList
        else:
            return self.emg, self.acc, self.gyr, self.ori

    def getIndevidualFeatures(self, meanNormalized=False):
        '''
        Build one feature vector per individual channel (8 EMG, 3 acc, 3 gyr
        and 4 ori channels). Each channel's feature matrix is optionally
        scaled with the class-level MinMaxScaler, then flattened with the
        class label inserted after the first row's features.
        :param meanNormalized: (Boolean) : scale each channel's features first
        :return: tuple of 18 per-channel feature vectors, or None
        '''
        if self.splitted and self.consolidatedFeatures:

            def prepare(feat):
                if meanNormalized:
                    feat = self.scaler.fit_transform(feat)
                # flatten and insert the label after the first row's features
                return np.insert(feat, len(feat[0]), self.m_label)

            # assumes the usual channel counts: 8 EMG, 3 acc, 3 gyr and 4 ori
            per_channel = ([prepare(feat) for feat in self.emgFeatures] +
                           [prepare(feat) for feat in self.accFeatures] +
                           [prepare(feat) for feat in self.gyrFeatures] +
                           [prepare(feat) for feat in self.oriFeatures])
            return tuple(per_channel)
        else:
            return None
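
# A minimal usage sketch for the class above; the array shapes, the label and the
# scaler are illustrative assumptions (8 EMG channels at 200 Hz, 3-axis acc/gyr
# and a 4-element orientation at 50 Hz):
import numpy as np
from sklearn import preprocessing

emg = np.random.rand(400, 8)
acc = np.random.rand(100, 3)
gyr = np.random.rand(100, 3)
ori = np.random.rand(100, 4)

inst = TrainingInstance('wave', emg, acc, gyr, ori)
inst.separateRawData()                        # split into per-channel arrays
inst.scaleData(preprocessing.MinMaxScaler())  # scale each channel to [0, 1]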
Example no. 7
df_test = pd.read_csv(r'test_pub.csv')
df = pd.read_csv(r'train.csv')

df_onehot = pd.get_dummies(df)

keys = df_onehot.keys()
data_keys = [k for k in keys if '?' not in k and k[-3:] != "50K"]
data_train = df_onehot[data_keys]
target_train = df_onehot["Salary_ >50K"]

df_onehot1 = pd.get_dummies(df_test)
# add all zero to non-existing keys
for k in data_keys:
    if k not in df_onehot1.keys():
        df_onehot1[k] = 0

data_test = df_onehot1[data_keys]

sc = prep.MinMaxScaler()
data_train_s = sc.fit_transform(data_train)
data_test_s = sc.transform(data_test)

lr = LogisticRegression()
lr.fit(data_train_s, target_train)
# Predict the probability of the positive class
pred_test_prob = lr.predict_proba(data_test_s)[:, 1]

df_test["Predicted"] = pred_test_prob
df_test[["ID", "Predicted"]].to_csv("LogisticReg_v0.csv", index=False)
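
# The column-alignment loop above can also be written with pandas reindexing;
# an equivalent sketch using the same data_keys:
data_test = pd.get_dummies(df_test).reindex(columns=data_keys, fill_value=0)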
Example no. 8
age_mean = df['age'].mean()  # mean age, used to fill the nulls
df['age'] = df['age'].fillna(age_mean)  # fill the null ages
fare_mean = df['fare'].mean()  # mean fare, used to fill the nulls
df['fare'] = df['fare'].fillna(fare_mean)  # fill the null fares
df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)
x_OneHot_df = pd.get_dummies(data=df,
                             columns=["embarked"
                                      ])  # one-hot encode the embarked column
ndarray = x_OneHot_df.values  # convert the dataframe to an array
Label = ndarray[:, 0]  # ':' = all rows, 0 = column 0 (the label)
Features = ndarray[:, 1:]  # ':' = all rows, '1:' = column 1 through the last

#preprocessing

from sklearn import preprocessing
minmax_scale = preprocessing.MinMaxScaler(feature_range=(
    0, 1))  # MinMaxScaler rescales each feature
# into the 0-1 feature range
scaledFeatures = minmax_scale.fit_transform(
    Features)  # fit the scaler on Features and transform them
msk = numpy.random.rand(len(all_df)) < 0.8  # random mask for an 8:2 split
train_df = all_df[msk]  # train 80%
test_df = all_df[~msk]  # test 20%
'''
def PreprocessData(raw_df):
    df=raw_df.drop(['name'], axis=1)
    age_mean = df['age'].mean()
    df['age'] = df['age'].fillna(age_mean)
    fare_mean = df['fare'].mean()
    df['fare'] = df['fare'].fillna(fare_mean)
    df['sex']= df['sex'].map({'female':0, 'male': 1}).astype(int)
    x_OneHot_df = pd.get_dummies(data=df,columns=["embarked" ])
Example no. 9
def min_max_PandasNorm(df):
    x = df.values
    min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
    x_norm = min_max_scaler.fit_transform(x)
    return pd.DataFrame(x_norm)
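
# A small variant sketch of the helper above that keeps the original column
# names and index instead of the default integer labels:
import pandas as pd
from sklearn import preprocessing

def min_max_pandas_norm_keep_labels(df):
    scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
    return pd.DataFrame(scaler.fit_transform(df.values),
                        columns=df.columns, index=df.index)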
Example no. 10
def get_training_data(X_columns,
                      categorical,
                      floating,
                      integer,
                      pos_data,
                      neg_data,
                      sample_size=0.1,
                      seed=9527):
    # synthesize a training dataset from positive and negative data;
    # the positive dataset is smaller
    pos_clear = pos_data[pos_data['WeatherCond'] == 'Clear']
    pos_other = pos_data[pos_data['WeatherCond'] != 'Clear']
    # 16000 was chosen temporarily because about 16000 crashes are associated with rainy weather,
    # and this is about the same order of magnitude as the other weather conditions
    pos_clear_sub = pos_clear.sample(16000, random_state=seed)
    pos_data = pd.concat([pos_clear_sub, pos_other])

    sample_size = int(min(neg_data.shape[0], pos_data.shape[0]) * sample_size)
    pos_data['Crash'] = [1 for i in xrange(len(pos_data))]
    neg_data['Crash'] = [0 for i in xrange(len(neg_data))]

    columns = X_columns + ['Crash']
    data_df_pos = pos_data.sample(sample_size, random_state=seed)
    data_df_neg = neg_data.sample(sample_size, random_state=seed)
    data_df = pd.concat([data_df_pos[columns], data_df_neg[columns]])

    data_df[categorical] = data_df[categorical].astype(str)
    data_df[floating] = data_df[floating].astype('float64')
    data_df[integer] = data_df[integer].astype('int64')

    data_df_catagorical = data_df.select_dtypes(exclude=['float64', 'int64'])
    data_df_numerical = data_df.select_dtypes(include=['float64', 'int64'])
    # TODO one hot encode the categoricals and start training a model
    ohe = preprocessing.OneHotEncoder(sparse=False)

    d = defaultdict(preprocessing.LabelEncoder)
    data_df_labelenc = data_df_catagorical.apply(
        lambda x: d[x.name].fit_transform(x))

    # print data_df_catagorical
    # print data_df_labelenc.values

    x_ohe = ohe.fit_transform(data_df_labelenc.values)

    x_preprocessed = np.concatenate(
        (data_df_numerical.values[:, 0:data_df_numerical.shape[1] - 1], x_ohe),
        axis=1)
    # TODO don't scale before splitting into training and test set. Add a parameter that changes whether the return is (x, y) or (x_train, y_train, x_test, y_test)
    # TODO change this to a MinMax scaler to avoid distorting coordinate data with a normal distribution
    # sscaler = preprocessing.StandardScaler()
    sscaler = preprocessing.MinMaxScaler()
    sscaler.fit(x_preprocessed[:, 0:data_df_numerical.shape[1] - 1])
    x_preprocessed[:, 0:data_df_numerical.shape[1] - 1] = sscaler.transform(
        x_preprocessed[:, 0:data_df_numerical.shape[1] - 1])
    y_preprocessed = data_df_numerical.values[:, -1]

    #or_x_preprocessed = sscaler.inverse_transform(x_preprocessed[:, 0:data_df_numerical.shape[1] - 1])
    #or_x_preprocessed = pd.DataFrame(or_x_preprocessed)
    #n_c=data_df_numerical.columns.values.tolist()
    #or_x_preprocessed.columns = n_c

    return x_preprocessed, y_preprocessed, ohe, d, sscaler
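
# The TODO above notes that scaling should happen after the train/test split; a
# minimal sketch of that ordering on the arrays returned above (n_num stands for
# the number of numeric columns, i.e. data_df_numerical.shape[1] - 1):
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x_preprocessed, y_preprocessed, test_size=0.2, random_state=9527)
scaler = preprocessing.MinMaxScaler()
# fit the scaler on the training rows only, then reuse its min/max on the test rows
x_train[:, 0:n_num] = scaler.fit_transform(x_train[:, 0:n_num])
x_test[:, 0:n_num] = scaler.transform(x_test[:, 0:n_num])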
Example no. 11
def read(filename):
    spectrogram = pd.read_csv(filename, sep =',')
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(spectrogram)
    arr2D = x_scaled
    return arr2D
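
# Small usage sketch for the helper above (the file name is a placeholder);
# after MinMax scaling, every column of the spectrogram spans [0, 1]:
arr2D = read('spectrogram.csv')
print(arr2D.min(axis=0), arr2D.max(axis=0))  # per-column minima ~0 and maxima ~1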
Example no. 12
def get_data_for_training_preprocessed(G,
                                       X_columns=None,
                                       Y_column=None,
                                       X_cat=None,
                                       X_int=None,
                                       X_float=None,
                                       multiple_x='newest',
                                       multiple_y='newest',
                                       verbose=False):
    samples = []
    X_c = X_columns
    for edge in G.edges_iter(data=True):
        data = edge[2]
        if Y_column in data:
            midpoint = ((edge[0][0] + edge[1][0]) / 2,
                        (edge[0][1] + edge[1][1]) / 2)

            sample = [midpoint[0], midpoint[1]]

            for column in X_columns:
                try:
                    if isinstance(data[column], list):
                        if multiple_x == 'newest':
                            sample.append(data[column][-1])
                        if multiple_x == 'sum':
                            sample.append(sum(data[column]))
                        if multiple_x == 'average':
                            sample.append(
                                sum(data[column]) / len(data[column]))
                    else:
                        sample.append(data[column])
                except:
                    sample.append(None)

            if isinstance(data[Y_column], list):
                if multiple_y == 'newest':
                    sample.append(data[Y_column][-1])
                if multiple_y == 'sum':
                    sample.append(sum(data[Y_column]))
                if multiple_y == 'average':
                    sample.append(sum(data[Y_column]) / len(data[Y_column]))
            else:
                sample.append(data[Y_column])

            samples.append(sample)

    if verbose:
        print 'done creating model training data with ' + str(
            len(samples)) + " samples"

    data_df = pd.DataFrame(samples)

    col = ['X', 'Y']
    col = col + X_c
    col.append('attribute')
    data_df.columns = col

    #data_df.to_csv('C:/Users/husiy/PyProgram/OPEN DATA NATION/Chicago_Test/test710.csv', index=False)

    cl = data_df.columns.get_values()
    # print cl
    det = []
    # dl=xrange(len(cl))
    for c in cl:
        # print sum(pd.notnull(data_df.iloc[:,c]))
        if sum(pd.notnull(data_df[c])) <= 0.8 * data_df.shape[0]:
            det.append(c)
            # dl.append(c-2)
            # for il in sorted(dl, reverse=True):
            # del X_c[il]
    # print det
    data_df = data_df.drop(det, 1)
    data_df = data_df.dropna()
    for dc in det:
        X_c.remove(dc)
        # print X_c
        # print 'dropna', data_df.shape

        # print data_df.head()
        # print data_df.dtypes

    # print data_df.head()

    if X_cat != None:
        data_df[X_cat] = data_df[X_cat].astype(str)
    if X_int != None:
        data_df[X_int] = data_df[X_int].astype('int64')
    if X_float != None:
        data_df[X_float] = data_df[X_float].astype('float64')
    print data_df.dtypes

    data_df_catagorical = data_df.select_dtypes(exclude=['float64', 'int64'])
    data_df_numerical = data_df.select_dtypes(include=['float64', 'int64'])
    # TODO one hot encode the categoricals and start training a model
    if len(data_df_catagorical.columns) != 0:

        # TODO one hot encode the categoricals and start training a model
        ohe = preprocessing.OneHotEncoder(sparse=False)

        d = defaultdict(preprocessing.LabelEncoder)
        data_df_labelenc = data_df_catagorical.apply(
            lambda x: d[x.name].fit_transform(x))

        # print data_df_catagorical
        # print data_df_labelenc.values

        x_ohe = ohe.fit_transform(data_df_labelenc.values)

        x_preprocessed = np.concatenate(
            (data_df_numerical.values[:, 0:data_df_numerical.shape[1] - 1],
             x_ohe),
            axis=1)
    else:
        x_preprocessed = data_df_numerical.values[:, 0:data_df_numerical.
                                                  shape[1] - 1]

    sscaler = preprocessing.MinMaxScaler()
    sscaler.fit(x_preprocessed[:, 0:data_df_numerical.shape[1] - 1])
    x_preprocessed[:, 0:data_df_numerical.shape[1] - 1] = sscaler.transform(
        x_preprocessed[:, 0:data_df_numerical.shape[1] - 1])
    y_preprocessed = data_df_numerical.values[:, -1]

    or_x_preprocessed = sscaler.inverse_transform(
        x_preprocessed[:, 0:data_df_numerical.shape[1] - 1])
    or_x_preprocessed = pd.DataFrame(or_x_preprocessed)
    n_c = data_df_numerical.columns.values.tolist()
    n_c.remove('attribute')
    or_x_preprocessed.columns = n_c

    # Saving transformations for later use
    try:
        OHE = ohe
        LabelEncoder = d
    except:
        pass

    return x_preprocessed, y_preprocessed, ohe, d, sscaler
Example no. 13
def calibration_main(locator, config):
    # INITIALIZE TIMER
    t0 = time.clock()

    # Local variables
    building_name = config.single_calibration.building
    building_load = config.single_calibration.load
    iteration_pymc3 = config.single_calibration.iterations
    with open(locator.get_calibration_problem(building_name, building_load), 'r') as input_file:
        problem = pickle.load(input_file)
    emulator = joblib.load(locator.get_calibration_gaussian_emulator(building_name, building_load))
    distributions = problem['probabiltiy_vars']
    variables = problem['variables']

    # Create function to call predictions (mu)
    @as_op(itypes=[tt.dscalar, tt.dscalar, tt.dscalar, tt.dscalar, tt.dscalar, tt.dscalar], otypes=[tt.dvector])
    def predict_y(var1, var2, var3, var4, var5, var6):
        input_sample = np.array([var1, var2, var3, var4, var5, var6]).reshape(1, -1)
        prediction = emulator.predict(input_sample)
        return prediction

    # Create function to call predictions (sigma)
    @as_op(itypes=[tt.dscalar, tt.dscalar, tt.dscalar, tt.dscalar, tt.dscalar, tt.dscalar], otypes=[tt.dvector])
    def predict_sigma(var1, var2, var3, var4, var5, var6):
        input_sample = np.array([var1, var2, var3, var4, var5, var6]).reshape(1, -1)
        _, sigma = emulator.predict(input_sample, return_std=True)
        return sigma

    with pymc3.Model() as basic_model:

        # DECLARE PRIORS
        for i, variable in enumerate(variables):
            arguments = np.array([distributions.loc[variable, 'min'], distributions.loc[variable, 'max'],
                                  distributions.loc[variable, 'mu']]).reshape(-1, 1)
            min_max_scaler = preprocessing.MinMaxScaler(copy=True, feature_range=(0, 1))
            arguments_norm = min_max_scaler.fit_transform(arguments)
            globals()['var' + str(i + 1)] = pymc3.Triangular('var' + str(i + 1), lower=arguments_norm[0][0],
                                                             upper=arguments_norm[1][0], c=arguments_norm[2][0])

        # DECLARE OBJECTIVE FUNCTION
        mu = pymc3.Deterministic('mu', predict_y(var1, var2, var3, var4, var5, var6))
        sigma = pymc3.HalfNormal('sigma', 0.15)
        # sigma = pm.Deterministic('sigma', predict_sigma(var1, var2, var3, var4, var5, var6))
        y_obs = pymc3.Normal('y_obs', mu=mu, sd=sigma, observed=0)

        # RUN MODEL, SAVE TO DISC AND PLOT RESULTS
        with basic_model:
            # Running
            step = pymc3.Metropolis()
            trace = pymc3.sample(iteration_pymc3, tune=1000, njobs=1, step=step)
            # Saving
            df_trace = pymc3.trace_to_dataframe(trace)

            #CREATE GRAPHS AND SAVE TO DISC
            df_trace.to_csv(locator.get_calibration_posteriors(building_name, building_load))
            pymc3.traceplot(trace)

            columns = ["var1", "var2", "var3", "var4", "var5", "var6"]
            seaborn.pairplot(df_trace[columns])

            if config.single_calibration.show_plots:
                plt.show()


    #SAVING POSTERIORS IN PROBLEM
    problem['posterior_norm'] = df_trace.as_matrix(columns=columns)
    pickle.dump(problem, open(locator.get_calibration_problem(building_name, building_load), 'w'))

    return
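
# In the prior loop above, MinMaxScaler is applied to the single column
# [min, max, mu], which simply maps min -> 0, max -> 1 and mu to its relative
# position in between; a tiny worked sketch with illustrative numbers:
import numpy as np
from sklearn import preprocessing

arguments = np.array([0.5, 2.0, 1.25]).reshape(-1, 1)  # [min, max, mu]
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
print(scaler.fit_transform(arguments).ravel())  # -> [0.  1.  0.5]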
Example no. 14

# In[16]:


adata.raw = adata

adata = adata[:, adata.var.highly_variable]

data=adata.X


# In[17]:


mmscaler = preprocessing.MinMaxScaler()


# In[18]:


data = mmscaler.fit_transform(data)


# In[19]:


Xtarget_train, Xtarget_valid = train_test_split(data, test_size=valid_size, random_state=42)


# In[20]:
Example no. 15
url = 'C:\\Users\\Lenovo\\Desktop\\Sani\\andicatot\\data.txt'
#########################

names = ['Date','Open','High','Low','Close','Volume','OpenInt']

dataset = pd.read_csv(url,names = names)

dataset = dataset.drop(0,axis = 0)
dataset = dataset.drop('Date',axis = 1)
dataset = dataset.drop('OpenInt',axis = 1)
# convert the price/volume columns from strings to floats
for col in names[1:6]:
    dataset[col] = dataset[col].astype(float)
# len = 3201
# learn = [0:2731]
# test = [2731:3201]

data_normaliser = preprocessing.MinMaxScaler()
dataset = data_normaliser.fit_transform(dataset)

deltap = []
for i in range(1,3201):
    deltap.append(dataset[i-1][0] - dataset[i][0])
deltat = deltap[:2731]
for i in range(2731,3200):
    deltat.append(deltap[i-1]-deltap[i])
plt.plot(deltap,color = 'red')
plt.plot(deltat,color = 'green')
style = plt.gcf()
style.set_size_inches(12,10)
plt.show()
Example no. 16
data.features = data[["text"]]
df = pd.DataFrame(data.features)
data.features = data["text"].apply(lambda x: remove_puncs(x))
data.features = sent_tokenize(str(data.features))
data.features = word_tokenize(str(data.features))
#data.features=[word for word in data.features if word.isalpha()]

#data.features=nltk.word_tokenize(data.features)

#data.features=data.features.apply(lambda x: ' '.join([word for word in x if word not in stopwords.words()]))
#df['text']=pd.to_numeric(df["text"],errors="coerce")
#data.features=data["text"].apply(lambda x:remove_stopwords(x))
data.target = data.Label
#print(dtypes)
#data.features = SimpleImputer(missing_values=np.nan, strategy='mean')
print(data.features)
data.features = pd.get_dummies(data["text"])

data.features = preprocessing.MinMaxScaler().fit_transform(data.features)

feature_train, feature_test, target_train, target_test = train_test_split(
    data.features, data.target, test_size=0.25)

model = KNeighborsClassifier(n_neighbors=52)

fittedModel = model.fit(feature_train, target_train)
predictions = fittedModel.predict(feature_test)
predTrain = fittedModel.predict(feature_train)
print("Test:-", accuracy_score(target_test, predictions))
print("Training:-", accuracy_score(target_train, predTrain))
print(feature_train)
Example no. 17
    training_data.append(get_tuple(results, i, n))
    training_labels.append(results["y"].values[i])


def map_range(x, in_min, in_max, out_min, out_max):
    return (x - in_min) * (out_max - out_min) / (in_max - in_min) + out_min


training_data_sigmoid = training_data.copy()
for data_point in training_data:
    for x in data_point:
        x = x

training_labels_sigmoid = training_labels.copy()

min_max_scaler = preprocessing.MinMaxScaler()
training_labels_sigmoid = min_max_scaler.fit_transform(
    results[['y']].values.astype(float))

preceptron_sigmoid.train(training_data_sigmoid,
                         training_labels_sigmoid,
                         epochs=10)

print("Trained Weights (sigmoid):", preceptron_sigmoid._weights)

min_max_scaler = preprocessing.MinMaxScaler(feature_range=(1, 3))

plt.plot(
    results["x"].values,
    min_max_scaler.fit_transform(
        np.array([
Example no. 18
plt.rcParams['font.size'] = 15
plt.rcParams['font.family'] = 'Times New Roman'
from math import sqrt
from sklearn.metrics import mean_squared_error

np.random.seed(1337)  # for reproducibility
import warnings

warnings.filterwarnings('ignore')

data_dim = 4
timesteps = 6
out_dim = 6
dataset = pd.read_csv('multistep_feature.csv', header=None)

min_max_scaler_input = preprocessing.MinMaxScaler()  # scaler for the inputs
min_max_scaler_output = preprocessing.MinMaxScaler()  # scaler for the outputs

data_input = dataset.iloc[:, :24].values  # input data
data_output = dataset.iloc[:, 24:].values  # output data

trainlen = int(len(data_input) * 0.8)  # number of training samples
testlen = int(len(data_input) - trainlen)  # number of test samples

train_output = data_output[:trainlen]  # training output data
test_output = data_output[trainlen:]  # test output data

data_input = min_max_scaler_input.fit_transform(data_input)  # scale the inputs
data_output = min_max_scaler_output.fit_transform(data_output)  # scale the outputs

x_train = data_input[:trainlen].reshape(trainlen, timesteps, data_dim)  # training inputs
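
# Because the outputs are scaled with their own MinMaxScaler above, predictions
# made in the scaled space have to be mapped back before computing an error in
# the original units; a minimal sketch (the model and x_test are assumptions):
y_pred_scaled = model.predict(x_test)                    # shape (n, out_dim), scaled
y_pred = min_max_scaler_output.inverse_transform(y_pred_scaled)
print(sqrt(mean_squared_error(test_output, y_pred)))     # RMSE in original units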
Example no. 19
def normalization(data):
    data_np = data.values  #returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(data_np)
    X_train_nor = pd.DataFrame(x_scaled)
    return X_train_nor
Example no. 20
    def set_police_norm(self):
        crime = Crime()
        crime_police = crime.set_police_norm()
        police = pd.pivot_table(crime_police, index='구별', aggfunc=np.sum)
        print(f'{police.head()}')
        police['살인검거율'] = (police['살인 검거'] / police['살인 발생']) * 100
        police['강간검거율'] = (police['강간 검거'] / police['강간 발생']) * 100
        police['강도검거율'] = (police['강도 검거'] / police['강도 발생']) * 100
        police['절도검거율'] = (police['절도 검거'] / police['절도 발생']) * 100
        police['폭력검거율'] = (police['폭력 검거'] / police['폭력 발생']) * 100

        police = police.drop(columns=['살인 검거', '강간 검거', '강도 검거', '절도 검거', '폭력 검거'])
        crime_rate_columns = ['살인검거율', '강간검거율', '강도검거율', '절도검거율', '폭력검거율']

        for i in crime_rate_columns:
            # period mismatches in the raw data can push a rate above 100%; cap it at 100
            police.loc[police[i] > 100, i] = 100

        police.rename(columns={
            '살인 발생': '살인',
            '강간 발생': '강간',
            '강도 발생': '강도',
            '절도 발생': '절도',
            '폭력 발생': '폭력',
        },
                      inplace=True)
        crime_columns = ['살인', '강간', '강도', '절도', '폭력']

        x = police[crime_columns].values

        min_max_scalar = preprocessing.MinMaxScaler()
        """
        Scaling applies a linear transformation to each feature;
        MinMaxScaler maps every column onto the [0, 1] range
        (min -> 0, max -> 1), unlike standardization, which targets
        mean 0 and variance 1.
        """
        x_scaled = min_max_scalar.fit_transform(x.astype(float))
        """
        Normalization: when handling large amounts of data, the ranges of
        different columns are often aligned (or their distributions made
        comparable) so that no single column dominates the analysis.
        Common variants include mean normalization and median normalization.
        """
        police_norm = pd.DataFrame(x_scaled,
                                   columns=crime_columns,
                                   index=police.index)
        police_norm[crime_rate_columns] = police[crime_rate_columns]

        cctv = Cctv()

        cctv_pop = cctv.get_cctv_pop()
        print(f'cctv_pop : {cctv_pop.head()}')

        police_norm['범죄'] = np.sum(police_norm[crime_columns], axis=1)  # total (normalized) crime score
        police_norm['검거'] = np.sum(police_norm[crime_rate_columns], axis=1)  # total arrest-rate score
        print(f'police_norm columns :: {police_norm.columns}')

        reader = self.reader
        reader.context = os.path.join(baseurl, 'saved_data')
        reader.fname = 'police_norm.csv'

        police_norm.to_csv(reader.new_file(), sep=',', encoding='utf-8')
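
To make the MinMaxScaler / standardization contrast above concrete, a small standalone sketch with toy numbers:

import numpy as np
from sklearn import preprocessing

x = np.array([[10.0], [20.0], [40.0]])
print(preprocessing.MinMaxScaler().fit_transform(x).ravel())    # [0.  0.333...  1.] -> squeezed into [0, 1]
print(preprocessing.StandardScaler().fit_transform(x).ravel())  # zero mean, unit variance instead
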
def clean_regression_data(input_path, output_path):
    """preparing preloaded data for regression and visualizaiton
    Warning:
    This function directly calls data_clean.csv from data folder
    Do not remove this file!
    Input:
    input_path - local path for data_clean.csv
    output_path - local path for outputing
    Output:
    Cleaned data frame ready for regression analysis
    and model building
    """
    df = pd.read_csv(input_path, encoding="latin1")
    # drop unnecessary columns
    df = df.drop([
        "Unnamed: 0", "imdb_id", "Title", "X.x", "X.y", "Country", "Actors",
        "Director", "Year", "Production"
    ],
                 axis=1)
    # drop_missing values
    mis_val_col = ["Genre", "IMDB.Votes", "Runtime", "IMDB.Rating", "Language"]
    for col in mis_val_col:
        df = df.drop(df[df[col].isnull()].index)
    # budget
    df["budget"] = df["budget"].map(lambda x: math.log10(x))
    # revenue
    df["revenue"] = df["revenue"].map(lambda x: math.log10(x))
    # genre
    df = pd.concat([df, df['Genre'].str.get_dummies(sep=', ')], axis=1)
    df['Thriller'] = df[['Thriller', 'Horror']].sum(axis=1)
    df['Fantasy'] = df[['Fantasy', 'Sci-Fi']].sum(axis=1)
    df['Other_genre'] = df[[
        'Music', 'History', 'Sport', 'War', 'Western', 'Musical',
        'Documentary', 'News'
    ]].sum(axis=1)
    df.drop([
        'Music', 'History', 'Sport', 'War', 'Western', 'Musical',
        'Documentary', 'News', 'Horror', 'Sci-Fi'
    ],
            axis=1,
            inplace=True)
    genre_lst = list(df)[19:32]
    for x in genre_lst:
        df.loc[df['%s' % x] > 1, '%s' % x] = 1
    df = df.drop("Genre", axis=1)
    # IMDB.Votes
    df['IMDB.Votes'] = df['IMDB.Votes'].replace(',', '', regex=True)
    df['IMDB.Votes'] = df['IMDB.Votes'].astype(int)
    df["IMDB.Votes"] = df["IMDB.Votes"].map(lambda x: math.log10(x))
    # language
    df['Language'] = df.Language.str.count(',') + 1
    # rated
    df["Rated"] = df["Rated"].replace(np.nan, "UNRATED")\
        .replace("NOT RATED", "UNRATED")
    df = df.drop(df[(df["Rated"] == "TV-MA") | (df["Rated"] == "TV-PG") |
                    (df["Rated"] == "TV-14")].index)
    df = pd.concat([df, df['Rated'].str.get_dummies(sep=', ')], axis=1)
    # released
    # index of released date col
    index = df.columns.get_loc("Released")
    # change date data to timestamp
    release_dates = pd.to_datetime(df["Released"])
    # released date is weekend of not
    weekend_list = []
    for each in release_dates:
        day_ofweek = each.dayofweek
        if day_ofweek >= 4 and day_ofweek <= 6:
            tag = 1
        else:
            tag = 0
        weekend_list.append(tag)
    # released date is on dump months
    undumpmonth_list = []
    for each in release_dates:
        month = each.month
        if month == 12 or month == 1 or month == 2 or month == 8 or month == 9:

            tag = 0
        else:
            tag = 1
        undumpmonth_list.append(tag)
    df.insert(loc=index + 1, column="released_on_weekend", value=weekend_list)
    df.insert(loc=index + 2,
              column="released_not_on_dump_month",
              value=undumpmonth_list)
    df.drop("Released", axis=1)
    # runtime
    df["Runtime"] = df["Runtime"].map(lambda x: int(x.strip("min")))
    # normalization
    x1 = df[[
        'IMDB.Rating', 'IMDB.Votes', 'Language', 'Runtime', 'budget',
        'actor_popularity', 'director_popularity'
    ]]
    x2 = df[[
        'released_on_weekend', 'released_not_on_dump_month', 'Action',
        'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Drama',
        'Family', 'Fantasy', 'Mystery', 'Romance', 'Thriller', 'Other_genre',
        'G', 'NC-17', 'PG', 'PG-13', 'R', 'UNRATED'
    ]]
    y = df['revenue'].reset_index().drop("index", axis=1)
    normalizer = preprocessing.MinMaxScaler()
    x1 = normalizer.fit_transform(x1)
    x1 = pd.DataFrame(x1,
                      columns=[
                          'IMDB.Rating', 'IMDB.Votes', 'Language', 'Runtime',
                          'budget', 'actor_popularity', 'director_popularity'
                      ])
    x2 = x2.reset_index().drop("index", axis=1)
    X = pd.concat([x1, x2], axis=1)
    df_for_model = pd.concat([X, y], axis=1)
    df_for_model.to_csv(output_path, encoding="latin1")
    return df_for_model
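
A minimal usage sketch of clean_regression_data (the paths here are placeholders, not the project's actual layout):

df_model = clean_regression_data("data/data_clean.csv", "data/regression_ready.csv")
print(df_model.shape)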
                                 Split into training and test set
*********************************************************************************************************************
'''

# sklearn.cross_validation has been removed from scikit-learn; model_selection provides train_test_split
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    boston.data, boston.target, test_size=0.2, random_state=0)


print('''
*********************************************************************************************************************
                                 Standardize / Normalize
*********************************************************************************************************************
''')

# fit the scaler on training set and apply same to test set
#scaler = preprocessing.StandardScaler().fit(X_train)
scaler = preprocessing.MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#print(scaler.mean_)
#print(scaler.scale_)
print(scaler.data_min_)
print(scaler.data_max_)

print(X_train[:5, :])
print(X_test[:5, :])
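
A quick illustration (toy numbers, not the Boston data) of why the scaler is fitted on the training set only: test values outside the training min/max simply fall outside [0, 1] after the same transform.

import numpy as np
toy_scaler = preprocessing.MinMaxScaler().fit(np.array([[0.], [10.]]))  # training range 0..10
print(toy_scaler.transform(np.array([[5.], [12.]])))                    # -> [[0.5], [1.2]]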


print('''
*********************************************************************************************************************
                                 Linear regression
Esempio n. 23
0
from django.db.models import F
from binarystars.models import InterpolatedBinaryStars
import numpy as np
from random import randint
from sklearn.cluster import KMeans, DBSCAN
from sklearn import preprocessing
import binarystars.cluster.clusteredstar as cstar

MAX_ROWS = 1001  # might have to change this to be a calculation like what is done in interpolate.py
LOWER_SEED_BOUND = 1
# 2**31: a deliberately large upper bound so generated seeds are spread over a wide range
UPPER_SEED_BOUND = 2147483648

DATA_PROCESSORS = {
    "minmax": preprocessing.MinMaxScaler(),
    "abs": preprocessing.MaxAbsScaler(),
    "standard": preprocessing.StandardScaler()
}


def preprocess_data(data: np.ndarray, standardizer: str) -> np.ndarray:
    return DATA_PROCESSORS[standardizer].fit_transform(data)
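
A brief usage sketch of the scaler dispatch above (the array is illustrative). Note that the entries in DATA_PROCESSORS are module-level instances, so every call refits the shared scaler.

sample = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
print(preprocess_data(sample, "minmax"))    # columns rescaled to [0, 1]
print(preprocess_data(sample, "standard"))  # columns standardized to mean 0, unit variance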


def get_stars(n_clusters: int = None,
              n_samples: int = None,
              eps: float = None,
              standardizer: str = None,
              cluster_type: str = None,
              attributes: dict = None,
              time_steps: int = 1,
Esempio n. 24
0
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back - look_ahead - 1):
        a = dataset[i:(i + look_back), :]
        dataX.append(a)
        dataY.append(dataset[i + look_back + look_ahead, :])
    return np.array(dataX), np.array(dataY)


sds = pickle.load(open("./GitHub_misc/sds", "rb"))
series = pickle.load(open("./GitHub_misc/series", "rb"))

N, H, W = sds.shape
gblur_size = 5
look_back = 15
look_ahead = 8
mmscaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
modelname = '360net'
model3 = models.load_model(
    './GitHub_misc/model3_{}_128_w16_h9_4000'.format(modelname))
print(model3.summary())

headmap = np.array(
    [create_fixation_map(None, series, idx) for idx, _ in enumerate(series)])
headmap = np.array(
    [cv2.GaussianBlur(item, (gblur_size, gblur_size), 0) for item in headmap])
headmap = mmscaler.fit_transform(headmap.ravel().reshape(-1, 1)).reshape(
    headmap.shape)

ds = np.zeros(shape=(N, 2, H, W))
ds[:, 0, :, :] = sds
ds[:, 1, :, :] = headmap
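
The ravel/reshape step above fits a single global min/max over every pixel instead of per-column statistics; a small hedged illustration of the same trick:

import numpy as np
from sklearn import preprocessing

arr = np.array([[[0., 2.], [4., 8.]]])               # shape (1, 2, 2)
sc = preprocessing.MinMaxScaler(feature_range=(-1, 1))
flat = sc.fit_transform(arr.ravel().reshape(-1, 1))  # one column -> one global min/max
print(flat.reshape(arr.shape))                        # [[[-1., -0.5], [0., 1.]]]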
Esempio n. 25
0
def normalizedata(datain):
    min_max_scaler = preprocessing.MinMaxScaler()
    scaledata = min_max_scaler.fit_transform(datain)
    return scaledata
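
A short note on the helper above: it discards the fitted scaler, so the original units cannot be recovered later. Keeping a reference allows inverse_transform, as in this small sketch:

import numpy as np
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()
scaled = scaler.fit_transform(np.array([[1.0], [5.0], [9.0]]))
print(scaler.inverse_transform(scaled))  # recovers [[1.], [5.], [9.]]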
Esempio n. 26
0
# 3. Plot the performance (such as error rate/accuracy)

from reservoir import onlineESNWithRLS as ESN, ReservoirTopology as topology
from plotting import OutputPlot as outputPlot
import numpy as np
import os
from datetime import datetime
from sklearn import preprocessing as pp
from reservoir import Utility as util
from performance import ErrorMetrics as rmse

# Read data from the file
data = np.loadtxt('darwin.slp.txt')

# Normalize the raw data
minMax = pp.MinMaxScaler((-1, 1))
data = minMax.fit_transform(data.reshape(-1, 1))  # scikit-learn expects a 2-D (n_samples, 1) array

# Divide the data into training data and testing data
trainingData, testingData = util.splitData2(data, 0.8)
nTesting = testingData.shape[0]

# Form feature vectors
inputTrainingData, outputTrainingData = util.formFeatureVectors(trainingData)

# Tune the network
size = int(trainingData.shape[0]/10)
initialTransient = 100

# Input-to-reservoir fully connected
inputWeight = topology.ClassicInputTopology(inputSize=inputTrainingData.shape[1], reservoirSize=size).generateWeightMatrix()
Esempio n. 27
0
# ### Scale Continuous Values

# In[41]:

# fit the scaler on the training set and reuse the same scaler for validation/test

from sklearn import preprocessing

continous = train_df[[
    'trip_distance', 'fare_amount', 'tolls_amount', 'trip_time', 'avg_speed',
    'Precipitation', 'Snow_depth', 'Snowfall', 'Max_temp', 'Min_temp',
    'Avg_wind_speed', 'Gust_speed'
]]

scaler = preprocessing.MinMaxScaler().fit(continous)
continous = scaler.transform(continous)

train_df[[
    'trip_distance', 'fare_amount', 'tolls_amount', 'trip_time', 'avg_speed',
    'Precipitation', 'Snow_depth', 'Snowfall', 'Max_temp', 'Min_temp',
    'Avg_wind_speed', 'Gust_speed'
]] = continous

# In[42]:

# apply scale to validation and test set

# validation
from sklearn import preprocessing
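
Following the "apply scale to validation and test set" comment, the already-fitted scaler would be reused (not refitted); a hedged sketch, assuming a hypothetical validation frame val_df with the same columns:

cols = ['trip_distance', 'fare_amount', 'tolls_amount', 'trip_time', 'avg_speed',
        'Precipitation', 'Snow_depth', 'Snowfall', 'Max_temp', 'Min_temp',
        'Avg_wind_speed', 'Gust_speed']
val_df[cols] = scaler.transform(val_df[cols])  # val_df is a placeholder for the validation DataFrame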
Esempio n. 28
0
X.drop('name', axis=1, inplace=True)
# Splice out the status column:
Y = X['status'].copy()
X.drop('status', axis=1, inplace=True)
# Perform a train/test split:
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=.3,
                                                    random_state=7)
# Program a best-parameter search:
scalers = {
    'NoScaler': False,
    'StandardScaler': preprocessing.StandardScaler(),
    'Normalizer': preprocessing.Normalizer(),
    'MaxAbsScaler': preprocessing.MaxAbsScaler(),
    'MinMaxScaler': preprocessing.MinMaxScaler(),
    'RobustScaler': preprocessing.RobustScaler()
}
best_score = 0
for sk, sv in scalers.items():
    proc = sv
    if proc:
        proc.fit(X_train)
        tX_train = proc.transform(X_train)
        tX_test = proc.transform(X_test)
    else:
        tX_train = X_train.copy()
        tX_test = X_test.copy()
    # Check dimensionality reduction? (PCA, Isomap, None)
    choice = 2
    if choice == 1:
Esempio n. 29
0
]]

# Load in the SC dataframe
pickle_in = open("Rot3_data\\SC_full_df.pkl", "rb")
Animal_SC = pickle.load(pickle_in)
animals = [
    'AA01', 'AA03', 'AA05', 'AA07', 'DO04', 'DO08', 'SC04', 'SC05', 'VP01',
    'VP07', 'VP08'
]  # AA03 and SC04 don't do any trials
sc = clean_up_sc(Animal_SC)

# ESTIMATING THE HYPERPARAMETER
# "The hyperparameter value (λ) was selected independently for each rat using evidence optimization,
# on the basis of fivefold cross-validation."

# after the MinMax transformation the fitted intercept comes out as zero,
# so it makes no difference whether it is included or not
scaler = preprocessing.MinMaxScaler()
x = scaler.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.20,
                                                    random_state=0)

clf = LogisticRegressionCV(cv=5, random_state=0, fit_intercept=True)

# For our example what is baseline accuracy etc.
logreg = LogisticRegression(random_state=0, fit_intercept=True)
logreg.fit(x_train, y_train)  # look up what these values really mean
y_pred = logreg.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
# Accuracy: 0.6352395672333848
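
The LogisticRegressionCV instance defined above is never fitted in the excerpt; a minimal hedged sketch of using it the same way:

clf.fit(x_train, y_train)                        # 5-fold CV selects the regularization strength
print("CV accuracy:", clf.score(x_test, y_test))
print("selected C:", clf.C_)                     # array with the chosen C value(s)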
def func2():
    user = {}
    for line in fileinput.input("../../data/select/select_a"):
        mac = line.strip().split(" ")[0]
        user[mac] = True
    fileinput.close()
    cnt_0, cnt_1 = 0, 0
    docMap_1, docMap_2, docMap_3, docMap_4, classMap = {}, {}, {}, {}, {}
    for line in fileinput.input(
            "../../data/feature/trace_all_statistic_filter_feature_sex"):
        part = line.strip().split(" ")
        mac, sex, feat = part[0], int(part[1]), part[2:]
        if mac in user:
            if sex == 0:
                cnt_0 += 1
            if sex == 1:
                cnt_1 += 1
            _list = []
            for f in feat:
                _list.append(float(f))
            docMap_1[mac] = _list
            classMap[mac] = sex
    fileinput.close()
    print(cnt_0, cnt_1)
    for line in fileinput.input(
            "../../data/feature/trace_online_statistic_filter_feature_sex"):
        part = line.strip().split(" ")
        mac, sex, feat = part[0], int(part[1]), part[2:]
        if mac in user:
            _list = []
            for f in feat:
                _list.append(float(f))
            docMap_2[mac] = _list
    fileinput.close()
    for line in fileinput.input(
            "../../data/feature/trace_http_statistic_filter_feature_sex"):
        part = line.strip().split(" ")
        mac, sex, feat = part[0], int(part[1]), part[2:]
        if mac in user:
            _list = []
            for f in feat:
                _list.append(float(f))
            docMap_3[mac] = _list
    fileinput.close()
    for line in fileinput.input("../../data/feature/keywords_normalize_sex"):
        part = line.strip().split(" ")
        mac, sex, feat = part[0], int(part[1]), part[2:]
        if mac in user:
            _list = []
            for f in feat:
                _list.append(float(f))
            docMap_4[mac] = _list
    fileinput.close()
    docList_1, docList_2, docList_3, docList_4, classList = [], [], [], [], []
    # print len(user.keys()), len(docMap_1.keys()), len(docMap_2.keys()), len(docMap_3.keys()), len(docMap_4.keys())
    for k, v in user.items():
        if k in docMap_1 and k in docMap_2 and k in docMap_3 and k in docMap_4 and k in classMap:
            docList_1.append(docMap_1[k])
            docList_2.append(docMap_2[k])
            docList_3.append(docMap_3[k])
            docList_4.append(docMap_4[k])
            classList.append(classMap[k])
    docList_1, docList_2, docList_3, docList_4, classList = np.array(
        docList_1), np.array(docList_2), np.array(docList_3), np.array(
            docList_4), np.array(classList)
    min_max_scaler = preprocessing.MinMaxScaler()
    docList_1, docList_2, docList_3 = min_max_scaler.fit_transform(
        docList_1), min_max_scaler.fit_transform(
            docList_2), min_max_scaler.fit_transform(docList_3)
    cnt, errorCount = 0, 0
    loo = LeaveOneOut()  # current scikit-learn API: split() yields (train, test) index pairs
    trainingdoc, trainingclass = [], []
    # file = open("../../data/prediction/result","w")
    for train, test in loo.split(docList_1):
        cnt += 1
        print(cnt)
        trainingdoc_1, testingdoc_1 = docList_1[train], docList_1[test]
        trainingdoc_2, testingdoc_2 = docList_2[train], docList_2[test]
        trainingdoc_3, testingdoc_3 = docList_3[train], docList_3[test]
        trainingdoc_4, testingdoc_4 = docList_4[train], docList_4[test]
        trainingclass, testingclass = classList[train], classList[test]
        def make_clf():
            # shared pipeline: an L2 logistic regression drives the feature
            # selection step (wrapped in SelectFromModel so it acts as a
            # transformer), then a linear SVM with probability estimates
            # performs the classification
            from sklearn.feature_selection import SelectFromModel
            return pipeline.Pipeline([
                ('feature_selection',
                 SelectFromModel(
                     linear_model.LogisticRegression(penalty='l2',
                                                     dual=False,
                                                     tol=0.0001,
                                                     C=1.0,
                                                     fit_intercept=True,
                                                     intercept_scaling=1,
                                                     class_weight='balanced'))),
                ('classification',
                 svm.SVC(kernel='linear', class_weight='balanced',
                         probability=True))
            ])

        clf_1, clf_2, clf_3 = make_clf(), make_clf(), make_clf()
        gnb = MultinomialNB()
        clf_1.fit(trainingdoc_1, trainingclass)
        clf_2.fit(trainingdoc_2, trainingclass)
        clf_3.fit(trainingdoc_3, trainingclass)
        gnb.fit(trainingdoc_4, trainingclass)
        docList_final = []
        for one in train:
            # each sample must be passed as a 2-D, single-row batch
            res_1 = clf_1.predict_proba(docList_1[one].reshape(1, -1))[0]
            res_2 = clf_2.predict_proba(docList_2[one].reshape(1, -1))[0]
            res_3 = clf_3.predict_proba(docList_3[one].reshape(1, -1))[0]
            res_4 = gnb.predict_proba(docList_4[one].reshape(1, -1))[0]
            _list = [
                res_1[0], res_1[1], res_2[0], res_2[1], res_3[0], res_3[1],
                res_4[0], res_4[1]
            ]
            docList_final.append(_list)
        res_1 = clf_1.predict_proba(testingdoc_1)[0]
        res_2 = clf_2.predict_proba(testingdoc_2)[0]
        res_3 = clf_3.predict_proba(testingdoc_3)[0]
        res_4 = gnb.predict_proba(testingdoc_4)[0]
        testing_final = [
            res_1[0], res_1[1], res_2[0], res_2[1], res_3[0], res_3[1],
            res_4[0], res_4[1]
        ]
        print(testing_final)