def extract_features_test(self):
    """Build the per-id feature dataset and pickle it to disk.

    Uses 'load_data' from 'Data.Read_Data' to stream the input file one
    row (id) at a time, so arbitrarily large data files can be handled.
    For each dataset in ``self.datasets`` the output file
    ``<save_path>/<dataset>_all_last.dat`` receives one pickled header
    list followed by one pickled data-point list per input row.

    Side effects:
        - Overwrites any existing output file.
        - Stores the generated header on ``self.header``.
    """
    p0 = time.time()
    # Calculate features one id at a time.
    for dataset in self.datasets:
        load_file_path = Data_Path + '/' + dataset + '.dat'
        save_file_path = self.save_path + '/' + dataset + '_all_last.dat'
        # Delete any pre-existing output.  (Opening with 'wb' truncates
        # anyway; the explicit removal is kept for clarity/logging.)
        print('save path: ' + save_file_path)
        if os.path.exists(save_file_path):
            os.remove(save_file_path)
        with open(save_file_path, 'wb') as save_file:
            # Header: 'id' plus one column per (variable, feature) pair.
            header = ['id']
            for feature in self.features:
                header += [variable + '_' + feature for variable in self.variables]
            self.header = header
            pickle.dump(header, save_file)
            # Compute and persist one data point per input row.
            # NOTE: loop variable renamed from 'id' to avoid shadowing
            # the builtin.
            for row_id, data in load_data(filename=load_file_path, max_row=-1):
                # Keep only the configured variables, after scaling.
                data = np.array(scale_df(data)[self.variables])
                data_point = [row_id]
                for feature in self.features:
                    # Feature implementations are methods looked up by
                    # name via the FEATURES mapping.
                    feature_func = getattr(self, self.FEATURES[feature])
                    data_point += list(feature_func(xs=data))
                pickle.dump(data_point, save_file)
                # Progress report every 5000 rows.
                if (row_id % 5000) == 0:
                    print('Dataset: ' + dataset + ', Line: ' + str(row_id)
                          + ' out of ' + str(self.n_lines[dataset])
                          + ', Time: ' + str(int(time.time() - p0)) + 's')
def scale_norm_and_pca_test(self):
    """Scale, normalize and PCA-transform each row, pickling both forms.

    Uses 'load_data' from 'Data.Read_Data' to stream the input file one
    row (id) at a time, so arbitrarily large data files can be handled.
    For each dataset in ``self.datasets`` two output files are written:

    - ``<save_path>/<dataset>_pca.dat``:    pickled ``(id, pca_data)``
    - ``<save_path>/<dataset>_no_pca.dat``: pickled ``(id, normalized_data)``

    Side effects:
        - Overwrites any existing output files.
    """
    p0 = time.time()
    # Calculate features one id at a time.
    for dataset in self.datasets:
        load_file_path = Data_Path + '/' + dataset + '.dat'
        save_file_path = self.save_path + '/' + dataset + '_pca.dat'
        save_file_path_no_pca = self.save_path + '/' + dataset + '_no_pca.dat'
        # Delete any pre-existing outputs.  (Opening with 'wb' truncates
        # anyway; the explicit removal is kept for clarity/logging.)
        for path in (save_file_path, save_file_path_no_pca):
            print('save path: ' + path)
            if os.path.exists(path):
                os.remove(path)
        with open(save_file_path, 'wb') as save_file, \
                open(save_file_path_no_pca, 'wb') as save_file_no_pca:
            # NOTE: loop variable renamed from 'id' to avoid shadowing
            # the builtin.
            for row_id, data in load_data(filename=load_file_path, max_row=-1):
                # Pipeline: scale -> normalize -> PCA.
                scaled_data = scale_df(data)
                normalized_data = self._normalize(scaled_data)
                pca_data = self._pca(normalized_data)
                pickle.dump((row_id, normalized_data), save_file_no_pca)
                pickle.dump((row_id, pca_data), save_file)
                # Progress report every 5000 rows.
                if (row_id % 5000) == 0:
                    print('Dataset: ' + dataset + ', Line: ' + str(row_id)
                          + ' out of ' + str(self.n_lines[dataset])
                          + ', Time: ' + str(int(time.time() - p0)) + 's')
if __name__ == '__main__':
    # Smoke test: read a single row, scale it, and inspect the raw frame.
    # NOTE(review): 'fn' and the helpers 'load_data'/'scale_df' are expected
    # to be defined elsewhere at module level — confirm before running.
    # The previous 'from time import time' was removed: it was unused by any
    # live statement here and rebinding the global name 'time' to a function
    # would break later 'time.time()' calls in the methods above.
    for row_id, label, data in load_data(filename=fn, max_row=1):
        print('id={id} label={label}'.format(id=row_id, label=label))
        # Exercise the scaling path; the result itself is not inspected
        # in this smoke test.
        test = scale_df(data)
        print(type(data))
        print(data.columns)