def make_preprocessing_pandas(self, _df_csv_read_ori, _preprocessing_type, _label):
    """Preprocess a pandas DataFrame with scikit-learn; the label column must not be preprocessed.
    Args:
        _preprocessing_type: list drawn from ['scale', 'minmax_scale', 'robust_scale', 'normalize', 'maxabs_scale']
        _df_csv_read_ori: pandas DataFrame
        _label: label column name(s), left untouched
    Returns:
        Preprocessed DataFrame
    """
    if _preprocessing_type is None or _preprocessing_type == 'null':
        logging.info("No Preprocessing")
        result_df = _df_csv_read_ori
    else:
        logging.info("Preprocessing type : {0}".format(_preprocessing_type))
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        for i, v in _df_csv_read_ori.dtypes.items():
            if v in numerics and i not in _label:
                # _preprocessing_type is expected to be a list, e.g. ['maxabs_scale']
                if 'scale' in _preprocessing_type:
                    _df_csv_read_ori[i] = preprocessing.scale(_df_csv_read_ori[i].fillna(0.0))
                if 'minmax_scale' in _preprocessing_type:
                    _df_csv_read_ori[i] = preprocessing.minmax_scale(_df_csv_read_ori[i].fillna(0.0))
                if 'robust_scale' in _preprocessing_type:
                    _df_csv_read_ori[i] = preprocessing.robust_scale(_df_csv_read_ori[i].fillna(0.0))
                if 'normalize' in _preprocessing_type:
                    _df_csv_read_ori[i] = preprocessing.normalize(_df_csv_read_ori[i].fillna(0.0))
                if 'maxabs_scale' in _preprocessing_type:
                    _df_csv_read_ori[i] = preprocessing.maxabs_scale(_df_csv_read_ori[i].fillna(0.0))
        result_df = _df_csv_read_ori
    return result_df
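# --- Usage sketch (not from the original code) ---------------------------------
# Since the method above never touches `self`, it can be exercised standalone by
# passing None for it. The toy DataFrame and column names are assumptions made
# purely for illustration; `logging` and sklearn's `preprocessing` module are
# assumed to be imported as in the original file.
import logging
import pandas as pd
from sklearn import preprocessing

df = pd.DataFrame({"f1": [1.0, -2.0, 4.0], "f2": [10, 0, 5], "label": [0, 1, 0]})
out = make_preprocessing_pandas(None, df, ['maxabs_scale'], ['label'])
print(out)  # 'f1' and 'f2' now lie in [-1, 1]; 'label' is left untouched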
def display_network(W, n_col=None, n_row=None, transpose=False, padding=1, image_shape=None):
    """visualizing
    :param W:
    :param transpose:
    :return:
    """
    # scale each one to [-1, 1]
    assert W.ndim == 2
    # TODO: add other normalization behaviour
    W = maxabs_scale(W, axis=1)
    n_basis, n_pixel = W.shape
    if image_shape is None:
        image_shape = int(np.sqrt(n_pixel)), int(np.sqrt(n_pixel))
    assert image_shape[0] * image_shape[1] == n_pixel
    if n_col is None and n_row is None:
        n_col = int(np.ceil(np.sqrt(n_basis)))
        n_row = int(np.ceil(float(n_basis) / n_col))
    cell_height = image_shape[0] + 2 * padding
    cell_width = image_shape[1] + 2 * padding
    total_image = np.ones(shape=(n_row * cell_height, n_col * cell_width), dtype=np.float64)
    for idx, (row_idx, col_idx) in enumerate(product(range(n_row), range(n_col))):
        if idx >= n_basis:
            break
        position_to_plot = (slice(row_idx * cell_height + padding,
                                  row_idx * cell_height + padding + image_shape[0]),
                            slice(col_idx * cell_width + padding,
                                  col_idx * cell_width + padding + image_shape[1]))
        cell_this = W[idx].reshape(image_shape)
        if transpose:
            cell_this = cell_this.T
        total_image[position_to_plot] = cell_this
    return total_image
def sparse(self, X):
    """Preprocess features while retaining their sparsity.
    A lot of the data is 0, which probably means it is missing. Experimental use only.
    """
    return preprocessing.maxabs_scale(X)
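# --- Why maxabs_scale here (illustrative, not from the original code) -----------
# maxabs_scale divides each feature by its maximum absolute value, so zero entries
# stay exactly zero and sparse inputs stay sparse, which is the property the
# docstring above relies on. A quick check on a made-up sparse matrix:
import numpy as np
from scipy import sparse as sp
from sklearn.preprocessing import maxabs_scale

X = sp.csr_matrix(np.array([[0.0, 5.0], [2.0, 0.0], [4.0, -10.0]]))
X_scaled = maxabs_scale(X)     # works directly on sparse matrices
print(X_scaled.toarray())      # [[0.   0.5] [0.5  0. ] [1.  -1. ]]
print(X_scaled.nnz == X.nnz)   # True: no zero entry became non-zero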
def query(self, query, k=None, indices=None, sort=True, return_scores=False):
    models = self.retrieval_models
    weights = maxabs_scale(self.weights)  # max 1 does not crash [0,1]
    agg_fn = self.aggregation_fn
    # It's important that all retrieval models return the same number of documents.
    all_scores = [m.query(query, k=k, indices=indices, sort=False, return_scores=True)[1]
                  for m in models]
    if weights is not None:
        all_scores = [weight * scores for weight, scores in zip(all_scores, weights)]
    scores = np.vstack(all_scores)
    if callable(agg_fn):
        aggregated_scores = agg_fn(scores)
    else:
        numpy_fn = getattr(np, agg_fn)
        aggregated_scores = numpy_fn(scores, axis=0)
    # combined = aggregate_dicts(combined, agg_fn=agg_fn, sort=True)
    # only cut off at k if this is the final (sorted) output
    ind = argtopk(aggregated_scores, k) if sort else np.arange(aggregated_scores.shape[0])
    if return_scores:
        return ind, aggregated_scores[ind]
    else:
        return ind
def predict(self, X, do_scale=False):
    if do_scale:
        X = preprocessing.maxabs_scale(X)
    pred = self.active(X)
    if self.__output_01:
        for x in numpy.nditer(pred, op_flags=['readwrite']):
            x[...] = 1. if x[...] >= .5 else 0.
    return pred
def __init__(self, filePath="/home/francisco/voz", ldis=0.15, dthres=60):
    '''
    Constructor
    '''
    self.ldis = ldis  # length discrimination rate; 15% works fine
    self.distThreshold = dthres  # distance threshold
    basepath = filePath
    self.mfccs = {}
    mfccAll = []
    labels = os.listdir(basepath)
    for i, nn in enumerate(labels):
        n = nn.split(".")[0]  # take away the .wav extension
        (rate1, sig1) = wav.read(basepath + '/{}.wav'.format(n))
        sig1 = pp.maxabs_scale(sig1)  # normalize amplitude to +-1
        mfcc_feat1 = mfcc(sig1, rate1)
        self.mfccs[n] = mfcc_feat1
def fit(self, X, y, do_scale=False, batch_size=512):
    if do_scale:
        X = preprocessing.maxabs_scale(X)
    batch_num = (len(X) + batch_size - 1) / batch_size
    last_loss = None
    epoch = 0
    while 1:
        st = time.time()
        epoch += 1
        epoch_cost = 0
        for batch in range(batch_num):
            beg = batch * batch_size
            end = beg + batch_size
            # try to cast label.
            label = numpy.array(y[beg:end])  # .astype(numpy.float32)
            if len(label.shape) == 1:
                label.shape = (label.shape[0], 1)
            epoch_cost += self.train(X[beg:end], label)
        dt = time.time() - st
        loss = epoch_cost / batch_num
        diff_loss = 0
        if last_loss is not None:
            diff_loss = last_loss - loss
        print >> sys.stderr, 'Epoch[%d] loss : %f (diff_loss:%f, diff_time:%.3f(s))' % (
            epoch, loss, diff_loss, dt)
        if last_loss is not None:
            if last_loss - loss < 1e-5:
                print >> sys.stderr, 'Early stop'
                break
        '''
        if self.__learning_rate >= 1e-3 and last_loss - loss < 1e-3:
            self.__learning_rate = self.__learning_rate * 0.5
            print >> sys.stderr, 'Change learning rate : %f (%f)' % (self.__learning_rate, last_loss - loss)
        '''
        last_loss = loss
def extract_features(signal, signal_label, epoch, fs, is_emg=False):
    fr, p = calculate_psd_and_f(signal, fs, epoch)
    epoch = epoch * fs
    # Data from EEG
    if is_emg == False:
        max_fr = 50
        ## Calculate the total power, total power per epoch, and extract the relevant frequencies.
        ## IMPORTANT NOTE: These are not the ACTUAL power values; they are standardized to account
        ## for individual variability, and are thus relative.
        freq = fr[(fr >= 0.5) & (fr <= max_fr)]
        sum_power = p[:, (fr >= 0.5) & (fr <= max_fr)]
        max_power = np.max(sum_power, axis=1)
        min_power = np.min(sum_power, axis=1)
        range_power = max_power - min_power
        std_power = ((sum_power.T - min_power) / range_power).T
        ## Calculate the relative power at the different brain waves:
        delta = np.sum(std_power[:, (freq >= 0.5) & (freq <= 4)], axis=1)
        thetacon = np.sum(std_power[:, (freq >= 4) & (freq <= 12)], axis=1)
        theta1 = np.sum(std_power[:, (freq >= 6) & (freq <= 9)], axis=1)
        theta2 = np.sum(std_power[:, (freq >= 5.5) & (freq <= 8.5)], axis=1)
        theta3 = np.sum(std_power[:, (freq >= 7) & (freq <= 10)], axis=1)
        beta = np.sum(std_power[:, (freq >= 20) & (freq <= 40)], axis=1)
        alpha = np.sum(std_power[:, (freq >= 8) & (freq <= 13)], axis=1)
        sigma = np.sum(std_power[:, (freq >= 11) & (freq <= 15)], axis=1)
        spindle = np.sum(std_power[:, (freq >= 12) & (freq <= 14)], axis=1)
        gamma = np.sum(std_power[:, (freq >= 35) & (freq <= 45)], axis=1)
        temp1 = np.sum(std_power[:, (freq >= 0.5) & (freq <= 20)], axis=1)
        temp2 = np.sum(std_power[:, (freq >= 0.5) & (freq <= 50)], axis=1)
        temp3 = np.sum(std_power[:, (freq >= 0.5) & (freq <= 40)], axis=1)
        temp4 = np.sum(std_power[:, (freq >= 11) & (freq <= 16)], axis=1)
        EEGrel1 = thetacon / delta
        EEGrel2 = temp1 / temp2
        EEGrel3 = temp4 / temp3
        hann = np.hanning(12)
        spindelhan1 = np.convolve(hann, EEGrel3, 'same')
        spindelhan = np.transpose(spindelhan1)
        ## Calculate the 90% spectral edge:
        spectral90 = 0.9 * (np.sum(sum_power, axis=1))
        s_edge = np.cumsum(sum_power, axis=1)
        l = [[n for n, j in enumerate(s_edge[row_ind, :]) if j >= spectral90[row_ind]][0]
             for row_ind in range(s_edge.shape[0])]
        spectral_edge = np.take(fr, l)  # spectral edge 90%, the frequency below which power sums to 90% of the total power
        ## Calculate the 50% spectral mean:
        spectral50 = 0.5 * (np.sum(sum_power, axis=1))
        s_mean = np.cumsum(sum_power, axis=1)
        l = [[n for n, j in enumerate(s_mean[row_ind, :]) if j >= spectral50[row_ind]][0]
             for row_ind in range(s_mean.shape[0])]
        spectral_mean50 = np.take(fr, l)
    else:
        # for EMG
        max_fr = 100
        ## Calculate the total power, total power per epoch, and extract the relevant frequencies:
        freq = fr[(fr >= 0.5) & (fr <= max_fr)]
        sum_power = p[:, (fr >= 0.5) & (fr <= max_fr)]
        max_power = np.max(sum_power, axis=1)
        min_power = np.min(sum_power, axis=1)
        range_power = max_power - min_power
        std_power = ((sum_power.T - min_power) / range_power).T
    ## Calculate the root mean square of the signal
    signal = signal[0:p.shape[0] * epoch]
    s = np.reshape(signal, (p.shape[0], epoch))
    rms = np.sqrt(np.mean((s) ** 2, axis=1))  # root mean square
    ## Calculate amplitude and spectral variation:
    amplitude = np.mean(np.abs(s), axis=1)
    amplitude_m = np.median(np.abs(s), axis=1)
    signal_var = (np.sum((np.abs(s).T - np.mean(np.abs(s), axis=1)).T ** 2, axis=1) / (len(s[0, :]) - 1))  # the variation
    ## Calculate skewness and kurtosis
    m3 = np.mean((s - np.mean(s)) ** 3, axis=1)  # 3rd moment
    m2 = np.mean((s - np.mean(s)) ** 2, axis=1)  # 2nd moment
    m4 = np.mean((s - np.mean(s)) ** 4, axis=1)  # 4th moment
    skew = m3 / (m2 ** (3 / 2))  # skewness of the signal, which is a measure of symmetry
    kurt = m4 / (m2 ** 2)  # kurtosis of the signal, which is a measure of tail magnitude
    ## Calculate more time features
    signalzero = preprocessing.maxabs_scale(s, axis=1)
    zerocross = (np.diff(np.sign(signalzero)) != 0).sum(axis=1)
    maxs = np.amax(s, axis=1)
    mins = np.amin(s, axis=1)
    peaktopeak = maxs - mins
    arv1 = ((np.abs(s)))
    arv = np.sum(arv1, axis=1)
    arv = arv / len(s)
    # Energy and amplitude
    deltacomp = butter_bandpass_filter(s, 0.5, 4, fs, 5)  # calculate energy like this
    deltaenergy = sum([x * 2 for x in np.matrix.transpose(deltacomp)])
    deltaamp = np.mean(np.abs(deltacomp), axis=1)
    thetacomp = butter_bandpass_filter(s, 4, 12, fs, 5)
    thetaenergy = sum([x * 2 for x in np.matrix.transpose(thetacomp)])
    thetaamp = np.mean(np.abs(thetacomp), axis=1)
    theta1comp = butter_bandpass_filter(s, 6, 9, fs, 5)
    theta1energy = sum([x * 2 for x in np.matrix.transpose(theta1comp)])
    theta1amp = np.mean(np.abs(theta1comp), axis=1)
    theta2comp = butter_bandpass_filter(s, 5.5, 8.5, fs, 5)
    theta2energy = sum([x * 2 for x in np.matrix.transpose(theta2comp)])
    theta2amp = np.mean(np.abs(theta2comp), axis=1)
    theta3comp = butter_bandpass_filter(s, 7, 10, fs, 5)
    theta3energy = sum([x * 2 for x in np.matrix.transpose(theta3comp)])
    theta3amp = np.mean(np.abs(theta3comp), axis=1)
    betacomp = butter_bandpass_filter(s, 20, 40, fs, 5)
    betaenergy = sum([x * 2 for x in np.matrix.transpose(betacomp)])
    betaamp = np.mean(np.abs(betacomp), axis=1)
    alfacomp = butter_bandpass_filter(s, 8, 13, fs, 5)
    alfaenergy = sum([x * 2 for x in np.matrix.transpose(alfacomp)])
    alfaamp = np.mean(np.abs(alfacomp), axis=1)
    sigmacomp = butter_bandpass_filter(s, 11, 15, fs, 5)
    sigmaenergy = sum([x * 2 for x in np.matrix.transpose(sigmacomp)])
    sigmaamp = np.mean(np.abs(sigmacomp), axis=1)
    spindlecomp = butter_bandpass_filter(s, 12, 14, fs, 5)
    spindleenergy = sum([x * 2 for x in np.matrix.transpose(spindlecomp)])
    spindleamp = np.mean(np.abs(spindlecomp), axis=1)
    gammacomp = butter_bandpass_filter(s, 35, 45, fs, 5)
    gammaenergy = sum([x * 2 for x in np.matrix.transpose(gammacomp)])
    gammaamp = np.mean(np.abs(gammacomp), axis=1)
    ## Calculate the spectral mean and the spectral entropy (essentially the spectral power distribution):
    spectral_mean = np.mean(std_power, axis=1)
    spectral_entropy = -(np.sum((std_power + 0.01) * np.log(std_power + 0.01), axis=1)) / (np.log(len(std_power[0, :])))
    ## Create a matrix of all of the features per each epoch of the signal
    corr_signal = signal[:len(signal) - (len(signal) % epoch)]
    epochs = np.arange(len(corr_signal) / epoch) + 1
    if is_emg == False:
        feature_matrix = np.column_stack((epochs, delta, deltaenergy, deltaamp, thetacon, thetaenergy, thetaamp,
                                          theta1, theta1energy, theta1amp, theta2, theta2energy, theta2amp,
                                          theta3, theta3energy, theta3amp, beta, betaenergy, betaamp,
                                          alpha, alfaenergy, alfaamp, sigma, sigmaenergy, sigmaamp,
                                          spindle, spindleenergy, spindleamp, gamma, gammaenergy, gammaamp,
                                          EEGrel1, EEGrel2, spindelhan, spectral_edge, spectral_mean50,
                                          zerocross, maxs, peaktopeak, arv, rms, amplitude, amplitude_m,
                                          signal_var, skew, kurt, spectral_mean, spectral_entropy))
        features = (['epochs', 'delta', 'deltaenergy', 'deltaamp', 'thetacon', 'thetaenergy', 'thetaamp',
                     'theta1', 'theta1energy', 'theta1amp', 'theta2', 'theta2energy', 'theta2amp',
                     'theta3', 'theta3energy', 'theta3amp', 'beta', 'betaenergy', 'betaamp',
                     'alpha', 'alfaenergy', 'alfaamp', 'sigma', 'sigmaenergy', 'sigmaamp',
                     'spindle', 'spindlenergy', 'spindleamp', 'gamma', 'gammaenergy', 'gammaamp',
                     'EEGrel1', 'EEGrel2', 'spindelhan', 'spectral_edge', 'spectral_mean50',
                     'zerocross', 'maxs', 'peaktopeak', 'arv', 'rms', 'amplitude', 'amplitude_m',
                     'signal_var', 'skew', 'kurt', 'spectral_mean', 'spectral_entropy'])
    else:
        feature_matrix = np.column_stack((epochs, amplitude, signal_var, skew, kurt, rms,
                                          spectral_mean, spectral_entropy, amplitude_m))
        features = (['epochs', 'amplitude', 'signal_var', 'skew', 'kurt', 'rms',
                     'spectral_mean', 'spectral_entropy', 'amplitude_m'])
    feature_labels = []
    for i in range(len(features)):
        feature_labels.append('%s_%s' % (signal_label, features[i]))
    return feature_matrix, feature_labels
from sklearn.preprocessing import scale
import sklearn.preprocessing as pp
from sklearn.cluster import KMeans
import cv2
import path
import os

basepath = "/home/francisco/voz"
kMns = KMeans(n_clusters=25)
mfccs = {}
mfccAll = []
labels = os.listdir(basepath)
for i, nn in enumerate(labels):
    n = nn.split(".")[0]
    (rate1, sig1) = wav.read(basepath + '/{}.wav'.format(n))
    sig1 = pp.maxabs_scale(sig1)
    mfcc_feat1 = mfcc(sig1, rate1)
    # mfcc_feat1 = scale(mfcc_feat1)  # standardize?
    # for f in mfcc_feat1:
    #     mfccAll.append(f)
    mfccs[n] = mfcc_feat1
# mfccAll = np.array(mfccAll)
# print "mfccAll", mfccAll.shape
# mfccAll = scale(mfccAll)  # whitening / standardizing
# kMns.fit(mfccAll)
# print kMns.predict(mfccs["paco_no_001"])
# print kMns.predict(mfccs["paco_uno_001"])
os.system("sox -r 16000 -t alsa default recording.wav silence 1 0.1 1% 1 1.5 1%")
(rate2, sig2) = wav.read("recording.wav")
# sig2 = pp.maxabs_scale(sig2)
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn import preprocessing
import numpy as np

n = 2  # number of training samples
m = 5  # sample dimension
X = np.random.randn(n, m) * 10  # generate random independent variables
# w = np.array([1, 2, 3, 4, 5, 0.01, 0.1, 0.05, 0.0001, 6])
# # w = np.arange(1, 11, 1).T  # set the linear coefficients
# Y = np.dot(X, w) + np.random.randn(n) / 10  # weight by the linear coefficients and add noise
# Y[Y > 0] = 1
# Y[Y <= 0] = 0
# X1 = preprocessing.normalize(X, norm='l2')
X2 = preprocessing.maxabs_scale(X, axis=1)
# print(X, '\n', X2)
X3 = preprocessing.normalize(X, axis=0)
print(X, '\n', X3)

# n = 200  # number of training samples
# m = 10  # sample dimension
# X = np.random.randn(n, m) * 10  # generate random independent variables
# w = np.array([1, 2, 3, 4, 5, 0.01, 0.1, 0.05, 0.0001, 6])
# # w = np.arange(1, 11, 1).T  # set the linear coefficients
# Y = np.dot(X, w) + np.random.randn(n) / 10  # weight by the linear coefficients and add noise
# Y[Y > 0] = 1
# Y[Y <= 0] = 0
# Y1 = np.dot(X, w) + np.random.randn(n) / 10
# model = linear_model.LinearRegression()
        pass  # return something to terminate this function
    else:
        return df[st_index:st_index + win_size]


# ================================= Main Function ==========================
if __name__ == '__main__':
    file_path = r'D:\Project files\input_ml.csv'
    df = pd.read_csv(file_path, index_col='Date')
    df = df.fillna(0)  # replace NaN values with 0
    df['Signal'] = df['Signal'].astype(int)  # convert to int
    # test generating an ML model with Python
    y = df['Signal']
    X = df.drop('Signal', axis=1)
    X = preprocessing.maxabs_scale(X)  # scale each feature to the [-1, 1] range
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, shuffle=False)
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)  # test the model on held-out data
    # set role and normalize
    # pipeline = make_pipeline(preprocessing.Normalizer(),
    #                          LogisticRegression())
    # df = df.set_index(pd.to_datetime(df['Date'], format='%d-%m-%Y'))
def read(self, filename):
    self.__X = []
    self.__Y = []
    self.__info = []
    first_row = True
    fd = file(filename)
    progress = pydev.FileProgress(fd, filename)
    raw_X = []
    for row in pydev.foreach_row(fd, seperator=self.__seperator):
        progress.check_progress()
        # whether to ignore the first row.
        if first_row and self.__ignore_first_row:
            first_row = False
            continue
        # check column count.
        if self.__expect_column_count < 0:
            self.__expect_column_count = len(row)
            if self.__target_column < 0:
                self.__target_column = self.__expect_column_count - 1
            print >> sys.stderr, 'columns set to %d, target:%d' % (self.__expect_column_count, self.__target_column)
        elif len(row) != self.__expect_column_count:
            continue
        # strip each column.
        row = map(lambda x: x.strip(), row)
        # get x dict.
        id_value = []
        v_size = 0
        ignored_info = []
        for rid, value in enumerate(row):
            # continue if target column.
            if rid == self.__target_column:
                continue
            # continue if filtered column.
            if rid in self.__ignore_columns:
                ignored_info.append(value)
                continue
            # dense and id-value-sparse
            if self.__row_mode == DataReader.DenseValue:
                cid = rid
            elif self.__row_mode == DataReader.IVSparse:
                cid, value = value.split(':')
                cid = int(cid)
            if cid in self.__concrete_ids:
                # one-hot representation for key.
                # feature = id-value : 1
                fid, value = self.__feature_trans.allocate_id('#%03d:%s' % (cid, value)), 1
            else:
                # feature = id : value
                fid, value = self.__feature_trans.allocate_id('#%03d' % (cid)), float(value)
            id_value.append((fid, value))
            if v_size < fid + 1:
                v_size = fid + 1
        x = numpy.ndarray(shape=(v_size,))
        x.fill(0)
        for fid, value in id_value:
            x[fid] = float(value)
        raw_X.append(x)
        # get Y
        if self.__concrete_target:
            row[self.__target_column] = self.__target_trans.allocate_id(row[self.__target_column])
        y = row[self.__target_column]
        self.__Y.append(y)
        self.__info.append(self.__seperator.join(ignored_info))
    progress.end_progress()
    # resize each X.
    x_size = self.__feature_trans.size()
    for x in raw_X:
        new_x = numpy.ndarray(shape=(x_size,), dtype=numpy.float32)
        new_x.fill(0)
        new_x[:x.shape[0]] = x
        self.__X.append(new_x)
    # resize Y if concrete label.
    if self.__target_one_hot:
        raw_Y = self.__Y
        self.__Y = []
        y_size = self.__target_trans.size()
        for y in raw_Y:
            new_y = numpy.ndarray(shape=(y_size,), dtype=numpy.float32)
            new_y.fill(0)
            new_y[int(y)] = 1.
            self.__Y.append(new_y)
    # transform X to numpy.ndarray
    self.__X = numpy.array(self.__X)
    # preprocessing.
    if self.__maxabs_scale:
        print >> sys.stderr, 'Do maxabs_scale'
        self.__X = preprocessing.maxabs_scale(self.__X)
    # make Y an ndarray
    self.__Y = numpy.array(self.__Y).astype(numpy.float32)
    # self.__feature_trans.debug()
    # self.__target_trans.debug()
    print >> sys.stderr, 'Data load [ %d(records) x %d(features) ]' % (len(self.__X), len(self.__X[0]))
def normalize_features(features):
    from sklearn.preprocessing import maxabs_scale
    return maxabs_scale(features).reshape(1, -1)[0]
def recognize(self, fileName):
    (rate2, sig2) = wav.read(fileName)
    sig2 = pp.maxabs_scale(sig2)  # normalize amplitude to +-1
    mfcc_feat2 = mfcc(sig2, rate2)
    return self.get1NN(mfcc_feat2)
df = pd.DataFrame(data=temp, columns=['Time', 'Data'])  # data frame (using pandas)

########## preprocess data
print "Preprocessing data..."
if value_type in {"accelerometer", "magnetometer"}:
    df2 = df[df['Data'] != 0]
elif value_type == "temperature":
    df2 = df[df['Data'] < 70]
elif value_type == "humidity":
    c = Counter(df.Data)
    value_to_remove = c.most_common(1)[0][0]  # value that appears most frequently
    df2 = df[df['Data'] != value_to_remove]
    maxabs_scale(df2.Data, copy=False)
else:
    print "invalid value type"
print "Data Preprocessed"

########## initialize for featurizing
time_window_start = time_reference
feature_list = []
# number of data points
n = df2.shape[0]
# counter, to determine how many data points from the file have been processed (so that we know if we should stop or not)
count = 0

######### create feature for each time window
def generateDataset(target, restrictions, input_labels, labels_to_categorize, table, testset_size, label):
    global X_train, X_test, y_train, y_test, input_vectors, input_pieces, targets, scalename, currentName, \
        currentPrediction, currentSQL, currentAUC, currentSensitivity, currentSpecificity, currentDatasetSize, \
        currentTrainingsetSize, currentTestingsetSize, currentScaling, currentCrossvalidation, lastid, \
        currentAUCData, pickle
    currentPrediction = label
    # This part is somewhat research-specific and needs to be altered as different models are trained.
    # Construct the SQL query:
    SQL = 'SELECT ' + target[0] + ', ' + ",".join(map(str, input_labels)) + ' FROM ' + table + \
        ' WHERE ' + target[1] + ' IS NOT NULL AND ' + \
        ' IS NOT NULL AND '.join(map(str, input_labels)) + " IS NOT NULL AND " + restrictions
    # Set the SQL string as the global current query and fetch the data
    currentSQL = SQL
    cur.execute(SQL)
    dataset = cur.fetchall()
    # Create a dictionary for the categorical arrays:
    categorical_input_arrays = {}
    for label in categorical_inputs:
        categorical_input_arrays[label] = []
    # Use the first item in each SQL row as the target; append the rest either to an array
    # to be processed or to be used directly
    for i in dataset:
        # add the first column to targets
        targets.append(i[0])
        # add categorical columns to the dictionary
        for j in xrange(0, len(categorical_inputs)):
            categorical_input_arrays[categorical_inputs[j]].append(i[j + 1])
        # add the rest, i.e. numericals and booleans, to input_vectors
        input_vectors.append(i[len(categorical_inputs) + 1:])
    # Construct label arrays for human-readable decision trees:
    input_pieces.extend(input_labels[len(categorical_inputs):])
    # Loop through the categoricals, get distinct labels from SQL, fit and use the label encoder,
    # and add the result to input_vectors
    for label in list(categorical_input_arrays):
        cur.execute('SELECT DISTINCT ' + label + ' FROM ' + table + ' ')
        label_set = []
        for l in cur.fetchall():
            label_set.append(l[0])
        # fit the label encoder, construct input_pieces, append the encoded vector to input vectors:
        le.fit(label_set)
        for cl in le.classes_:
            input_pieces.extend([label + ': ' + cl])
        to_append = le.transform(categorical_input_arrays[label])
        for k in range(0, len(to_append)):
            input_vectors[k] = np.hstack((np.array(input_vectors[k]), np.array(to_append[k])))
    # Construct the main sets: training features and targets, testing features and targets.
    input_vectors = np.array(input_vectors)
    # Scale the input vectors:
    if scalename == 'scale':
        input_vectors = preprocessing.scale(input_vectors)
    elif scalename == 'maxabs':
        input_vectors = preprocessing.maxabs_scale(input_vectors)
    targets = np.array(targets)
    X_train, X_test, y_train, y_test = train_test_split(input_vectors, targets,
                                                        test_size=testset_size,
                                                        random_state=42,
                                                        stratify=targets)
    dataset_json = {}
    dataset_json['training_input'] = X_train.tolist()
    dataset_json['training_output'] = y_train.tolist()
    dataset_json['testing_input'] = X_test.tolist()
    dataset_json['testing_output'] = y_test.tolist()
    json_dump = json.dumps(dataset_json)
    json_hash = hashlib.sha224(json_dump).hexdigest()
    # check whether an identical dataset is already stored:
    cur.execute("SELECT id FROM datasets WHERE hash like '" + json_hash + "'")
    res = cur.fetchall()
    store_dataset = False
    try:
        dataset_id = res[0][0]
    except:
        store_dataset = True
        # if there is no identical dataset, select the max id and add one to it:
        q = 'SELECT max(id) FROM datasets'
        cur.execute(q)
        res2 = cur.fetchall()
        try:
            dataset_id = res2[0][0] + 1
        except:
            dataset_id = 1
    if (store_dataset):
        q = 'INSERT INTO datasets (id, hash, dataset) VALUES (%s, %s, %s)'
        cur.execute(q, (dataset_id, json_hash, json_dump))
        conn.commit()
    if (printing):
        f.printDatasetInfo(label, len(dataset), 0.33, len(y_train), len(y_test), scalename, SQL)
    # update the current dataset sizes
    currentTrainingsetSize, currentTestingsetSize = len(y_train), len(y_test)
import matplotlib.pylab as plt
import pandas as pd
import numpy as np
# preprocessing helpers
# in R, scale = z-score
from sklearn.preprocessing import scale, robust_scale, minmax_scale, maxabs_scale

print((np.arange(10, dtype=np.float) - 3))
x = (np.arange(10, dtype=np.float) - 3).reshape(-1, 1)  # rows and columns
print(x)
df = pd.DataFrame(np.hstack([x, scale(x), robust_scale(x), minmax_scale(x), maxabs_scale(x)]),
                  columns=['x', 'scale(x)', 'robust_scale(x)', 'minmax_scale(x)', 'maxabs_scale(x)'])
df
# the shape of the distribution is unchanged by scaling

import seaborn as sns
from sklearn.datasets import load_iris

iris = load_iris()
data1 = iris.data
data2 = scale(iris.data)
def forest(self, filename):
    self.logger.info('Starting random forest analysis.')
    df = RandomForest.get_matrix(self)
    self.logger.info('Setting up variables.')
    x = np.array(df.drop([self.variable], axis=1))
    x = preprocessing.maxabs_scale(x)
    y = np.array(df[self.variable])
    c = Counter(y)
    c = dict(c)
    sample_count = 0
    for _, value in c.items():
        sample_count += value
    print(f"There appear to be a total of {sample_count} samples.")
    for category, value in c.items():
        print(" %s samples labeled at %s, or %2.1f%s of the total" % (value, category, value / sample_count * 100, "%"))
    # Set up the different classifiers. A decision tree and a 'forest' of trees are obvious choices.
    # Two types of boosting are also included.
    dt = DecisionTreeClassifier()
    rf = RandomForestClassifier(n_estimators=100, max_features="auto", random_state=33)
    ab = AdaBoostClassifier(n_estimators=100, random_state=33)
    gb = GradientBoostingClassifier(n_estimators=100, random_state=33)
    # We will also test the two main types of cross-validation.
    sf = StratifiedKFold(n_splits=10, random_state=33, shuffle=True)
    tts = train_test_split(x, y, test_size=.25, shuffle=True)
    # Decision tree: train/test split, then the combined results of the sample folds
    self.logger.info("Running Decision Tree Analysis.")
    x_train, x_test, y_train, y_test = tts
    dt.fit(x_train, y_train)
    y_predict_tts = dt.predict(x_test)
    print("Decision Tree:\n\tTrain/Test Split Accuracy: %2.1f%s" % (accuracy_score(y_test, y_predict_tts) * 100, "%"))
    i = 0
    score = 0
    for train, test in sf.split(x, y):
        dt.fit(x[train], y[train])
        y_predict = dt.predict(x[test])
        score += accuracy_score(y[test], y_predict)
        i += 1
    print("\tSample Folds Accuracy: %2.1f%s" % (score / i * 100, "%"))
    self.logger.info("Running Random Forest Analysis")
    # Random forest: train/test split, then cross-validated scores.
    # We don't need to generate the train/test data again.
    rf.fit(x_train, y_train)
    y_predict_tts = rf.predict(x_test)
    print("Random Forest:\n\tTrain/Test Split Accuracy: %2.1f%s" % (accuracy_score(y_test, y_predict_tts) * 100, "%"))
    scores = cross_val_score(rf, x, y, cv=10)
    print("\t10X (Stratified)KFold Accuracy: %0.2f%s (+/- %0.2f)" % (scores.mean() * 100, "%", scores.std() * 200))
    # AdaBoost: train/test split, then the combined results of the sample folds
    self.logger.info("Running AdaBoost Analysis")
    ab.fit(x_train, y_train)
    y_predict_tts = ab.predict(x_test)
    print("AdaBoost:\n\tTrain/Test Split Accuracy: %2.1f%s" % (accuracy_score(y_test, y_predict_tts) * 100, "%"))
    i = 0
    score = 0
    for train, test in sf.split(x, y):
        ab.fit(x[train], y[train])
        y_predict = ab.predict(x[test])
        score += accuracy_score(y[test], y_predict)
        i += 1
    print("\tSample Folds Accuracy: %2.1f%s" % (score / i * 100, "%"))
    # Gradient boosting: train/test split, then the combined results of the sample folds
    self.logger.info("Running Gradient Boost Analysis")
    gb.fit(x_train, y_train)  # the original refit `ab` here; the gradient-boosting model is used instead
    y_predict_tts = gb.predict(x_test)
    print("Gradient Boost:\n\tTrain/Test Split Accuracy: %2.1f%s" % (accuracy_score(y_test, y_predict_tts) * 100, "%"))
    i = 0
    score = 0
    for train, test in sf.split(x, y):
        gb.fit(x[train], y[train])
        y_predict = gb.predict(x[test])
        score += accuracy_score(y[test], y_predict)
        i += 1
    print("\tSample Folds Accuracy: %2.1f%s" % (score / i * 100, "%\n"))
    # Output of feature importances from the RF classifier (vanilla RF only).
    # Screen output is limited to the top 100 features.
    importance = rf.feature_importances_
    indices = np.argsort(importance)[::-1]
    features = df.drop([self.variable], axis=1).columns[indices]
    standard_deviations = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)
    print("%s informative features out of %s total..." % (np.count_nonzero(importance), importance.shape[0]))
    df_complete_list = pd.DataFrame(zip(features, importance[indices], standard_deviations[indices]),
                                    columns=('features', 'importance', 'std'))
    output_file = self.outdir + '/' + filename
    df_complete_list.to_csv(path_or_buf=output_file, sep=',')
    if np.count_nonzero(importance) > 100:
        features_to_print = 100
        print("Showing the top 100! Full list can be found in csv file.")
    else:
        features_to_print = np.count_nonzero(importance)
        print("Showing them all. Also writing to csv file.")
    for f in range(features_to_print):
        print("%d. feature %s (%f)" % (f + 1, features[f], importance[indices[f]]))
    plt.figure()
    plt.title(f"Random Forest Feature Importance: {self.variable}", fontsize=12)
    plt.bar(range(features_to_print), importance[indices][:features_to_print],
            color="r", yerr=standard_deviations[indices][:features_to_print])
    plt.xticks(range(features_to_print), features, rotation=90, fontsize=3)
    plt.xlim([-1, features_to_print])
    # plt.show()
    plt.savefig(self.outdir + "/" + self.variable + '_rfFeatures.pdf', format='pdf', dpi=150)
def maxabsScale(data):
    return maxabs_scale(data)
from pyriemann.estimation import Covariances
from mne import Epochs, pick_types, find_events
from mne.channels import read_layout
from mne.io import concatenate_raws, read_raw_edf
from mne.datasets import eegbci
from mne.decoding import CSP
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale, robust_scale, minmax_scale, maxabs_scale
import pickle

with open('C:/Users/dk/PycharmProjects/giga_cnn/convert/smt100_ica_2.pkl', 'rb') as f:
    x_data = pickle.load(f)

scaler = StandardScaler()
i = 0
raw_data_train = np.zeros_like(x_data)
for i in range(x_data.shape[0]):
    raw_fit = maxabs_scale(x_data[i, :, :])
    raw_data_train[i, :, :] = raw_fit[:, :]
    print(i)

with open('C:/Users/dk/PycharmProjects/giga_cnn/convert/smt100_ica_maxabs_2.pkl', 'wb') as f:
    pickle.dump(raw_data_train, f)
os.makedirs(training_dataset)
data = pd.read_csv('bd2019-weather-prediction-training-20190608.csv')
print(data.isnull().sum())
data.fillna("00,", inplace=True)
data['date'] = pd.to_datetime(data['date'])
data = data[~data['rain20'].isin([999990])]
data = data[~data['rain08'].isin([999990])]
data = data[~data['wind_speed'].isin([999999])]
data = data[~data['wind_direction'].isin([999999])]
data = data[~data['visibility'].isin([999999])]
data = data[~data['temperature'].isin([999999])]
data = data[~data['humidity'].isin([999999])]
data['cloud'] = preprocessing.maxabs_scale(data['cloud'])
data['wind_direction'].replace(999001, 0, inplace=True)
data['wind_direction'].replace(999002, 22.5, inplace=True)
data['wind_direction'].replace(999003, 45, inplace=True)
data['wind_direction'].replace(999004, 67.5, inplace=True)
data['wind_direction'].replace(999005, 90, inplace=True)
data['wind_direction'].replace(999006, 112.5, inplace=True)
data['wind_direction'].replace(999007, 135, inplace=True)
data['wind_direction'].replace(999008, 157.5, inplace=True)
data['wind_direction'].replace(999009, 180, inplace=True)
data['wind_direction'].replace(999010, 202.5, inplace=True)
data['wind_direction'].replace(999011, 225, inplace=True)
data['wind_direction'].replace(999012, 247.5, inplace=True)
data['wind_direction'].replace(999013, 270, inplace=True)
data['wind_direction'].replace(999014, 292.5, inplace=True)
data['wind_direction'].replace(999015, 315, inplace=True)
def run_pca(expression, annotation, powerlaw):
    tissue_data, description_data = read_annotation(annotation)
    # Load expression data
    df = pd.read_table(expression, header=0, index_col=0)
    run_ids = list(df.columns.values)
    dataMatrix = np.transpose(np.array(df))
    # Run PCA
    sklearn_pca = sklearnPCA(n_components=2)
    sklearn_transf = sklearn_pca.fit_transform(preprocessing.maxabs_scale(dataMatrix, axis=0))
    # Tissues and color table
    tissues = [tissue_data[r.replace('.htseq', '')] for r in run_ids]
    colors = {
        'leaf': 'green',
        'root': 'brown',
        'shoot': 'blue',
        'plant': 'black',
        'seed': 'red',
        'flower': 'cyan',
        'stem': 'yellow',
        'seedling': 'white',
        'pollen': 'violet'
    }
    found_tissues = {}
    plt.figure(1)
    with sns.axes_style("whitegrid", {"grid.linestyle": None}):
        plt.subplot(121)
        for run, tissue, pca_data in zip(run_ids, tissues, sklearn_transf):
            label = tissue if tissue in colors.keys() else 'other'
            plt.plot(pca_data[0], pca_data[1], 'o', markersize=7,
                     color=colors[tissue] if tissue in colors.keys() else 'gray',
                     alpha=0.5,
                     label=label if label not in found_tissues.keys() else "_nolegend_")
            found_tissues[label] = True
        plt.xlabel('PC 1 (%0.2f %%)' % (sklearn_pca.explained_variance_ratio_[0] * 100))
        plt.ylabel('PC 2 (%0.2f %%)' % (sklearn_pca.explained_variance_ratio_[1] * 100))
        plt.legend()
        plt.draw()
    with sns.axes_style("whitegrid"):
        plt.subplot(122)
        df = pd.read_table(powerlaw, names=['Node degree', 'Gene count'])
        ax = sns.regplot(x='Node degree', y='Gene count', data=df, fit_reg=False)
        ax.set(xlim=(1, 10000), ylim=(1, 10000), xscale='log', yscale='log')
    plt.show()
# Split the dataset into train and test and **organize** it as necessary to work with our model.

# In[3]:

# digits.data stores a flattened ndarray of size 64 from the 8x8 images.
X, Y = digits.data, digits.target
# Split the dataset into 80% train images and 20% test images
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, shuffle=True)
# normalize the input values by scaling each feature by its maximum absolute value
X_train = preprocessing.maxabs_scale(X_train)
X_test = preprocessing.maxabs_scale(X_test)

# ## Training and Saving the Model
# Let's **train our model** and **save the training model to a file**:

# In[4]:

# Create a classifier: a support vector classifier
model = svm.SVC(gamma=0.001, C=100)
# Learn the digits on the train subset
model.fit(X_train, Y_train)
# Save the model to a file
if __name__ == '__main__':
    '''
    import visdom
    from Data_generator_normalize import data_generate

    generator = data_generate()
    vis = visdom.Visdom(env='yancy_env')
    test_set, label = generator.SQ_data_generator(train=False, examples=20, normalize=True)
    data = np.squeeze(test_set, axis=-1).reshape([-1, test_set.shape[-2]])  # (n, 2048)
    label = label.reshape([-1])  # (n,)
    cls = 3

    # try vis_tSNE
    vis_tSNE(data, label, cls, vis, name='test')
    # try t_sne
    fig = t_sne(data, label, classes=cls, name='ta')
    plt.show()
    '''
    a0 = [[1, 2, 1], [1, 3, 4], [5, 8, -10]]
    a = np.array(a0)
    b = maxabs_scale(a.astype(np.float), axis=1)
    c = my_normalization1(a)
    d = my_normalization2(a)
    e = my_normalization3(a)
    print(b)
    print(c)
    print(d)
    print(e)
def handle_fare(data):
    new_data = data
    new_data["Fare"] = fill_na_with_mean(new_data, "Fare")
    new_data["Fare"] = new_data["Fare"] / 20
    new_data["Fare"] = preprocessing.maxabs_scale(data["Fare"])
    return new_data
modelFeatures.append({
    "active": 1,
    "feature": "f_pos",
    "args": [],
    "kwargs": {
        "table": "models/table"
    }
})

clf = OneVsRestClassifier(svm.LinearSVC(random_state=0))
# svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)

# load data from all active features
examples = loadData(args.dataref)

# print counts of candidate sets, grouped by length
if False:
    counts = getLengthCounts(examples)
    print counts
    sys.exit(0)

# get permuted features if requested
if args.permute:
    totalX, totalY = permuteFeatures(examples, modelFeatures, length=args.permute)
# otherwise get features in a regular format
else:
    totalX, totalY = combineFeatures(examples, modelFeatures, verbose=True)

# scaling data
print "Scaling data..."
totalX = preprocessing.maxabs_scale(totalX)

# run kfold validation
print "----------------------------------"
print "Running kfold validation on all features"
print "----------------------------------"
kfoldValidation(10, np.array(totalX), np.array(totalY), classifier=clf, verbose=True)
    b = binarizer(allValues)
    # print(f"working with classes {b.ref}")
    for i in range(len(allData)):
        # encoded = lb.transform([allValues[i]])
        # normData[i] = np.concatenate((normData[i], encoded[0]))
        encoded = b.transform(allValues[i])
        normData[i] = np.concatenate((normData[i], encoded))
    # for i in range(len(allData)):
    #     normData[i].append()
elif value == "number":
    # Minor bug - if inf is found, we replace it with the max so it norms to 1.
    maxVal = np.where(np.isinf(allValues), -np.Inf, allValues).max()  # the original used .argmax(), which returns an index, not the value
    for i in range(len(allValues)):
        if (allValues[i] == np.float("inf")):
            allValues[i] = maxVal
    newValues = preprocessing.maxabs_scale(allValues)
    for i in range(len(allData)):
        normData[i] = np.concatenate((normData[i], np.array([newValues[i]])))
else:
    raise Exception("what did you just hand me?")
newcollen = len(normData[0])
totalCols += newcollen - oldcollen
print(f"Normalized {key} considered a {value} by adding {newcollen - oldcollen} cols")

print(f"Total cols added per item is {totalCols}")
print(len(normData))
print(normData[0].shape)
# Gotta Tensor-ify everything, if possible.
def maxAbsScale(y, axis=0):
    """normalises array dividing by the max abs value"""
    return maxabs_scale(y, axis=axis)
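# --- Axis behaviour of maxabs_scale (illustrative, not from the original code) --
# sklearn.preprocessing.maxabs_scale scales along columns by default (axis=0);
# passing axis=1 scales each row instead, which several snippets here rely on.
import numpy as np
from sklearn.preprocessing import maxabs_scale

y = np.array([[1.0, -2.0],
              [3.0,  4.0]])
print(maxabs_scale(y, axis=0))  # columns / their max abs -> [[0.33, -0.5], [1., 1.]]
print(maxabs_scale(y, axis=1))  # rows / their max abs    -> [[0.5, -1.], [0.75, 1.]]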
#%% loading data
folder = 'C:/Users/Dan/Documents/MATLAB'
os.chdir(folder)
data = scipy.io.loadmat('S09_fft.mat')
data = data['S09_fft']
data = np.transpose(data.reshape(64 * 40, 80 * 16, order='F'))
# data = scipy.io.loadmat('S09_fft_2D.mat')
# data = data['data']
# dataM = np.transpose(data)
labels = np.tile(np.arange(1, 81), 16)

#%% scaling
from sklearn.preprocessing import maxabs_scale
data = maxabs_scale(data)

#%% SVM
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score, accuracy_score, mean_squared_error
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.svm import SVC
import itertools

start = time.time()
chi_k = 'all'
scores_all = dict()
scores = list()
for i in itertools.combinations(np.arange(1, 81), 2):
    print(i)
def init_disp_conv(cp, dataset):
    # A make_disp function could be used to create the symbolic input variable, but past
    # experiments had issues stacking layers and initializing their parameters in separate
    # functions, so every aspect of each network is initialized in this one function.
    # Retrieve all dispersions from the "test" dataset
    dataset = dataset[:, 3]
    # Get parameters
    featurex = int(cp.get('Dispersion', 'feature_x'))
    featurey = int(cp.get('Dispersion', 'feature_y'))
    channels = int(cp.get('Dispersion', 'channels'))
    pool_size = int(cp.get('D1', 'pool'))
    def_filters = int(cp.get('Default', 'filters'))
    def_filter_size = int(cp.get('Default', 'filter_size'))
    def_stride = int(cp.get('Default', 'stride'))
    def_padding = int(cp.get('Default', 'padding'))
    # Resize each dispersion; the dispersions are treated as images
    dataset = [scipy.misc.imresize(x, (featurex, featurey)) for x in dataset]
    dataset = np.array(dataset)
    # Reshape to 2D
    dataset = dataset.reshape(dataset.shape[0], featurex * featurey)
    # Scale
    dataset = maxabs_scale(dataset)
    # Reshape to a conv-compatible shape
    dataset = dataset.reshape(dataset.shape[0], channels, featurex, featurey)
    input_var = theano.shared(name='input_var',
                              value=np.asarray(dataset, dtype=theano.config.floatX),
                              borrow=True)
    # Layer stacking (Dispersion subnetwork #1)
    # ------------------------------------------------------------------------
    # Input layer
    input_layer = network = lasagne.layers.InputLayer(shape=(None, channels, featurex, featurey),
                                                      input_var=input_var)
    # Introduce padding
    pad = network = lasagne.layers.PadLayer(incoming=network, width=((0, 1), (1, 0)))
    # Convolutional layer #1
    network = lasagne.layers.Conv2DLayer(incoming=network, num_filters=def_filters,
                                         filter_size=(def_filter_size, def_filter_size),
                                         stride=def_stride, pad=def_padding)
    # Convolutional layer #2 (by default)
    network = lasagne.layers.Conv2DLayer(
        incoming=network,
        num_filters=int(cp.get('D1', 'convfilters')),
        filter_size=(int(cp.get('D1', 'filtersize')), int(cp.get('D1', 'filtersize'))),
        stride=int(cp.get('D1', 'stride')),
        pad=int(cp.get('D1', 'pad')))
    # Maxpool
    network = lasagne.layers.MaxPool2DLayer(incoming=network, pool_size=(pool_size, pool_size))
    # End of layer stacking for subnetwork #1
    # ########################################################################
    # Layer stacking (Dispersion subnetwork #2)
    # ------------------------------------------------------------------------
    # Subnetwork #1 up to the padding layer
    network2 = pad
    # Convolutional layer #1 (different params)
    network2 = lasagne.layers.Conv2DLayer(incoming=network2, num_filters=def_filters,
                                          filter_size=(def_filter_size, def_filter_size),
                                          stride=def_stride, pad=def_padding)
    # Convolutional layer #2 (different params)
    network2 = lasagne.layers.Conv2DLayer(
        incoming=network2,
        num_filters=int(cp.get('D2', 'convfilters')),
        filter_size=(int(cp.get('D2', 'filtersize')), int(cp.get('D2', 'filtersize'))),
        stride=int(cp.get('D2', 'stride')),
        pad=int(cp.get('D2', 'pad')))
    # Maxpool
    network2 = lasagne.layers.MaxPool2DLayer(incoming=network2, pool_size=(pool_size, pool_size))
    # Flatten both subnetworks
    network = lasagne.layers.FlattenLayer(network)
    network2 = lasagne.layers.FlattenLayer(network2)
    # Connect the subnetworks
    out = lasagne.layers.ConcatLayer(incomings=(network, network2), axis=1)
    # log('Printing Dispersion Net Structure.......')
    # log(lasagne.layers.get_output_shape(lasagne.layers.get_all_layers(network)))
    return [input_layer, input_var, out]
from oml.models.regularizers import L1, L2Sq
from oml.optimizers.sgd import Fobos
from oml.optimizers.adagrad import AdaGrad, AdaRDA
from oml.optimizers.rda import Rda, AcceleratedRDA
from oml.optimizers.vr import Svrg
from oml.optimizers.freerex import FreeRex
from oml.optimizers.adam import Adam, AdMax
from oml.optimizers.rms_prop import RMSProp
from oml.optimizers.nesterov import AccSGD
from oml.datasouces.iterator import NumpyIterator

from matplotlib import pyplot as plt

data = load_boston()

x = maxabs_scale(data['data'])
t = data['target']

feature = x.shape[1]
target = 1

data = np.hstack((x, t.reshape(-1, 1)))
np.random.shuffle(data)

train_data = data[:data.shape[0] // 2, :]
test_data = data[data.shape[0] // 2:, :]

train_iter = NumpyIterator(train_data, batch_size=10)
test_iter = NumpyIterator(test_data)
groups = test_data.groupby(['Pclass']).size()
groups.plot.bar()

# Models expect the data to be normalized in order to perform better,
# so we will try to normalize each feature.

# In[ ]:

data = train_data.copy()
data["Pclass"] = data["Pclass"] - 1
data["Pclass"] = preprocessing.maxabs_scale(data["Pclass"])
print(data["Pclass"].value_counts())

# In[ ]:

def handle_pclass(data):
    new_data = data
    new_data["Pclass"] = new_data["Pclass"] - 1
    new_data["Pclass"] = preprocessing.maxabs_scale(data["Pclass"])
    return new_data

data = train_data.copy()
data = drop_survived(data)
data = drop_passenger_id(data)
def investigate(FILE, scale_features='maxabs', start=1, end=None, show=False,
                images_dir=cfg.finvestig_images_dir,
                data_dir=cfg.finvestig_data_dir,
                results_dir=cfg.finvestig_results_dir):
    '''This produces a heatmap of the correlations between features.
    The correlation strength is computed using Pearson correlation coefficients.

    :param FILE: Features file. This should be in CSV format, with column 0 being the instance name
        and row zero being the names of the features.
    :param scale_features: There are various ways to scale the features data. The scaling is done
        column-wise (i.e. on each feature individually). default='maxabs'.
        - maxabs = Scale to [-1,1]
        - scale = Zero mean and unit stdev
        - minmax = Translate and scale to [0,1]
        - normalize = Normalize each feature to unit norm
        - robust = Shift outliers in according to interquartile range
    :param start: Starting feature to include in the correlations heatmap. default=1
    :param end: Last feature to include in the correlations heatmap. default=None
    :param show: Whether to show images as they are being produced. default=False
    :param images_dir: Directory to dump images. default='../images'
    :param data_dir: Directory to dump data. default='../data'
    :param results_dir: Directory to dump results. default='../results'
    :return: Heatmap in PDF format. The plot is automatically saved. The filename of every saved
        output automatically includes the input file name used to produce it.
    '''
    ###################################################################
    # Section 1: Grab the feature data
    ###################################################################
    stamp = '%s' % (os.path.basename(FILE).split('.')[0])
    with open("%s" % (FILE)) as f:
        reader = csv.reader(f, delimiter=",")
        data = list(reader)
    instances_mip = [os.path.basename(line[0]).split('.')[0] for line in data[1:]]
    features_mip = [line[start:end] for line in data[1:]]
    ###################################################################
    # Section 1B: Scale the feature/performance data
    ###################################################################
    # normalize = scale to unit norm
    # maxabs_scale = scale to [-1,1]
    # scale = zero mean scaled to std one
    if scale_features == 'scale':
        features_mip = preprocessing.scale(features_mip)
    elif scale_features == 'maxabs':
        features_mip = preprocessing.maxabs_scale(features_mip)
    elif scale_features == 'minmax':
        features_mip = preprocessing.minmax_scale(features_mip)
    elif scale_features == 'normalize':
        features_mip = preprocessing.normalize(features_mip)
    elif scale_features == 'robust':
        features_mip = preprocessing.robust_scale(features_mip)
    ###################################################################
    # Section 2A: Pearson correlation heatmap
    ###################################################################
    corr = np.corrcoef(features_mip, rowvar=False)
    mask = np.zeros_like(corr, dtype=np.bool)
    mask[np.triu_indices_from(mask)] = True
    f, ax = plt.subplots(figsize=(11, 9))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    sns.heatmap(corr, mask=mask, cmap=cmap, center=0, square=True, linewidths=.5,
                cbar_kws={"shrink": .5},
                xticklabels=data[0][start:end], yticklabels=data[0][start:end])
    plt.xticks(rotation=90)
    plt.tick_params(labelsize=6)
    plt.yticks(rotation=0)
    plt.title("Feature Pearson Correlation Heatmap")
    plt.savefig('%s/Correlation Heatmap_%s.pdf' % (images_dir, stamp), bbox_inches='tight', pad_inches=0)
    if show == True:
        plt.show()
# extract data from file
docBuff = []
data = []
with open("data.txt") as dataFile:
    for row in dataFile:
        docBuff.append(row.rstrip('\n'))
for i in docBuff:
    data.append(i.split(","))

# clustering phase
k = 3
km = cluster.KMeans(n_clusters=k)
km.fit(preprocessing.maxabs_scale(data))
labels = km.labels_
centroids = km.cluster_centers_
print("k: " + str(k))
print(labels)
cluster_centers = np.sort(km.cluster_centers_, axis=0)
print(cluster_centers)

# plot clustered data
dataMod = np.array(data)
for i in range(k):
    data = dataMod[np.where(labels == i)]
    plt.plot(data[:, 0], data[:, 3], 'o')
############## logistic regression
### clf = LogisticRegression(penalty='l2', dual=False, solver='liblinear', C=1.1)
### labels = ['Sex', 'Age', 'Fare', 'SibSp', 'Parch', 'Has_Family', 'Embarked_S', 'Embarked_C', 'Embarked_Q']
###          # ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
### cv_score = cross_val_score(clf,
###                            train_data[labels],
###                            train_data['Survived'], cv=3)
### print 'logistic regression cv_score=', cv_score
### clf.fit(train_data[labels], train_data['Survived'])
### lr_predict = clf.predict(test_data[labels])
### test_data['Survived'] = lr_predict
### test_data.to_csv('lr_class.csv', columns=['PassengerId', 'Survived'], index=False)

############## SVM
labels = ['Sex', 'Age', 'Fare', 'SibSp', 'Parch', 'Has_Family', 'Embarked_S', 'Embarked_C', 'Embarked_Q']
# ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
normalized_train_data = maxabs_scale(train_data[labels].values)
normalized_test_data = maxabs_scale(test_data[labels].values)

####### exhaustive CV grid search
### parameters_grid = [{'kernel': ['rbf'], 'gamma': [1e-4, 1e-3, 1e-2, 1e-1, 1, 5, 10, 50, 100], 'C': [0.1, 0.5, 1, 5, 10, 50, 100]},
###                    {'kernel': ['sigmoid'], 'gamma': [1e-4, 1e-3, 1e-2, 1e-1, 1, 5, 10, 50, 100], 'C': [0.1, 0.5, 1, 5, 10, 50, 100], 'coef0': [1e-4, 1e-3, 1e-2, 1e-1, 1, 5, 10, 50, 100]},
###                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
### clf = SVC()
### grid_search = GridSearchCV(clf, parameters_grid, cv=3, n_jobs=2)
### grid_search.fit(normalized_train_data, train_data['Survived'])
### print grid_search.best_params_, grid_search.best_score_
####
#### clf = SVC(C=5, kernel='rbf', gamma=1)
#### cv_score = cross_val_score(clf,
####                            normalized_train_data,
####                            train_data['Survived'], cv=3)
def investigate(FILE, scale_features='maxabs', show=False,
                images_dir=cfg.finvestig_images_dir,
                data_dir=cfg.finvestig_data_dir,
                results_dir=cfg.finvestig_results_dir):
    '''Multidimensional scaling (MDS) is a technique to map high-dimensional data to lower dimensions.
    The heuristic used is to preserve pairwise distances as well as possible in this mapping.
    In this function, it is used to:
    1) Understand the dimensionality of the feature space according to MDS
    2) Visualize how the data maps to 2D and 3D
    Further functionality can be added to more closely match the goals of the Principal Component
    Analysis function.

    :param FILE: Features file. This should be in CSV format, with column 0 being the instance name
        and row zero being the names of the features.
    :param scale_features: There are various ways to scale the features data. The scaling is done
        column-wise (i.e. on each feature individually). default='maxabs'.
        - maxabs = Scale to [-1,1]
        - scale = Zero mean and unit stdev
        - minmax = Translate and scale to [0,1]
        - normalize = Normalize each feature to unit norm
        - robust = Shift outliers in according to interquartile range
    :param show: Whether to show images as they are being produced. default=False
    :param images_dir: Directory to dump images. default='../images'
    :param data_dir: Directory to dump data. default='../data'
    :param results_dir: Directory to dump results. default='../results'
    :return: Plots all returned in PDF format. Plots are automatically saved:
        - Plot 1: Information retained with respect to the dimension being mapped to.
          The x-axis is scaled heuristically; change this if the plot isn't nice.
        - Plot 2: MDS in d=2
        - Plot 3: MDS in d=2, with names of instances attached to points
        - Plot 4: MDS in d=3. This plot is not automatically saved.
    '''
    ###################################################################
    # Section 1: Grab the feature data
    ###################################################################
    stamp = '%s' % (os.path.basename(FILE).split('.')[0])
    print(stamp)
    with open("%s" % (FILE)) as f:
        reader = csv.reader(f, delimiter=",")
        data = list(reader)
    instances_mip = [os.path.basename(line[0]).split('.')[0] for line in data[1:]]
    features_mip = [line[1:] for line in data[1:]]
    ###################################################################
    # Section 1B: Scale the feature/performance data
    ###################################################################
    # normalize = scale to unit norm
    # maxabs_scale = scale to [-1,1]
    # scale = zero mean scaled to std one
    if scale_features == 'scale':
        features_mip = preprocessing.scale(features_mip)
    elif scale_features == 'maxabs':
        features_mip = preprocessing.maxabs_scale(features_mip)
    elif scale_features == 'minmax':
        features_mip = preprocessing.minmax_scale(features_mip)
    elif scale_features == 'normalize':
        features_mip = preprocessing.normalize(features_mip)
    elif scale_features == 'robust':
        features_mip = preprocessing.robust_scale(features_mip)
    ###################################################################
    # Section 2A: MDS, find the number of dimensions to map to.
    # This can take a few minutes, so feel free to grab a coffee at this point.
    ###################################################################
    # Below, "information" is defined as (1 - stress/scale).
    # The scale here is stress[dimension(2)].
    # Information is a scalar in [0,1].
    # Stress is defined as the sum of squared differences between distances in the
    # embedded space and distances in the original space.
    max_dim = int(ceil(0.1 * len(features_mip[0])))
    print()
    print("Max dimension projecting to is %s" % (max_dim))
    print()
    stress, dimension = [], []
    fig, ax = plt.subplots()
    for i in range(2, max_dim + 1):  # choose the range of dimensions to map to
        print('Projecting to dimension %s' % i)
        mds = manifold.MDS(i)  # number of dimensions to map to
        proj = mds.fit_transform(features_mip).T
        stress.append(mds.stress_)
        dimension.append(i)
    information = [1 - i / stress[0] for i in stress]
    print()
    n = 1
    for line in information:
        n += 1
        if line >= 0.7:  # set a 70% threshold for information kept
            print('%.3f information is kept at dimension %s' % (line, n))
            break
    n = 1
    for line in information:
        n += 1
        if line >= 0.9:  # set a 90% threshold for information kept
            print('%.3f information is kept at dimension %s' % (line, n))
            break
    n = 1
    for line in information:
        n += 1
        if line >= 0.95:  # set a 95% threshold for information kept
            print('%.3f information is kept at dimension %s' % (line, n))
            break
    extraticks = [0.7, 0.9, 0.95]
    plt.axhline(y=0.7, color='r', linestyle='-')
    plt.axhline(y=0.9, color='r', linestyle='-')
    plt.axhline(y=0.95, color='r', linestyle='-')
    plt.plot(dimension, information, 'bo')
    plt.yticks(list(plt.yticks()[0]) + extraticks)
    plt.ylim((-0.1, 1.1))
    plt.xlabel('Dimension')
    plt.ylabel('Information Retained')
    plt.title("MDS Normalized Retained Information")
    plt.savefig('%s/MDS_information_%s.pdf' % (images_dir, stamp), bbox_inches='tight', pad_inches=0)
    if show == True:
        plt.show()
    plt.close()
    ###################################################################
    # Section 2B: MDS, draw 2D and 3D MDS plots
    ###################################################################
    print()
    print('Currently producing some more images...')
    # Draw 2D MDS
    mds = manifold.MDS(2)  # number of dimensions to map to
    proj = mds.fit_transform(features_mip).T
    fig, ax = plt.subplots()
    plt.plot(proj[0], proj[1], 'ro')
    plt.title("Feature MDS")
    plt.xscale("symlog")
    plt.yscale("symlog")
    plt.axis('tight')
    plt.savefig('%s/MDS_%s.pdf' % (images_dir, stamp), bbox_inches='tight', pad_inches=0)
    if show == True:
        plt.show()
    plt.close()
    # Do the same as the above, just with the names of instances attached to points
    proj = mds.fit_transform(features_mip).T
    fig, ax = plt.subplots()
    for i, txt in enumerate(instances_mip):
        ax.annotate(txt, (proj[0][i], proj[1][i]))
    plt.plot(proj[0], proj[1], 'ro')
    plt.title("Feature MDS")
    plt.xscale("symlog")
    plt.yscale("symlog")
    plt.axis('tight')
    plt.savefig('%s/MDS (names)_%s.pdf' % (images_dir, stamp), bbox_inches='tight', pad_inches=0)
    if show == True:
        plt.show()
    plt.close()
    # Draw 3D MDS. 3D images will not save automatically.
    mds = manifold.MDS(3)  # number of dimensions to map to
    proj = mds.fit_transform(features_mip).T
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(proj[0], proj[1], proj[2])
    ax.set_xscale("symlog")
    ax.set_yscale("symlog")
    ax.set_zscale("symlog")
    if show == True:
        plt.show()
    plt.close()
    obj = json.loads(line)
    team = int(obj['team'])
    vec = np.zeros(num_features)
    for tuple_key, count in obj['tuples'].iteritems():
        val = count
        val = min(max(val - WEIGHT_GATE, 0), WEIGHT_CEIL)
        val = 0 if val > WEIGHT_IGNORE else val
        if BINARY_WEIGHT_PRE:
            val = 1 if val > 0 else 0
        vec[get_tuple_key(tuple_key)] = val
    team_features[team_index[team]] = vec

print team_features.shape
print team_features

print "Normalizing matrix..."
# TODO norm
maxabs_scale(team_features, axis=0, copy=False)  # scale each feature
maxabs_scale(team_features, axis=1, copy=False)  # scale each team
print team_features

csc = csc_matrix(team_features)

if DO_SVD:
    print "Computing SVD..."
    ut, s, vt = sparsesvd(csc, KEEP_SV)
    # team_feat_dense = np.dot(np.transpose(ut), np.square(np.diag(s)))
    # team_feat_dense = np.transpose(ut)
    team_feat_dense = np.dot(np.transpose(ut), np.diag(s))
    print "Dense team feature vectors:", team_feat_dense.shape
    print team_feat_dense
    actual_features = len(s)
    print "%s singular values, min: %s, max: %s" % (actual_features, min(s), max(s))
else:
import numpy as np
import pandas as pd  # needed for pd.DataFrame below
from sklearn.datasets import fetch_20newsgroups

dd = fetch_20newsgroups(subset='all')
print(dd.description)
print(dd.keys())
np.unique(dd.target, return_inverse=True)

from pprint import pprint
pprint(list(dd.target_names))
dd.data[1]
dd.target_names[dd.target[1]]

from sklearn.preprocessing import scale, robust_scale, minmax_scale, maxabs_scale
x = (np.arange(10, dtype=np.float) - 3).reshape(-1, 1)
df = pd.DataFrame(np.hstack([x, scale(x), robust_scale(x), minmax_scale(x), maxabs_scale(x)]),
                  columns=["x", "scale(x)", "robust_scale(x)", "minmax_scale(x)", "maxabs_scale(x)"])
df

from sklearn.preprocessing import normalize
x = np.vstack([np.arange(5, dtype=float) - 20, np.arange(5, dtype=float) - 2]).T
y1 = scale(x)
y2 = normalize(x)
print("original x:\n", x)
print("scale:\n", y1)
print("norms (scale)\n", np.linalg.norm(y1, axis=1))
print("normalize:\n", y2)
print("norms (normalize)\n", np.linalg.norm(y2, axis=1))