def partialFitMiniBatchKmeans(training_file, categorical_features, label, maxLines, ks):
    '''
    Fits a MiniBatchKMeans model on a CSV file read in chunks of maxLines rows,
    skipping the categorical features, the label column and any row containing 'NA'.
    '''
    stop = False
    cont = 0
    km = []
    with open(training_file, 'r') as f:
        while stop == False:
            print('Training section - reading data from file ...')
            if cont == 0:
                header = f.readline().rstrip().split(',')
                idx = dSu.findMultiple(header, categorical_features)
                idx += dSu.listStrFind(header, label)
            cont += 1
            data = []
            stop = True
            print(cont)
            for line in f:
                stop = False
                temp = line.rstrip().split(',')
                if dSu.listStrFind(temp, 'NA') == []:
                    temp = [float(temp[i]) for i in range(len(temp)) if not i in idx]
                    data.append(temp)
                if len(data) == maxLines:
                    break
            if stop == False:
                # Create the model on the first batch only, then keep updating it with
                # partial_fit so every mini-batch contributes to the same model
                if km == []:
                    km = MiniBatchKMeans(init='k-means++', n_clusters=ks,
                                         batch_size=len(data), n_init=10,
                                         max_no_improvement=10, verbose=0)
                km.partial_fit(data)
    return km
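
# A minimal usage sketch for partialFitMiniBatchKmeans (not called anywhere in this
# module). The CSV path, the categorical column names and the label name below are
# placeholders, not values from this project.
def _demoPartialFitMiniBatchKmeans():
    km = partialFitMiniBatchKmeans('training_data.csv',
                                   categorical_features=['gender', 'site'],
                                   label='activity',
                                   maxLines=1000,
                                   ks=5)
    # The fitted model exposes the centroids learned over all mini-batches
    print(km.cluster_centers_)
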

def crossvalidationSPF(trainFile, data_path, folds):
    '''
    This function takes a training file and splits it into training and testing
    data sets that are stored under a folder called temp_cv in the data path
    provided. The number of files depends on the folds provided.
    '''
    # All this section should be a function
    with open(trainFile, 'r') as f:
        data = []
        for line in f:
            data.append(line)
    temp_path = data_path + 'temp_cv'
    try:
        os.mkdir(temp_path)
    except:
        # The folder is already there, so reuse the crossvalidation files it contains
        print('folder already exists, will not create crossvalidation files')
        files = os.listdir(temp_path)
        idx = dSu.listStrFind(files, 'train')
        training_files = [temp_path + '/' + files[i] for i in idx]
        idx = dSu.listStrFind(files, 'test')
        testing_files = [temp_path + '/' + files[i] for i in idx]
        return training_files, testing_files
    # Drop the header
    header = data.pop(0)
    # Create the crossvalidation files
    kf = cross_validation.KFold(len(data), n_folds=folds)
    cont = 1
    training_files = []
    testing_files = []
    for traf, tesf in kf:
        temp_train_filename = temp_path + '/' + 'cv_train%d.csv' % (cont)
        temp_test_filename = temp_path + '/' + 'cv_test%d.csv' % (cont)
        cont += 1
        temp_file = open(temp_train_filename, 'w')
        training_files.append(temp_train_filename)
        temp_file.write(header)
        for i in traf:
            temp_file.write(data[i])
        temp_file.close()
        temp_file = open(temp_test_filename, 'w')
        testing_files.append(temp_test_filename)
        temp_file.write(header)
        for i in tesf:
            temp_file.write(data[i])
        temp_file.close()
    print('Kfold-crossvalidation files stored')
    print(training_files)
    print(testing_files)
    return training_files, testing_files
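
# A minimal usage sketch for crossvalidationSPF (not called here). The training file
# and data path are placeholders; note that the data path has to end with a separator
# because temp_path is built by plain string concatenation.
def _demoCrossvalidationSPF():
    training_files, testing_files = crossvalidationSPF('training_data.csv', './data/', 5)
    print(training_files)  # ./data/temp_cv/cv_train1.csv ... cv_train5.csv
    print(testing_files)   # ./data/temp_cv/cv_test1.csv ... cv_test5.csv
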

def signalsLoader(path):
    filtering = False
    listOfFiles = os.listdir(path)
    inds = dSu.listStrFind(listOfFiles, '.csv')
    listOfFiles = [listOfFiles[i] for i in inds]
    signals = {'data': [], 'filename': []}
    for filename in listOfFiles:
        filepath = '%s%s' % (path, filename)
        signals['data'].append(dR.csvReader(filepath, ',', 0)['data'])
        signals['filename'].append(filename)
        for i in range(len(signals['data'][-1])):
            temp = signals['data'][-1][i]
            for i2 in range(4):
                signals['data'][-1][i][i2] = float(temp[i2])
    for i in range(len(signals['data'])):
        signals['data'][i] = np.array(signals['data'][i])
    return signals
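
# A minimal usage sketch for signalsLoader (not called here). The directory is a
# placeholder; it has to end with a separator because the file path is built by
# string formatting, and every .csv file inside it is loaded as one signal.
def _demoSignalsLoader():
    signals = signalsLoader('./signals/')
    for name, arr in zip(signals['filename'], signals['data']):
        print(name, np.shape(arr))
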

def dataExtract(filename):
    '''
    Extracts the data for the activity recognition study
    '''
    tempData = dR.csvReader(filename)
    data = []
    labels = []
    features = tempData['header'][2][3:]
    for vals in tempData['data']:
        data.append([float(i) for i in vals[3:]])
        labels.append(int(vals[0]))
    # Excluding missing data and the missing features
    inds = dSu.listStrFind(features, 'm_')
    ind = min(inds)
    # Missing data extraction
    features = features[:ind]
    data = np.array(data)
    missingData = np.sum(data[:, ind:-1], 1)
    rows = np.argwhere(missingData == 8).ravel()
    data = data[rows, :ind]
    labels = np.array(labels)
    labels = labels[rows]
    return data, labels, features
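
# A minimal usage sketch for dataExtract (not called here). The file name is a
# placeholder for an activity-recognition CSV with the layout this function expects:
# the label in column 0, features from column 3 onward, and missing-data indicator
# columns whose names start with 'm_'.
def _demoDataExtract():
    data, labels, features = dataExtract('activity_data.csv')
    print(np.shape(data), np.shape(labels), len(features))
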
def freqsF(data,pattern): """ >>> freqsF([1,1,1,1,2],1) 4 >>> freqsF([1,1,1,1,2],2) 1 >>> freqsF([1,5,5,1,2],5) 2 >>> freqsF([1,5,5,1,2],6) 1 """ freqs=list(); indices=list() temp=list() results=list() if type(pattern) is list: if not pattern: print('pattern is empty, stopping execution') sys.exit() if type(pattern[0]) is unicode: results={} for pat in pattern: results[pat]=len(dataSu.listStrFind(data,pat)) return results if not(type(pattern[0])is int): for i in range(0,len(pattern)): temp=[item for item in range(len(data)-len(pattern[i])) if data[item:item+len(pattern[i])]==pattern[i]] freqs.append(len(temp)) indices.append(temp) results={'indices':indices,'counts':freqs} return results else : for i in range(0,len(pattern)): indices=[item for item in range(len(data)) if data[item]==pattern[i]] freqs.append(len(temp)) indices.append(temp) results={'indices':indices,'counts':freqs} return results # if type(pattern) is int or type(pattern) is float: else: return sum([1 for i in data if i==pattern])

for file in filesList:
    Inds = []
    filename = dataPath + '/' + file
    dataset = pickle.load(open(filename, 'rb'))
    # Collect the columns whose Id is not in the master Ids list
    for i in range(len(dataset['Ids'])):
        if dataset['Ids'][i] not in Ids:
            Inds.append(i)
    # Delete those columns from the data matrix, back to front so the
    # remaining indices stay valid
    for i in range(len(Inds) - 1, -1, -1):
        dataset['data'] = np.delete(dataset['data'], Inds[i], 1)
    dataset['Ids'] = Ids
    indsMissing = dSu.listStrFind(dataset['header'], 'm_')
    indsMissing = dSu.findMultiple(dataset['Ids'], indsMissing)
    if Inds != []:
        pickle.dump(dataset, open(filename, 'wb'))
print('Done modifying the data files')

for file in filesList:
    filename = dataPath + '/' + file
    dataset = pickle.load(open(filename, 'rb'))
    inds = dSu.find(dataset['labels'], lambda x: x == activity)
    hist = dA.partialhist()

def missingEM(filename, temp_path, maxK, categorical_features, label, kfolds=2):
    '''
    This function uses EM to fill in missing values in a data set. First it runs
    mini-batch kmeans on the data set without the missing data. This is done in
    crossvalidation fashion so that the best k can be selected. Once the best k is
    selected a new kmeans model is estimated; once the centroids are obtained,
    their values replace the missing data accordingly.
    '''
    # Values higher than this will cause the silhouette to run too slow.
    # I should probably modify the silhouette: currently it is not very well
    # implemented, and just taking the average is actually wrong.
    # Also remember about the gap statistic; maybe I should implement it.
    maxLines = 1000
    filesList = os.listdir(temp_path)
    if dSu.listStrFind(filesList, 'meanSils') == []:
        print('Pickle not found, starting the crossvalidated EM to determine the number of k')
        training_files, testing_files = crossvalidationSPF(filename, temp_path, kfolds)
        meanSils = [[0 for i2 in range(kfolds)] for i in range(0, maxK)]
        for fId in range(len(training_files)):
            data = []
            km = [[] for i in range(maxK)]
            sils = [[] for i in range(maxK)]
            # Training section for all of the data files.
            # In this section we fit the kmeans models for all of the different k's;
            # once we have all of the models we get the silhouette score on the
            # testing data.
            cont = 0
            stop = False
            with open(training_files[fId], 'r') as f:
                while stop == False:
                    print('Training section - reading data from file ...')
                    if cont == 0:
                        header = f.readline().rstrip().split(',')
                        idx = dSu.findMultiple(header, categorical_features)
                        idx += dSu.listStrFind(header, label)
                    cont += 1
                    data = []
                    stop = True
                    print(cont)
                    for line in f:
                        stop = False
                        temp = line.rstrip().split(',')
                        if dSu.listStrFind(temp, 'NA') == []:
                            temp = [float(temp[i]) for i in range(len(temp)) if not i in idx]
                            data.append(temp)
                        if len(data) == maxLines:
                            break
                    if stop == False:
                        for kId in range(2, maxK):
                            if km[kId] == []:
                                km[kId] = MiniBatchKMeans(init='k-means++', n_clusters=kId,
                                                          batch_size=len(data), n_init=10,
                                                          max_no_improvement=10, verbose=0)
                            km[kId].partial_fit(data)
                    # print('Adding the next piece of code to terminate early')
                    # break
            # Testing section.
            # Now that we have the models we simply test them; however we store the
            # values and later we average them.
            print(temp)
            cont = 0
            stop = False
            with open(testing_files[fId], 'r') as f:
                while stop == False:
                    print('Testing section - reading data from file ...')
                    if cont == 0:
                        header = f.readline().rstrip().split(',')
                        idx = dSu.findMultiple(header, categorical_features)
                        idx += dSu.listStrFind(header, label)
                    cont += 1
                    data = []
                    stop = True
                    for line in f:
                        stop = False
                        temp = line.rstrip().split(',')
                        if dSu.listStrFind(temp, 'NA') == []:
                            temp = [float(temp[i]) for i in range(len(temp)) if not i in idx]
                            data.append(temp)
                        if len(data) == maxLines:
                            break
                    if stop == False:
                        for kId in range(2, len(km)):
                            labels = km[kId].predict(data)
                            print('Computing silhouette for %d' % (kId))
                            sils[kId].append(silhouette(data, labels, metric='correlation'))
                    # print('Adding the next piece of code to terminate early')
                    # break
            for kId in range(2, len(km)):
                meanSils[kId][fId] = np.mean(sils[kId])
            print(meanSils)
        pickle.dump(meanSils, open(temp_path + '/' + 'meanSils', 'wb'))
        print('remember the results were stored in %s' % (temp_path + '/' + 'meanSils'))
        print('kmean models built')
    elif dSu.listStrFind(filesList, 'meanSils') != []:
        print('A pickle meanSils was found in %s, proceeding with the missing data imputation' % (temp_path))
        meanSils = pickle.load(open(temp_path + '/' + 'meanSils', 'rb'))
    # Pick the k with the best average silhouette (k starts at 2, hence the offsets)
    meanSils = np.array(meanSils[2:][:]) + 1
    aveMeanSils = np.mean(meanSils, 1)
    ind = np.argmax(aveMeanSils)
    k = ind + 2
    print('The best k is %d' % (k))
    # In the next section I build the kmeans model using all of the training data
    # available; it does not matter that it is not crossvalidated or anything like that.
    cont = 0
    stop = False
    kmns = partialFitMiniBatchKmeans(filename, categorical_features, label, maxLines, k)
    with open(filename, 'r') as f:
        while stop == False:
            print('Imputing missing data')
            data = []
            if cont == 0:
                header = f.readline().rstrip().split(',')
                # Here I'm getting the ids of the categorical features and the label
                # so that later I do not consider them; however I have to add them
                # back to the data once the missing imputation has been worked out,
                # hence there is probably no need to remove them from the header.
                # Also finish checking that the data is being stored correctly
                # before moving forward.
                idx = dSu.findMultiple(header, categorical_features)
                idx += dSu.listStrFind(header, label)
                data = [header]
            cont += 1
            stop = True
            print(cont)
            for line in f:
                stop = False
                temp = line.rstrip().split(',')
                temp = [temp[i] for i in range(len(temp)) if i not in idx]
                if dSu.listStrFind(temp, 'NA'):
                    # Keep only the observed values of this row and find the closest
                    # centroid using just those coordinates
                    vec = [[int(key), float(val)] for (key, val) in enumerate(temp) if val != 'NA']
                    vecs = np.array([i[1] for i in vec])
                    idVec = [i[0] for i in vec]
                    cents = kmns.cluster_centers_
                    dist = []
                    for i in range(np.shape(cents)[0]):
                        tempVec = [cents[i, i2] for i2 in idVec]
                        dist.append(distance.euclidean(tempVec, vecs))
                    ind = np.argmin(dist)
                    # Now replacing here the missing data with the centroid values
                    inds = dSu.listStrFind(temp, 'NA')
                    for i in inds:
                        temp[i] = str(cents[ind, i])
                    # Adding the categorical data back
                    for i in idx:
                        lineTemp = line.rstrip().split(',')
                        temp.insert(i, lineTemp[i])
                    data.append(temp)
                else:
                    # Adding the categorical data back
                    for i in idx:
                        lineTemp = line.rstrip().split(',')
                        temp.insert(i, lineTemp[i])
                    data.append(temp)
                if len(data) == maxLines:
                    break
                # Erase this break later
                # break
            # Here I write or append the accumulated batch to the output file; the
            # final, possibly partial, batch at the end of the file is written too.
            # file=open(filename[:-4]+'noNA.dat','a')
            if data:
                if cont == 1:
                    writeState = 'w'
                else:
                    writeState = 'a'
                dW.writeLoL2csv(data, filename=filename[:-4] + 'noNA.csv',
                                writeState=writeState)
    print('Imputed missing data')
    print('Files with the extension -noNA.csv were added to your working folder')
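
# A minimal usage sketch for missingEM (not called here). The file name, the temp
# folder, the column names and maxK are placeholders. The crossvalidated silhouette
# scores are cached as a 'meanSils' pickle inside temp_path, so repeated calls skip
# the crossvalidation and go straight to picking k and imputing; the imputed rows
# are written next to the input file with a noNA.csv suffix.
def _demoMissingEM():
    missingEM('training_data.csv', './data/temp/', maxK=10,
              categorical_features=['gender', 'site'], label='activity', kfolds=2)
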