import os
import sys
import pickle

import numpy as np
from scipy.spatial import distance
from sklearn.cluster import MiniBatchKMeans
from sklearn import cross_validation

#NOTE: dSu, dR, dW, dA and silhouette are project-specific helpers that are
#assumed to be imported from the surrounding package


def partialFitMiniBatchKmeans(training_file,categorical_features,label,maxLines,ks):
    '''
    Incrementally fits a MiniBatchKMeans model with ks clusters, reading the
    training file in chunks of at most maxLines complete (non-NA) rows.
    The columns matching the categorical features and the label are excluded.
    '''
    stop=False
    cont=0
    km=None
    with open(training_file,'r') as f:
        while stop==False:
            print('Training section - reading data from file ...')
            if cont==0:
                header=f.readline().rstrip().split(',')
                idx=dSu.findMultiple(header, categorical_features)
                idx+=dSu.listStrFind(header,label)
            cont+=1

            data=[]
            stop=True
            print(cont)
            for line in f:
                stop=False
                temp=line.rstrip().split(',')
                if dSu.listStrFind(temp,'NA')==[]:
                    temp=[float(temp[i]) for i in range(len(temp)) if not i in idx]
                    data.append(temp)
                if len(data)==maxLines:
                    break
            if stop==False:
                #Create the model only once and keep updating it with partial_fit;
                #otherwise each chunk would overwrite the previously fitted model
                if km is None:
                    km=MiniBatchKMeans(init='k-means++', n_clusters=ks, batch_size=len(data),
                            n_init=10, max_no_improvement=10, verbose=0)
                km.partial_fit(data)
    return km
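
#A minimal usage sketch (the csv name and column names below are only
#illustrative assumptions):
#
#   km=partialFitMiniBatchKmeans('train.csv',['gender','site'],'label',
#                                maxLines=1000,ks=5)
#   print(km.cluster_centers_.shape)   #(5, number of numeric columns)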


def crossvalidationSPF(trainFile,data_path,folds):
    '''
    This function takes a training file and splits it into training
    and testing data sets that are stored under a folder called temp_cv
    in the data path provided. The number of files depends on the folds provided
    '''
    #All this section should be a function
    with open(trainFile,'r') as f:
        data=[]
        for line in f:
            data.append(line)
    temp_path=data_path+'temp_cv'
    try:
        os.mkdir(temp_path)
    except OSError:
        print('folder already exists, will not create crossvalidation files')
        files=os.listdir(temp_path)
        idx=dSu.listStrFind(files, 'train')
        training_files=[ temp_path+'/'+files[i] for i in idx]
        idx=dSu.listStrFind(files, 'test')
        testing_files=[ temp_path+'/'+files[i] for i in idx]
        return training_files,testing_files

    #Drop the header
    header=data.pop(0)

    #Create the crossvalidation files
    kf=cross_validation.KFold(len(data),n_folds=folds)
    cont=1

    training_files=[]
    testing_files=[]
    for traf,tesf in kf:
        temp_train_filename=temp_path+'/'+'cv_train%d.csv'%(cont)
        temp_test_filename=temp_path+'/'+'cv_test%d.csv'%(cont)
        cont+=1
        temp_file=open(temp_train_filename,'w')
        training_files.append(temp_train_filename)
        temp_file.write(header)
        for i in traf:
            temp_file.write(data[i])
        temp_file.close()
        temp_file=open(temp_test_filename,'w')
        testing_files.append(temp_test_filename)
        temp_file.write(header)
        for i in tesf:
            temp_file.write(data[i])
        #Close the testing file as well so the last lines are flushed to disk
        temp_file.close()

    print('Kfold-crossvalidation files stored')
    print(training_files)
    print(testing_files)

    return training_files,testing_files
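
#A minimal usage sketch (paths are only illustrative; note that data_path is
#concatenated directly with 'temp_cv', so it should end with a slash):
#
#   train_files,test_files=crossvalidationSPF('data/train.csv','data/',folds=5)
#   #creates data/temp_cv/cv_train1.csv ... cv_test5.csv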


def signalsLoader(path):
    '''
    Loads every csv file found in path and returns a dictionary with the
    signal data (as numpy arrays) and the corresponding file names.
    '''
    filtering=False  #reserved flag, currently unused

    listOfFiles=os.listdir(path)
    inds=dSu.listStrFind(listOfFiles, '.csv')
    listOfFiles=[ listOfFiles[i] for i in inds]

    signals={'data':[],'filename':[]}

    for filename in listOfFiles:
        filepath='%s%s'%(path,filename)
        signals['data'].append(dR.csvReader(filepath, ',', 0)['data'])
        signals['filename'].append(filename)
        #The first four columns of every row are numeric, cast them to float
        for i in range(len(signals['data'][-1])):
            temp=signals['data'][-1][i]
            for i2 in range(4):
                signals['data'][-1][i][i2]=float(temp[i2])

    #Convert each loaded signal to a numpy array once all files have been read
    for i in range(len(signals['data'])):
        signals['data'][i]=np.array(signals['data'][i])
    return signals
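
#A minimal usage sketch (the folder name is only illustrative; path is
#concatenated directly with the file name, so it should end with a slash):
#
#   signals=signalsLoader('signals/')
#   print(signals['filename'][0], signals['data'][0].shape)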


def dataExtract(filename):
    '''
    Extracts the data for the activity recognition study
    '''
    tempData=dR.csvReader(filename)
    data=[]
    labels=[]

    features=tempData['header'][2][3:]

    for vals in tempData['data']:
        data.append([float(i) for i in vals[3:]])
        labels.append(int(vals[0]))

    #Excluding missing data and the missing-data indicator features
    #(the indicator columns are the ones whose names start with 'm_')
    inds=dSu.listStrFind(features, 'm_')
    ind=min(inds)

    #Missing data extraction: keep only the rows whose 'm_' indicator columns sum to 8
    features=features[:ind]
    data=np.array(data)
    missingData=np.sum(data[:,ind:-1],1)
    rows=np.argwhere(missingData==8).ravel()
    data=data[rows,:ind]
    labels=np.array(labels)
    labels=labels[rows]
    return data,labels,features
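
#A minimal usage sketch (the file name is only illustrative):
#
#   data,labels,features=dataExtract('activity_data.csv')
#   print(data.shape, labels.shape, len(features))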


def freqsF(data,pattern):
    """
      >>> freqsF([1,1,1,1,2],1)
      4
      >>> freqsF([1,1,1,1,2],2)
      1
      >>> freqsF([1,5,5,1,2],5)
      2
      >>> freqsF([1,5,5,1,2],6)
      0
    """
    freqs=list()
    indices=list()
    temp=list()
    results=list()

    if type(pattern) is list:
        if not pattern:
            print('pattern is empty, stopping execution')
            sys.exit()

        if type(pattern[0]) is unicode:
            #A list of strings: count the occurrences of each string in data
            results={}
            for pat in pattern:
                results[pat]=len(dSu.listStrFind(data,pat))
            return results

        if not(type(pattern[0])is int):
            #A list of sub-sequences: find where each sub-sequence occurs in data
            for i in range(0,len(pattern)):
                temp=[item for item in range(len(data)-len(pattern[i])) if data[item:item+len(pattern[i])]==pattern[i]]
                freqs.append(len(temp))
                indices.append(temp)

            results={'indices':indices,'counts':freqs}
            return results
        else :
            #A list of integers: find where each value occurs in data
            for i in range(0,len(pattern)):
                temp=[item for item in range(len(data)) if data[item]==pattern[i]]
                freqs.append(len(temp))
                indices.append(temp)
            results={'indices':indices,'counts':freqs}
            return results
#     if type(pattern) is int or type(pattern) is float:
    else:
        #A single value: simply count its occurrences in data
        return sum([1 for i in data if i==pattern])
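
#A minimal usage sketch of the list-of-integers branch (values are only illustrative):
#
#   freqsF([1,5,5,1,2],[1,5])
#   #{'indices': [[0, 3], [1, 2]], 'counts': [2, 2]}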
         
def dropUnlistedIds(dataPath,filesList,Ids):
    #NOTE: the enclosing function definition was missing from this fragment;
    #the name and signature above are assumptions reconstructed from the
    #variable names used in the body (dataPath, filesList, Ids)
    for file in filesList:
        Inds=[]
        filename=dataPath+'/'+file
        dataset=pickle.load(open(filename,'rb'))
        #Collect the column indices whose Id is not in the reference Ids list
        for i in range(len(dataset['Ids'])):
            if dataset['Ids'][i] not in Ids:
                Inds.append(i)
        #Delete those columns, going backwards so the indices stay valid
        for i in range(len(Inds)-1,-1,-1):
            dataset['data']=np.delete(dataset['data'],Inds[i],1)
        dataset['Ids']=Ids
         
        indsMissing=dSu.listStrFind(dataset['header'], 'm_')
        indsMissing=dSu.findMultiple(dataset['Ids'], indsMissing)
         
        if Inds!=[]:
            pickle.dump(dataset,open(filename,'wb'))
    print('Done modifying the data files')
             
         
             
def activityHistograms(dataPath,filesList,activity):
    #NOTE: reconstructed wrapper; the original function name and signature were
    #not part of this fragment and are assumed
    for file in filesList:
        filename=dataPath+'/'+file
        dataset=pickle.load(open(filename,'rb'))
        inds=dSu.find(dataset['labels'], lambda x:x==activity)
        hist=dA.partialhist()

    
def missingEM(filename,temp_path,maxK,categorical_features,label,kfolds=2):
    '''
    This function uses EM to fill in missing values in a data set.
    First it fits mini-batch kmeans on the data set without the missing data.
    This is done in a crossvalidation fashion so that the best k can be
    selected. Once the best k is selected a new kmeans model is estimated,
    and once the centroids are obtained their values replace the missing
    data accordingly.
    '''
    #Values higher than this will cause the silhouette to run too slowly.
    #I should probably modify the silhouette: it is currently not very well
    #implemented, and just taking the average is actually wrong.
    #Also remember the gap statistic, maybe I should implement it
    
    maxLines=1000
    
    filesList=os.listdir(temp_path)
    if dSu.listStrFind(filesList, 'meanSils') ==[]:
        print('Pickle not found, starting the crossvalidated run to determine the number of clusters k')
        training_files,testing_files=crossvalidationSPF(filename,temp_path,kfolds)
        meanSils=[[0 for i2 in range(kfolds)]for i in range(0,maxK)]
        for fId in range(len(training_files)):
            data=[]
            km=[[]for i in range(maxK)]
            sils=[[] for i in range(maxK)]
            #Training section for each of the crossvalidation files:
            #in this section we fit the kmeans models for all of the different
            #k's; once we have all of the models we compute the silhouette
            #score on the testing data
            cont=0
            stop=False
            with open(training_files[fId],'r') as f:
                while stop==False:
                    print('Training section - reading data from file ...')
                    if cont==0:
                        header=f.readline().rstrip().split(',')
                        idx=dSu.findMultiple(header, categorical_features)
                        idx+=dSu.listStrFind(header,label)
                    cont+=1
                    
                    data=[]
                    stop=True
                    print(cont)
                    for line in f:
                        stop=False
                        temp=line.rstrip().split(',')
                        if dSu.listStrFind(temp,'NA')==[]:
                            temp=[float(temp[i]) for i in range(len(temp)) if not i in idx]
                            data.append(temp)
                        if len(data)==maxLines:
                            break
                    if stop==False:
                        for kId in range(2,maxK):
                            if km[kId]==[]:
                                km[kId]=MiniBatchKMeans(init='k-means++', n_clusters=kId, batch_size=len(data),
                                n_init=10, max_no_improvement=10, verbose=0)
                            km[kId].partial_fit(data)
                        
    #                 print('Adding the next piece of code to terminate early')
    #                 break
                
                #Testing section
                #Here now that we have the models we simply test them
                #however we store the values and later we average them
            print(temp)
            cont=0
            stop=False
            with open(testing_files[fId],'r') as f:
                while stop==False:
                    print('Testing section reading data from file ...')
                    if cont==0:
                        header=f.readline().rstrip().split(',')
                        idx=dSu.findMultiple(header, categorical_features)
                        idx+=dSu.listStrFind(header,label)
                    cont+=1
                    data=[]
                    stop=True
                    for line in f:
                        stop=False
                        temp=line.rstrip().split(',')
                        if dSu.listStrFind(temp,'NA')==[]:                
                            temp=[float(temp[i]) for i in range(len(temp)) if not i in idx]
                            data.append(temp)
                        if len(data)==maxLines:
                            break
                    if stop==False:
                        for kId in range(2,len(km)):
                            labels=km[kId].predict(data)
                            print('Computing silhouette for %d'%(kId))    
                            sils[kId].append(silhouette(data,labels,metric='correlation'))
    #                 print('Adding the next piece of code to terminate early')
    #                 break
                
            
            for kId in range(2,len(km)):
                meanSils[kId][fId]=np.mean(sils[kId])
        print(meanSils)
        pickle.dump(meanSils,open(temp_path+'/'+'meanSils','wb'))
        print('remember the results were stored in %s'%(temp_path+'/'+'meanSils'))
        print('kmeans models built')
    elif dSu.listStrFind(filesList, 'meanSils') !=[]:
        print('A pickle meanSils was found in %s, proceeding with the missing data imputation'%(temp_path))
        meanSils=pickle.load(open(temp_path+'/'+'meanSils','rb'))
        #The entries for k=0 and k=1 were never filled in, so drop them and
        #shift the scores by +1 (the silhouette lies in [-1,1]) before averaging
        meanSils=np.array(meanSils[2:])+1
        aveMeanSils=np.mean(meanSils,1)
        ind=np.argmax(aveMeanSils)
        k=ind+2
        print('The best k is %d'%(k))
        #In the next section I build the kmeans model using all of the training
        #data available; it does not matter that it is not crossvalidated here
        cont=0
        stop=False
        kmns=partialFitMiniBatchKmeans(filename,categorical_features,label,maxLines,k)
        with open(filename,'r') as f:
                while stop==False:
                    print('Imputing missing data')
                    data=[]
                    
                    if cont==0:
                        header=f.readline().rstrip().split(',')
                        #Here I collect the categorical feature ids so that they
                        #are not considered during the imputation; they are added
                        #back to each row afterwards, so there is probably no need
                        #to remove them from the header.
                        #TODO: finish checking that the data is being stored
                        #correctly before moving forward
                        
                        idx=dSu.findMultiple(header, categorical_features)
                        idx+=dSu.listStrFind(header,label)
                        data=[header]
                    cont+=1
                    stop=True
                    print(cont)
                    for line in f:
                        stop=False
                        temp=line.rstrip().split(',')
                        temp=[temp[i] for i in range(len(temp)) if i not in idx]
                        if dSu.listStrFind(temp,'NA'):
                            vec=[ [int(key),float(val)] for (key,val) in enumerate(temp) if val!='NA']
                            vecs=np.array([i[1] for i in vec])
                            idVec=[i[0] for i in vec]
                            cents=kmns.cluster_centers_
                            dist=[]
                            for i in range(np.shape(cents)[0]):
                                tempVec=[ cents[i,i2] for i2 in idVec]
                                dist.append(distance.euclidean(tempVec,vecs))
                            ind=np.argmin(dist)
                            
                            #Now replacing here the missing data
                            inds=dSu.listStrFind(temp, 'NA')
                            for i in inds:
                                temp[i]=str(cents[ind,i])
                            #Adding the categorical data back
                            lineTemp=line.rstrip().split(',')
                            for i in idx:
                                temp.insert(i,lineTemp[i])
                            data.append(temp)
                        else:
                            #Adding the categorical data back
                            lineTemp=line.rstrip().split(',')
                            for i in idx:
                                temp.insert(i,lineTemp[i])
                            data.append(temp)
                            
                        if len(data)==maxLines:
                            #Here I have to write or append to a file the array data
                            if cont==1:
                                writeState='w'
                            else:
                                writeState='a'
                            dW.writeLoL2csv(data,filename=filename[:-4]+'noNA.csv', writeState=writeState)
                            #Clear the buffer so this chunk is not written twice below
                            data=[]
                            break
                    #Write out whatever is left in the buffer: the last chunk is
                    #usually smaller than maxLines and would otherwise never be written
                    if data:
                        if cont==1:
                            writeState='w'
                        else:
                            writeState='a'
                        dW.writeLoL2csv(data,filename=filename[:-4]+'noNA.csv', writeState=writeState)
        print('Imputed missing data')
        print('Files with the extension noNA.csv were added to your working folder')
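
#A minimal usage sketch (paths, column names and maxK below are only
#illustrative; the function is called twice: the first call builds the
#crossvalidated meanSils pickle, the second one performs the imputation):
#
#   missingEM('data/train.csv','data/',maxK=10,
#             categorical_features=['gender','site'],label='label',kfolds=2)
#   missingEM('data/train.csv','data/',maxK=10,
#             categorical_features=['gender','site'],label='label',kfolds=2)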