def clusteringA(clustMeta, dir_c, filenames):
    """Cluster the question data, match the clusters back to it, and
    predict cluster membership for a sample of PCA-projected points.

    Side effects: reads the CSV files named in `filenames` plus
    dir_c + 'pca.csv', and writes dir_c + 'predictions.csv'.

    NOTE(review): an identical definition of this function appears again
    later in the file and shadows this one -- confirm which copy is
    canonical and delete the other.
    """
    #os.mkdir(dir_c, 0777)
    stats, dfn = clustering(clustMeta, filenames)

    # Match the computed clusters to the raw question data.
    print('Opening a file with the data on the questions')
    questions_df = pd.read_csv(filenames['input'])
    print('Matching the data on the questions with the clusters')
    matchClusters(dir_c, questions_df, stats, dfn, filenames['out'])

    #TODO prediction using clusters n
    pca_df = pd.read_csv(dir_c + 'pca.csv', header=None)
    sample = pca_df[0:50]  # first 50 PCA rows serve as the query set
    print(len(sample))

    n_neighbors = 3
    stats_df = pd.read_csv(filenames['stats'])
    clusters_df = pd.read_csv(filenames['clusters'])

    # Nearest-neighbour lookup over the 2-D cluster coordinates.
    neigh = NearestNeighbors(n_neighbors=n_neighbors)
    neigh.fit(stats_df[['x','y']])
    closest = neigh.kneighbors(sample)  #TODO: dimension mismatching

    predictions = cp.calcAccuracy(stats_df, closest, clusters_df, n_neighbors, sample)
    csr.to_csv(predictions, dir_c + 'predictions.csv')
    for prediction in predictions:
        print(prediction)
def clusteringA(clustMeta, dir_c, filenames):
    """Run clustering over the question data, attach clusters to the raw
    rows, then classify a 50-row PCA sample by nearest cluster centre.

    Side effects: reads the CSVs referenced by `filenames` and
    dir_c + 'pca.csv'; writes dir_c + 'predictions.csv'.

    NOTE(review): this duplicates an earlier, identical definition in the
    same file (this later one wins at import time) -- confirm intent and
    remove the redundant copy.
    """
    #os.mkdir(dir_c, 0777)
    cluster_stats, dfn = clustering(clustMeta, filenames)

    # Join the cluster assignments onto the question records.
    print('Opening a file with the data on the questions')
    input_df = pd.read_csv(filenames['input'])
    print('Matching the data on the questions with the clusters')
    matchClusters(dir_c, input_df, cluster_stats, dfn, filenames['out'])

    #TODO prediction using clusters n
    pca = pd.read_csv(dir_c + 'pca.csv', header=None)
    query_points = pca[0:50]  # query set: leading 50 PCA rows
    print(len(query_points))

    n_neighbors = 3
    centres = pd.read_csv(filenames['stats'])
    assignments = pd.read_csv(filenames['clusters'])

    # Index the (x, y) cluster centres for k-NN queries.
    knn = NearestNeighbors(n_neighbors=n_neighbors)
    knn.fit(centres[['x', 'y']])
    closest = knn.kneighbors(query_points)  #TODO: dimension mismatching

    results = cp.calcAccuracy(centres, closest, assignments, n_neighbors, query_points)
    csr.to_csv(results, dir_c + 'predictions.csv')
    for row in results:
        print(row)
def runMinibatch(minibatch, cls_stats, classifiers, all_classes, losses1, losses2, x1, x2):
    """Fit every classifier on each mini-batch and dump accuracy history.

    For each (features, labels) pair yielded by `minibatch`, the data is
    split 80/20 into train/test, every classifier in `classifiers` is
    fitted on the training part, and per-classifier timing statistics are
    accumulated into `cls_stats` via accStats().  Afterwards each
    classifier's accuracy history is written out to
    'online_learning_accuracy.csv'.

    NOTE(review): `all_classes`, `losses1`, `losses2`, `x1` and `x2` are
    currently unused -- all their uses were commented out in the
    original; confirm before pruning the parameters.
    """
    for batch_idx, (df_small, y_small) in enumerate(minibatch):
        tick = time.time()
        #TODO calculate features for df_small
        X_train, X_test, y_train, y_test = train_test_split(
            df_small, y_small.astype("int0"), test_size=0.20, random_state=0)
        data = dict(x_train=X_train, x_test=X_test, y_train=y_train, y_test=y_test)

        for cls_name, cls in classifiers.items():
            # Book-keeping: time spent outside fitting since the last tick.
            cls_stats[cls_name]['another_time'] += time.time() - tick
            tick = time.time()

            # Update this estimator with the current mini-batch.
            if cls_name == 'DBN':
                # NOTE(review): dataNormalise() replaces `data`, so any
                # classifier iterated after 'DBN' within this batch sees
                # the normalised split -- confirm this is intended.
                data = dataNormalise(data)
                clf = DBN([data['x_train'].shape[1], 300, 2],
                          learn_rates=0.3,
                          learn_rate_decays=0.9,
                          epochs=10,
                          verbose=1)
                clf.fit(data['x_train'], data['y_train'])
            else:
                clf = classifiers[cls_name].fit(data['x_train'], data['y_train'])

            # Accumulate timing/accuracy statistics for this classifier.
            accStats(tick, clf, cls_stats, cls_name, data)

    # Export each classifier's accuracy history.  NOTE(review): the same
    # file is rewritten once per classifier, so only the last
    # classifier's rows survive -- confirm whether per-classifier files
    # (or a single accumulated list) were intended.
    for cls_name, cls in classifiers.items():
        history_rows = []
        for step, point in enumerate(cls_stats[cls_name]['accuracy_history']):
            history_rows.append([cls_name, step, point[0], point[1]])
        csr.to_csv(history_rows, 'online_learning_accuracy.csv')