import time

import pandas as pd
from sklearn import cluster
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors, kneighbors_graph
from sklearn.model_selection import train_test_split  # sklearn.cross_validation on older releases
from nolearn.dbn import DBN

# cp, csr, dp and plot are project-local helper modules (cluster prediction,
# CSV I/O, data preparation and plotting); they are assumed to be importable
# from the package.


def clusteringA(clustMeta, dir_c, filenames):
    #os.mkdir(dir_c, 0777)
    stats, dfn = clustering(clustMeta, filenames)

    # Match clusters to the question data.
    print 'Opening a file with the data on the questions'
    df = pd.read_csv(filenames['input'])
    print 'Matching the data on the questions with the clusters'
    matchClusters(dir_c, df, stats, dfn, filenames['out'])

    # TODO: prediction using clusters.
    dfpca = pd.read_csv(dir_c + 'pca.csv', header=None)
    test = dfpca[0:50]
    print len(test)

    n_neighbors = 3
    dfstats = pd.read_csv(filenames['stats'])
    df = pd.read_csv(filenames['clusters'])
    neigh = NearestNeighbors(n_neighbors=n_neighbors)
    neigh.fit(dfstats[['x', 'y']])
    closest = neigh.kneighbors(test)  # TODO: dimension mismatching
    data = cp.calcAccuracy(dfstats, closest, df, n_neighbors, test)
    csr.to_csv(data, dir_c + 'predictions.csv')
    for datum in data:
        print datum
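# A minimal usage sketch for the pipeline above. The dict keys ('input',
# 'out', 'stats', 'clusters') follow the reads in clusteringA() and
# clustering(); the concrete paths and the clustMeta values are hypothetical
# placeholders.
def exampleRunClustering():
    filenames = {
        'input': 'data/questions.csv',       # hypothetical path
        'out': 'data/matched_clusters.csv',  # hypothetical path
        'stats': 'data/cluster_stats.csv',   # hypothetical path
        'clusters': 'data/clusters.csv',     # hypothetical path
    }
    clustMeta = {'clustering_type': 'kmeans', 'n_clusters': 8,
                 'exp': 'kmeans-demo'}
    clusteringA(clustMeta, 'results/', filenames)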
def clustering(clust, filenames, saved=False):
    if saved:
        stats = pd.read_csv(filenames['stats'])
        clusters = pd.read_csv(filenames['clusters'])
    else:
        data, results = dp.getDataForClustering(filenames, clust)

        # Hold out the first 20% of instances for testing and cluster the rest.
        clust['n_samples'] = len(data)
        print 'total instances:', clust['n_samples']
        testing_num = int(clust['n_samples'] * 0.2)
        results['quest_id'] = results['quest_id'][testing_num:clust['n_samples']]
        results['time_row'] = results['time_row'][testing_num:clust['n_samples']]
        print 'testing instances:', testing_num

        print 'Started clustering...'
        clusters, stats = clusterData(data[testing_num:clust['n_samples']],
                                      clust, results, False)
        print 'Saving the clustering results...'
        csr.to_csv1(stats, filenames['stats'])
        clusters.to_csv(filenames['clusters'])
    return stats, clusters
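# Sanity-check sketch of the hold-out convention used in clustering(): the
# first 20% of rows are reserved for testing and only the remaining 80% are
# clustered. Purely illustrative, not part of the pipeline.
def _splitDemo():
    data = list(range(10))
    testing_num = int(len(data) * 0.2)                     # 2
    test, train = data[:testing_num], data[testing_num:]
    print test   # [0, 1]
    print train  # [2, 3, 4, 5, 6, 7, 8, 9]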
def runMinibatch(minibatch, cls_stats, classifiers, all_classes,
                 losses1, losses2, x1, x2):
    # losses1, losses2, x1 and x2 belong to a currently disabled loss-tracing
    # experiment (per-sample dot products of cls.coef_ with two probe points,
    # dumped via csr.to_csv); they are kept in the signature for compatibility.
    for i, (df_small, y_small) in enumerate(minibatch):
        tick = time.time()
        # TODO: calculate features for df_small.
        X_train, X_test, y_train, y_test = train_test_split(
            df_small, y_small.astype("int0"), test_size=0.20, random_state=0)
        data = dict(x_train=X_train, x_test=X_test,
                    y_train=y_train, y_test=y_test)

        for cls_name, cls in classifiers.items():
            cls_stats[cls_name]['another_time'] += time.time() - tick
            tick = time.time()
            # Update the estimator with the examples in the current mini-batch.
            if cls_name == 'DBN':
                data = dataNormalise(data)
                clf = DBN([data['x_train'].shape[1], 300, 2],
                          learn_rates=0.3, learn_rate_decays=0.9,
                          epochs=10, verbose=1)
                clf.fit(data['x_train'], data['y_train'])
            else:
                clf = classifiers[cls_name].fit(data['x_train'],
                                                data['y_train'])
                # Incremental alternative:
                #clf = classifiers[cls_name].partial_fit(
                #    data['x_train'], data['y_train'], classes=all_classes)
            # Accumulate statistics for this classifier and mini-batch.
            accStats(tick, clf, cls_stats, cls_name, data)

    # Dump the accuracy history of every classifier into a single CSV; each
    # row holds the classifier name, the iteration index and both components
    # of the history point.
    stats = []
    for cls_name, cls in classifiers.items():
        for iter, point in enumerate(cls_stats[cls_name]['accuracy_history']):
            stats.append([cls_name, iter, point[0], point[1]])
    csr.to_csv(stats, 'online_learning_accuracy.csv')
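# A plausible construction of the `classifiers` dict consumed by
# runMinibatch(). The names mirror those referenced in the disabled
# loss-tracing experiment ('SGD', 'Perceptron', 'NB Multinomial',
# 'Passive-Aggressive'); the hyperparameters are illustrative assumptions,
# not the project's actual settings. 'DBN' needs no entry because
# runMinibatch() instantiates it per mini-batch.
def buildClassifiers():
    from sklearn.linear_model import (SGDClassifier, Perceptron,
                                      PassiveAggressiveClassifier)
    from sklearn.naive_bayes import MultinomialNB
    return {
        'SGD': SGDClassifier(),
        'Perceptron': Perceptron(),
        'NB Multinomial': MultinomialNB(alpha=0.01),
        'Passive-Aggressive': PassiveAggressiveClassifier(),
    }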
def clusterData(data, clust, results, to_plot):
    plot_sample_size = 6000

    if clust['clustering_type'] == 'kmeans':
        # k-means copes well even with ~2,000,000 questions.
        kmeans = KMeans(init='k-means++', n_clusters=clust['n_clusters'],
                        n_init=10)
        kmeans.fit(data)
        clust['centers'] = kmeans.cluster_centers_
        results['cluster_labels'] = kmeans.labels_
        if to_plot:
            plot.PlotData(data, kmeans, plot_sample_size, clust['exp'])

    if clust['clustering_type'] == 'spectral':
        spectral = cluster.SpectralClustering(n_clusters=clust['n_clusters'],
                                              eigen_solver='arpack',
                                              affinity="nearest_neighbors")
        spectral.fit(data)
        results['cluster_labels'] = spectral.labels_
        if to_plot:
            plot.PlotData(data, spectral, plot_sample_size, clust['exp'])

    if clust['clustering_type'] == 'birch':
        birch = cluster.Birch(n_clusters=clust['n_clusters'])
        birch.fit(data)
        results['cluster_labels'] = birch.labels_
        print 'number of entries clustered', len(results['cluster_labels'])
        if to_plot:
            plot.PlotData(data, birch, plot_sample_size, clust['exp'])

    if clust['clustering_type'] == 'dbscan':
        dbscan = cluster.DBSCAN(eps=.2)
        dbscan.fit(data)
        results['cluster_labels'] = dbscan.labels_
        if to_plot:
            plot.PlotData(data, dbscan, plot_sample_size, clust['exp'])

    if clust['clustering_type'] == 'affinity_propagation':
        affinity_propagation = cluster.AffinityPropagation(damping=.9,
                                                           preference=-200)
        affinity_propagation.fit(data)
        results['cluster_labels'] = affinity_propagation.labels_
        if to_plot:
            plot.PlotData(data, affinity_propagation, plot_sample_size,
                          clust['exp'])

    if clust['clustering_type'] == 'ward':
        # Connectivity matrix for structured Ward, made symmetric.
        connectivity = kneighbors_graph(data, n_neighbors=10,
                                        include_self=False)
        connectivity = 0.5 * (connectivity + connectivity.T)
        ward = cluster.AgglomerativeClustering(n_clusters=clust['n_clusters'],
                                               linkage='ward',
                                               connectivity=connectivity)
        ward.fit(data)
        results['cluster_labels'] = ward.labels_
        if to_plot:
            plot.PlotData(data, ward, plot_sample_size, clust['exp'])

    if clust['clustering_type'] == 'average_linkage':
        # Connectivity matrix for structured agglomerative clustering,
        # made symmetric.
        connectivity = kneighbors_graph(data, n_neighbors=10,
                                        include_self=False)
        connectivity = 0.5 * (connectivity + connectivity.T)
        average_linkage = cluster.AgglomerativeClustering(
            linkage="average", affinity="cityblock",
            n_clusters=clust['n_clusters'], connectivity=connectivity)
        average_linkage.fit(data)
        results['cluster_labels'] = average_linkage.labels_
        if to_plot:
            plot.PlotData(data, average_linkage, plot_sample_size,
                          clust['exp'])

    df = csr.clustDfFromRes(results)
    stats = csr.clusterResults(df, clust)
    return df, stats
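# Minimal sketch of calling clusterData() directly, under the assumption
# (taken from clustering() above) that `results` carries quest_id and
# time_row and that `clust` names the algorithm, cluster count and experiment
# label. The toy data and values are illustrative only.
def _clusterDataDemo():
    import numpy as np
    data = np.random.rand(100, 2)
    clust = {'clustering_type': 'kmeans', 'n_clusters': 3, 'exp': 'demo'}
    results = {'quest_id': range(100), 'time_row': range(100)}
    df, stats = clusterData(data, clust, results, to_plot=False)
    print stats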