def clustering(clust, filenames, saved=False):
    """Return clustering ``(stats, clusters)``, loading or computing them.

    With ``saved`` true, both results are read back with ``pd.read_csv`` from
    ``filenames['stats']`` and ``filenames['clusters']``.  Otherwise the data
    is fetched with ``dp.getDataForClustering``, the leading 20% of instances
    is skipped, the remainder is passed to ``clusterData``, and both results
    are written to those same two paths before being returned.

    Args:
        clust: Clustering settings mapping; ``clust['n_samples']`` is
            overwritten with the total number of instances when computing.
        filenames: Mapping with at least the ``'stats'`` and ``'clusters'``
            paths; also handed to ``dp.getDataForClustering``.
        saved: When true, load stored results instead of clustering.

    Returns:
        Tuple ``(stats, clusters)``.
    """
    # NOTE(review): this file defines clustering() twice back to back; the
    # later definition replaces this one when the module is loaded.
    if saved:
        # NOTE(review): clusters is written below with to_csv() defaults, so
        # if it is a pandas object its index is probably stored as an extra
        # unnamed column that shows up here -- confirm callers expect that.
        stats = pd.read_csv(filenames['stats'])
        clusters = pd.read_csv(filenames['clusters'])
        return stats, clusters

    data, results = dp.getDataForClustering(filenames, clust)

    # TODO divide data into training and testing datasets
    n_samples = len(data)
    clust['n_samples'] = n_samples
    # Each print call takes one pre-formatted string so the output is the
    # same under the Python 2 print statement and the Python 3 function.
    print('total instances: %s' % n_samples)

    # The first 20% of the instances is held back; only the rest is clustered.
    holdout_fraction = 0.2
    testing_num = int(n_samples * holdout_fraction)

    # Trim the per-instance result columns in place to the clustered range so
    # they stay aligned with the data slice handed to clusterData.
    for column in ('quest_id', 'time_row'):
        results[column] = results[column][testing_num:n_samples]
    # Double space kept on purpose: it matches the original output.
    print('testing instances:  %s' % testing_num)

    print('Started clustering...')
    clusters, stats = clusterData(
        data[testing_num:n_samples], clust, results, False)

    print('Saving the clustering results...')
    csr.to_csv1(stats, filenames['stats'])
    clusters.to_csv(filenames['clusters'])
    return stats, clusters
def clustering(clust, filenames, saved=False):
    """Load stored clustering results, or cluster the data and store them.

    When ``saved`` is true, ``stats`` and ``clusters`` are read with
    ``pd.read_csv`` from ``filenames['stats']`` and ``filenames['clusters']``.
    Otherwise the data comes from ``dp.getDataForClustering``, the first 20%
    of instances is skipped, the remaining instances go to ``clusterData``,
    and the two results are saved to those same paths and returned.

    Args:
        clust: Clustering settings mapping; when computing,
            ``clust['n_samples']`` is set to the total number of instances.
        filenames: Mapping providing at least the ``'stats'`` and
            ``'clusters'`` paths; also passed to ``dp.getDataForClustering``.
        saved: When true, skip clustering and load the stored results.

    Returns:
        Tuple ``(stats, clusters)``.
    """
    # NOTE(review): this file defines clustering() twice back to back; this
    # later definition is the one that takes effect.
    if saved:
        # NOTE(review): clusters is saved below with to_csv() defaults, so if
        # it is a pandas object its index is probably written as an extra
        # unnamed column that reappears here -- confirm callers expect that.
        stats = pd.read_csv(filenames['stats'])
        clusters = pd.read_csv(filenames['clusters'])
        return stats, clusters

    data, results = dp.getDataForClustering(filenames, clust)

    # TODO divide data into training and testing datasets
    total = len(data)
    clust['n_samples'] = total
    # Single pre-formatted string per print call: identical output under the
    # Python 2 print statement and the Python 3 print function.
    print('total instances: %s' % total)

    # Hold back the leading 20% of instances; cluster only what follows.
    holdout_fraction = 0.2
    testing_num = int(total * holdout_fraction)

    # Keep the per-instance result columns aligned with the clustered slice
    # (results is modified in place).
    results['quest_id'] = results['quest_id'][testing_num:total]
    results['time_row'] = results['time_row'][testing_num:total]
    # Double space kept on purpose: it matches the original output.
    print('testing instances:  %s' % testing_num)

    print('Started clustering...')
    clustered_data = data[testing_num:total]
    clusters, stats = clusterData(clustered_data, clust, results, False)

    print('Saving the clustering results...')
    csr.to_csv1(stats, filenames['stats'])
    clusters.to_csv(filenames['clusters'])
    return stats, clusters