Esempio n. 1
0
def clustering(clust, filenames, saved=False, testing_fraction=0.2):
    """Return ``(stats, clusters)``, reloading them from disk or computing them.

    With ``saved`` true, both results are read back with ``pd.read_csv`` from
    ``filenames['stats']`` and ``filenames['clusters']`` and nothing else is
    done.  Otherwise the data is fetched through ``dp.getDataForClustering``,
    the leading ``testing_fraction`` of the rows is held out, the remaining
    rows are passed to ``clusterData``, and both results are written to the
    same two paths before being returned.

    Args:
        clust: Mutable mapping of clustering settings.  Its ``'n_samples'``
            entry is overwritten with the total number of rows loaded
            (held-out rows included).  Not used when ``saved`` is true.
        filenames: Mapping that provides the ``'stats'`` and ``'clusters'``
            paths; it is also handed to ``dp.getDataForClustering``.
        saved: When true, reload previously written results instead of
            clustering again.
        testing_fraction: Share of the leading rows excluded from clustering.
            The default reproduces the previously hard-coded 20 % split.

    Returns:
        A ``(stats, clusters)`` tuple.

    Side effects (only when ``saved`` is false): prints progress messages,
    mutates ``clust``, and writes the two result files.
    """
    if saved:
        # NOTE(review): if `clusters` is a pandas DataFrame, `to_csv` below
        # also writes the index, so the reloaded frame may carry an extra
        # unnamed column compared with a fresh run -- confirm with callers.
        stats = pd.read_csv(filenames['stats'])
        clusters = pd.read_csv(filenames['clusters'])
        return stats, clusters

    data, results = dp.getDataForClustering(filenames, clust)

    # TODO: divide data into training and testing datasets
    n_samples = len(data)
    # NOTE(review): this records the full row count although clusterData only
    # receives the rows after the held-out prefix -- confirm that is expected.
    clust['n_samples'] = n_samples

    # Single-argument print calls produce the same text whether they are
    # parsed as Python 2 print statements or as Python 3 function calls.
    print('total instances: %s' % n_samples)

    testing_num = int(n_samples * testing_fraction)

    # Drop the held-out prefix from the per-row result columns so they stay
    # aligned with the data slice that is clustered below.
    results['quest_id'] = results['quest_id'][testing_num:n_samples]
    results['time_row'] = results['time_row'][testing_num:n_samples]
    print('testing instances:  %s' % testing_num)

    print('Started clustering...')
    clusters, stats = clusterData(data[testing_num:n_samples],
                                  clust, results, False)

    print('Saving the clustering results...')
    csr.to_csv1(stats, filenames['stats'])
    clusters.to_csv(filenames['clusters'])
    return stats, clusters
Esempio n. 2
0
def clustering(clust, filenames, saved=False, testing_fraction=0.2):
    """Load cached clustering results, or cluster the data and cache them.

    If ``saved`` is true the function only reads ``filenames['stats']`` and
    ``filenames['clusters']`` with ``pd.read_csv``.  Otherwise it loads the
    data via ``dp.getDataForClustering``, skips the first
    ``testing_fraction`` of the rows, runs ``clusterData`` on the rest, and
    stores both outputs at those two paths.

    Args:
        clust: Mutable mapping of clustering settings; ``'n_samples'`` is set
            to the total number of loaded rows (including the skipped ones).
            Untouched when ``saved`` is true.
        filenames: Mapping with ``'stats'`` and ``'clusters'`` paths; also
            forwarded to ``dp.getDataForClustering``.
        saved: Reload earlier results instead of recomputing them.
        testing_fraction: Fraction of leading rows left out of clustering.
            Defaults to the 0.2 that used to be hard-coded.

    Returns:
        The tuple ``(stats, clusters)``.

    When ``saved`` is false the call also prints progress lines, mutates
    ``clust`` and writes the two output files.
    """
    if saved:
        # NOTE(review): assuming `clusters` is a pandas DataFrame, the
        # `to_csv` call below stores the index too, so this reload may return
        # one more (unnamed) column than a fresh run -- verify against callers.
        stats = pd.read_csv(filenames['stats'])
        clusters = pd.read_csv(filenames['clusters'])
        return stats, clusters

    data, results = dp.getDataForClustering(filenames, clust)

    # TODO: divide data into training and testing datasets
    total = len(data)
    # NOTE(review): 'n_samples' holds the full row count even though only the
    # rows after the held-out prefix reach clusterData -- confirm intended.
    clust['n_samples'] = total

    # print(...) with exactly one argument behaves the same under Python 2
    # (statement with a parenthesised expression) and Python 3 (function).
    print('total instances: %s' % total)

    testing_num = int(total * testing_fraction)

    # Keep the result columns aligned with the slice of data being clustered.
    for column in ('quest_id', 'time_row'):
        results[column] = results[column][testing_num:total]
    print('testing instances:  %s' % testing_num)

    print('Started clustering...')
    clusters, stats = clusterData(data[testing_num:total], clust, results, False)

    print('Saving the clustering results...')
    csr.to_csv1(stats, filenames['stats'])
    clusters.to_csv(filenames['clusters'])
    return stats, clusters