Example #1
import pandas as pd
from sklearn.neighbors import NearestNeighbors
# clustering (Example #2), matchClusters, cp and csr are project-local
# helpers from the surrounding repository

def clusteringA(clustMeta, dir_c, filenames):
    #os.mkdir(dir_c, 0777)

    stats, dfn = clustering(clustMeta, filenames)
    # match clusters to data
    print 'Opening a file with the data on the questions'
    df = pd.read_csv(filenames['input'])
    print 'Matching the data on the questions with the clusters'
    matchClusters(dir_c, df, stats, dfn, filenames['out'])

    # TODO: prediction using clusters
    dfpca = pd.read_csv(dir_c + 'pca.csv', header=None)
    test = dfpca[0:50]
    print len(test)
    n_neighbors = 3
    dfstats = pd.read_csv(filenames['stats'])
    df = pd.read_csv(filenames['clusters'])
    neigh = NearestNeighbors(n_neighbors=n_neighbors)
    neigh.fit(dfstats[['x', 'y']])
    # TODO: dimension mismatch between test and the fitted (x, y) features
    closest = neigh.kneighbors(test)
    data = cp.calcAccuracy(dfstats, closest, df, n_neighbors, test)
    csr.to_csv(data, dir_c + 'predictions.csv')
    for datum in data:
        print datum
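A minimal calling sketch, inferred from the keys clusteringA reads from filenames and the clust fields used in Examples #2 and #4; every concrete path and parameter value below is an assumption:

filenames = {
    'input': 'data/questions.csv',    # question data (hypothetical path)
    'out': 'data/matched.csv',        # matchClusters output (hypothetical)
    'stats': 'data/stats.csv',        # per-cluster stats with x, y columns
    'clusters': 'data/clusters.csv',  # saved cluster assignments
}
clustMeta = {'clustering_type': 'kmeans', 'n_clusters': 10, 'exp': 'exp1'}
clusteringA(clustMeta, 'results/', filenames)  # dir_c must end with '/'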
Example #2
import pandas as pd
# dp (data preparation), csr (CSV helpers) and clusterData (Example #4)
# are project-local

def clustering(clust, filenames, saved=False):
    #mergeTitle(df, filename2)
    if saved:
        stats = pd.read_csv(filenames['stats'])
        clusters = pd.read_csv(filenames['clusters'])
    else:
        data, results = dp.getDataForClustering(filenames, clust)
        #TODO divide data into training and testing datasets
        clust['n_samples'] = len(data)
        print 'total instances:', clust['n_samples']
        testing_num = int(clust['n_samples'] * 0.2)
        #testing_num = 1924500
        results['quest_id'] = results['quest_id'][
            testing_num:clust['n_samples']]
        results['time_row'] = results['time_row'][
            testing_num:clust['n_samples']]
        print 'testing instances: ', str(testing_num)  # 385981
        print 'Started clustering...'
        #clusters, stats = clusterData(data, clust, results, False)
        clusters, stats = clusterData(data[testing_num:clust['n_samples']],
                                      clust, results, False)
        print 'Saving the clustering results...'
        csr.to_csv1(stats, filenames['stats'])
        clusters.to_csv(filenames['clusters'])
    return stats, clusters
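Calling sketch: the first run clusters the data and persists the two CSVs; a later run with saved=True just reloads them:

stats, clusters = clustering(clust, filenames)              # compute and save
stats, clusters = clustering(clust, filenames, saved=True)  # reload from disk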
Example #3
import time
from sklearn.model_selection import train_test_split  # sklearn.cross_validation in older releases
# DBN is presumably nolearn.dbn.DBN; dataNormalise, accStats and csr are
# project-local helpers

def runMinibatch(minibatch, cls_stats, classifiers, all_classes,
                 losses1, losses2, x1, x2):

    for i, (df_small, y_small) in enumerate(minibatch):
        tick = time.time()
        # TODO: calculate features for df_small
        X_train, X_test, y_train, y_test = train_test_split(
            df_small, y_small.astype("int0"), test_size=0.20, random_state=0)
        data = dict(
            x_train=X_train,
            x_test=X_test,
            y_train=y_train,
            y_test=y_test
        )
        for cls_name, cls in classifiers.items():
            cls_stats[cls_name]['another_time'] += time.time() - tick
            tick = time.time()
            # update estimator with examples in the current mini-batch
            # incremental alternative to the batch fit below:
            #cls.partial_fit(data['x_train'], data['y_train'], classes=all_classes)
            if cls_name == 'DBN':
                data = dataNormalise(data)
                clf = DBN([data['x_train'].shape[1], 300, 2],
                          learn_rates=0.3, learn_rate_decays=0.9,
                          epochs=10, verbose=1)
                clf.fit(data['x_train'], data['y_train'])
            else:
                clf = classifiers[cls_name].fit(data['x_train'],
                                                data['y_train'])
            #print ("coefficients")
            #print (cls.coef_)
            #print ("test point")
            #print (data['x_test'])
            #print (data['x_test'].iloc[1].as_matrix(columns=None))
            #print ("dot product x*w")
            #print (cls.coef_ * data['x_test'].iloc[1].as_matrix(columns=None) )
            #print ("dot product1 x*w")
            # cls.coef_ is the vector with weights of coefficients
            #print ("total number of samples for testing: ", data['x_test'].shape[0])
            #a1 = np.dot(cls.coef_ , data['x_test'].iloc[0].as_matrix(columns=None))
            #a2 = np.dot(cls.coef_ , data['x_test'].iloc[1].as_matrix(columns=None))
            #x1.append(data['x_test'].iloc[0].as_matrix(columns=None))
            #x2.append(data['x_test'].iloc[1].as_matrix(columns=None))
            '''
            if cls_name == 'SGD':
                losses1['SGD'].append(a1)
                losses2['SGD'].append(a2)
            elif cls_name == 'Perceptron':
                losses1['Perceptron'].append(a1)
                losses2['Perceptron'].append(a2)
            elif cls_name == 'NB Multinomial':
                losses1['NB'].append(a1)
                losses2['NB'].append(a2)
            elif cls_name == 'Passive-Aggressive':
                losses1['PA'].append(a1)
                losses2['PA'].append(a2)
            '''
            #print (a)
            # accumulate statistics
            #accStats(tick, cls, cls_stats, cls_name, data)
            accStats(tick, clf, cls_stats, cls_name, data)
    #print (losses)
    #csr.to_csv(losses1['SGD'], 'lossesSGDx1.csv')
    #csr.to_csv(losses2['SGD'], 'lossesSGDx2.csv')
    #csr.to_csv(losses1['Perceptron'], 'lossesPerceptronx1.csv')
    #csr.to_csv(losses2['Perceptron'], 'lossesPerceptronx2.csv')
    #csr.to_csv(losses1['NB'], 'lossesNBx1.csv')
    #csr.to_csv(losses2['NB'], 'lossesNBx2.csv')
    #csr.to_csv(losses1['PA'], 'lossesPAx1.csv')
    #csr.to_csv(losses2['PA'], 'lossesPAx2.csv')
    #csr.to_csv(x1, 'x1.csv')
    #csr.to_csv(x1, 'x2.csv')

    # export every classifier's accuracy history into a single CSV
    stats = []
    for cls_name in classifiers:
        for step, point in enumerate(cls_stats[cls_name]['accuracy_history']):
            stats.append([cls_name, step, point[0], point[1]])
    csr.to_csv(stats, 'online_learning_accuracy.csv')
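A setup sketch for driving runMinibatch, based on the classifier names and cls_stats keys the function touches; the choice of estimators, the minibatch source and all values are assumptions (accStats, not shown, may expect further cls_stats keys):

from sklearn.linear_model import SGDClassifier, Perceptron, PassiveAggressiveClassifier

classifiers = {
    'SGD': SGDClassifier(),
    'Perceptron': Perceptron(),
    'Passive-Aggressive': PassiveAggressiveClassifier(),
}
cls_stats = {name: {'another_time': 0.0, 'accuracy_history': []}
             for name in classifiers}
# minibatch: any iterable of (features_dataframe, labels) chunks
runMinibatch(minibatch, cls_stats, classifiers, all_classes=[0, 1],
             losses1={}, losses2={}, x1=[], x2=[])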
Example #4
from sklearn import cluster
from sklearn.cluster import KMeans
from sklearn.neighbors import kneighbors_graph
# plot and csr are project-local helper modules

def clusterData(data, clust, results, to_plot):
    plot_sample_size = 6000
    if clust['clustering_type'] == 'kmeans':
        # TODO: k-means works well even on 2,000,000 questions
        kmeans = KMeans(init='k-means++',
                        n_clusters=clust['n_clusters'],
                        n_init=10)
        kmeans.fit(data)
        clust['centers'] = kmeans.cluster_centers_
        results['cluster_labels'] = kmeans.labels_
        if to_plot:
            plot.PlotData(data, kmeans, plot_sample_size, clust['exp'])

    if clust['clustering_type'] == 'spectral':
        spectral = cluster.SpectralClustering(n_clusters=clust['n_clusters'],
                                              eigen_solver='arpack',
                                              affinity="nearest_neighbors")
        spectral.fit(data)
        results['cluster_labels'] = spectral.labels_
        plot.PlotData(data, spectral, plot_sample_size, clust['exp'])

    if clust['clustering_type'] == 'birch':
        birch = cluster.Birch(n_clusters=clust['n_clusters'])
        birch.fit(data)
        results['cluster_labels'] = birch.labels_
        print 'number of entries clustered', len(results['cluster_labels'])
        plot.PlotData(data, birch, plot_sample_size, clust['exp'])

    if clust['clustering_type'] == 'dbscan':
        dbscan = cluster.DBSCAN(eps=.2)
        dbscan.fit(data)
        results['cluster_labels'] = dbscan.labels_
        plot.PlotData(data, dbscan, plot_sample_size, clust['exp'])

    if clust['clustering_type'] == 'affinity_propagation':
        affinity_propagation = cluster.AffinityPropagation(damping=.9,
                                                           preference=-200)
        affinity_propagation.fit(data)
        results['cluster_labels'] = affinity_propagation.labels_
        plot.PlotData(data, affinity_propagation, plot_sample_size,
                      clust['exp'])

    if clust['clustering_type'] == 'ward':
        # connectivity matrix for structured Ward
        connectivity = kneighbors_graph(data,
                                        n_neighbors=10,
                                        include_self=False)
        # make connectivity symmetric
        connectivity = 0.5 * (connectivity + connectivity.T)
        ward = cluster.AgglomerativeClustering(n_clusters=clust['n_clusters'],
                                               linkage='ward',
                                               connectivity=connectivity)
        ward.fit(data)
        results['cluster_labels'] = ward.labels_
        plot.PlotData(data, ward, plot_sample_size, clust['exp'])

    if clust['clustering_type'] == 'average_linkage':
        # connectivity matrix for structured agglomerative clustering
        connectivity = kneighbors_graph(data,
                                        n_neighbors=10,
                                        include_self=False)
        # make connectivity symmetric
        connectivity = 0.5 * (connectivity + connectivity.T)
        average_linkage = cluster.AgglomerativeClustering(
            linkage="average",
            affinity="cityblock",
            n_clusters=clust['n_clusters'],
            connectivity=connectivity)
        average_linkage.fit(data)
        results['cluster_labels'] = average_linkage.labels_
        plot.PlotData(data, average_linkage, plot_sample_size, clust['exp'])

    # build the labelled dataframe and the per-cluster statistics
    df = csr.clustDfFromRes(results)
    stats = csr.clusterResults(df, clust)
    return df, stats
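Calling sketch; the config values are illustrative only (note that to_plot is honoured by the kmeans branch, while the other branches always plot):

clust = {'clustering_type': 'kmeans', 'n_clusters': 8, 'exp': 'exp1'}
results = {}
clusters, stats = clusterData(data, clust, results, to_plot=False)
# clusters: presumably one row per instance with its cluster label
# stats: per-cluster summary from csr.clusterResults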