Exemple #1
0
def clusteringA(clustMeta, dir_c, filenames):
   """Cluster the question data, match clusters back to the raw data,
   and write nearest-neighbour predictions.

   Parameters
   ----------
   clustMeta : clustering configuration, passed straight to ``clustering``
   dir_c : str
       Output directory prefix; ``pca.csv`` is read from it and
       ``predictions.csv`` is written under it.
   filenames : dict
       Must provide the keys ``'input'``, ``'out'``, ``'stats'`` and
       ``'clusters'`` (paths to CSV files).
   """
   # NOTE: prints use the single-argument call form, which behaves
   # identically under Python 2 (parenthesised expression) and Python 3.
   stats, dfn = clustering(clustMeta, filenames)
   # match clusters to data
   print('Opening a file with the data on the questions')
   df = pd.read_csv(filenames['input'])
   print('Matching the data on the questions with the clusters')
   matchClusters(dir_c, df, stats, dfn, filenames['out'])

   # TODO prediction using clusters n
   dfpca = pd.read_csv(dir_c + 'pca.csv', header=None)
   test = dfpca[0:50]  # first 50 PCA rows serve as the query set
   print(len(test))
   n_neighbors = 3
   dfstats = pd.read_csv(filenames['stats'])
   df = pd.read_csv(filenames['clusters'])
   # fit the neighbour index on the 2-D cluster coordinates
   neigh = NearestNeighbors(n_neighbors=n_neighbors)
   neigh.fit(dfstats[['x', 'y']])
   closest = neigh.kneighbors(test)  # TODO: dimension mismatching —
   # the index was fitted on 2 features but `test` carries all PCA columns
   data = cp.calcAccuracy(dfstats, closest, df, n_neighbors, test)
   csr.to_csv(data, dir_c + 'predictions.csv')
   for datum in data:
      print(datum)
Exemple #2
0
def clusteringA(clustMeta, dir_c, filenames):
    """Cluster the question data, match clusters back to the raw data,
    and write nearest-neighbour predictions.

    Parameters
    ----------
    clustMeta : clustering configuration, passed straight to ``clustering``
    dir_c : str
        Output directory prefix; ``pca.csv`` is read from it and
        ``predictions.csv`` is written under it.
    filenames : dict
        Must provide the keys ``'input'``, ``'out'``, ``'stats'`` and
        ``'clusters'`` (paths to CSV files).
    """
    # NOTE: prints use the single-argument call form, which behaves
    # identically under Python 2 (parenthesised expression) and Python 3.
    stats, dfn = clustering(clustMeta, filenames)
    # match clusters to data
    print('Opening a file with the data on the questions')
    df = pd.read_csv(filenames['input'])
    print('Matching the data on the questions with the clusters')
    matchClusters(dir_c, df, stats, dfn, filenames['out'])

    # TODO prediction using clusters n
    dfpca = pd.read_csv(dir_c + 'pca.csv', header=None)
    test = dfpca[0:50]  # first 50 PCA rows serve as the query set
    print(len(test))
    n_neighbors = 3
    dfstats = pd.read_csv(filenames['stats'])
    df = pd.read_csv(filenames['clusters'])
    # fit the neighbour index on the 2-D cluster coordinates
    neigh = NearestNeighbors(n_neighbors=n_neighbors)
    neigh.fit(dfstats[['x', 'y']])
    closest = neigh.kneighbors(test)  # TODO: dimension mismatching —
    # the index was fitted on 2 features but `test` carries all PCA columns
    data = cp.calcAccuracy(dfstats, closest, df, n_neighbors, test)
    csr.to_csv(data, dir_c + 'predictions.csv')
    for datum in data:
        print(datum)
Exemple #3
0
def runMinibatch(minibatch, cls_stats, classifiers, all_classes, losses1, losses2, x1, x2):
    """Train every classifier on each mini-batch and dump the accuracy history.

    Parameters
    ----------
    minibatch : iterable of (df_small, y_small)
        Mini-batches of features and labels; labels are cast to ``"int0"``.
    cls_stats : dict
        Per-classifier statistics; ``'another_time'`` and
        ``'accuracy_history'`` entries are read/updated here.
    classifiers : dict
        Maps classifier name to an estimator with a ``fit`` method.
        The name ``'DBN'`` gets special treatment (normalised data,
        freshly constructed network).
    all_classes, losses1, losses2, x1, x2 :
        Currently unused; kept for interface compatibility with callers
        (they fed an earlier partial_fit/loss-tracking experiment).

    Side effects: writes ``online_learning_accuracy.csv`` via ``csr.to_csv``.
    """
    for df_small, y_small in minibatch:
        tick = time.time()
        # TODO calculate features for df_small
        X_train, X_test, y_train, y_test = train_test_split(
            df_small, y_small.astype("int0"), test_size=0.20, random_state=0)
        data = dict(
            x_train=X_train,
            x_test=X_test,
            y_train=y_train,
            y_test=y_test,
        )
        for cls_name, cls in classifiers.items():
            cls_stats[cls_name]['another_time'] += time.time() - tick
            tick = time.time()
            if cls_name == 'DBN':
                # DBN needs normalised inputs.  NOTE(review): this rebinds
                # `data`, so classifiers processed after 'DBN' in this batch
                # train on the normalised copy — confirm that is intended.
                data = dataNormalise(data)
                clf = DBN([data['x_train'].shape[1], 300, 2],
                          learn_rates=0.3, learn_rate_decays=0.9,
                          epochs=10, verbose=1)
                clf.fit(data['x_train'], data['y_train'])
            else:
                # re-fit the estimator on the current mini-batch split
                clf = classifiers[cls_name].fit(data['x_train'], data['y_train'])
            # accumulate per-classifier timing/accuracy statistics
            accStats(tick, clf, cls_stats, cls_name, data)

    for cls_name, cls in classifiers.items():
        # flatten the accuracy history into [name, step, acc, t] rows
        # (`step` replaces the original loop variable `iter`, which
        # shadowed the builtin)
        stats = [[cls_name, step, point[0], point[1]]
                 for step, point in enumerate(cls_stats[cls_name]['accuracy_history'])]
        # NOTE(review): each classifier overwrites the same file, so only
        # the last classifier's history survives — confirm whether the
        # rows should instead be accumulated and written once.
        csr.to_csv(stats, 'online_learning_accuracy.csv')