def ChiTreeFlux(dataFlux,dDataFlux,modelFlux):
    '''
    Finds the model that minimizes chi^2 distance for each object in flux space
    using a ball_tree search with seuclidean metric (weighting each dimension by 
    the variance in that flux)
    
    Inputs:
            dataFlux = observed fluxes, array of size (#objects,#filters)
            dDataFlux = flux uncertainties, array of size (#objects,#filters)
            modelFlux = fluxes of models, array of size (#models,#filters)
            
    Output:
            NumPy array of size (#objects,3)
            Columns [index of model with min chi^2, scale factor, chi^2 value]
    '''
    results = np.array([]).reshape(0,3)
    for i in range(len(dataFlux)):
        scales = fit_tools.Scale(modelFlux,dataFlux[i],dDataFlux[i])
        scaledModelFlux = (modelFlux.transpose() * scales.transpose()).transpose()
        tree = nn(n_neighbors=1,algorithm='ball_tree',metric='seuclidean',metric_params={'V':dDataFlux[i]**2})
        tree.fit(scaledModelFlux)
        # reshape the single object to a 2-D (1, #filters) query array
        query = tree.kneighbors(dataFlux[i].reshape(1, -1), 1)
        n, chi2 = query[1][0][0], query[0][0][0]**2.
        s = scales[int(n)]
        results = np.r_[results,[[n,s,chi2]]]
    return results
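A minimal usage sketch for ChiTreeFlux (the synthetic fluxes below and the availability of numpy, NearestNeighbors as nn, and fit_tools.Scale are assumptions for illustration):

import numpy as np
n_objects, n_models, n_filters = 5, 100, 4
dataFlux = np.random.rand(n_objects, n_filters) + 0.1   # made-up observed fluxes
dDataFlux = 0.05 * dataFlux                              # assume 5% uncertainties
modelFlux = np.random.rand(n_models, n_filters) + 0.1    # made-up model fluxes
results = ChiTreeFlux(dataFlux, dDataFlux, modelFlux)
print(results.shape)                                     # expected: (5, 3)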
Example #2
def ChiTree(dataFlux, dDataFlux, modelFlux):
    """
    Finds the model that minimizes chi^2 distance for each object using a ball_tree search
    with seuclidean metric (weighting each dimension by the variance in that color)
    Inputs:
            dataFlux = observed fluxes, array of size (#objects,#filters)
            dDataFlux = flux uncertainties, array of size (#objects,#filters)
            modelFlux = fluxes of models, array of size (#models,#filters)
            
    Output:
            NumPy array of size (#objects,3)
            Columns [index of model with min chi^2, scale factor, chi^2 value]
    """
    modelColors = modelFlux[:, 1:] / modelFlux[:, :-1]
    dataColors = dataFlux[:, 1:] / dataFlux[:, :-1]
    dDataColors = np.sqrt(
        (1.0 / dataFlux[:, :-1]) ** 2 * (dDataFlux[:, 1:]) ** 2
        + (dataFlux[:, 1:] / dataFlux[:, :-1] ** 2) ** 2 * (dDataFlux[:, :-1]) ** 2
    )
    results = np.array([]).reshape(0, 3)
    for i in range(len(dataFlux)):
        tree = nn(n_neighbors=1, algorithm="ball_tree", metric="seuclidean", metric_params={"V": dDataColors[i] ** 2})
        tree.fit(modelColors)
        query = tree.kneighbors(dataColors[i].reshape(1, -1), 1)
        n, chi2 = query[1][0][0], query[0][0][0] ** 2.0
        s = Scale(modelFlux[n], dataFlux[i], dDataFlux[i])
        results = np.r_[results, [[n, s, chi2]]]
    return results
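The dDataColors expression above is first-order error propagation for the flux ratio c = F2/F1; a quick self-contained numerical check (the toy values are made up):

import numpy as np
F1, F2, dF1, dF2 = 2.0, 3.0, 0.1, 0.2
dColor = np.sqrt((dF2 / F1) ** 2 + (F2 * dF1 / F1 ** 2) ** 2)
print(dColor)  # 0.125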
Example #3
def ChiTree(dataFlux, dDataFlux, modelFlux):
    '''
    Finds the model that minimizes chi^2 distance for each object using a ball_tree search
    with seuclidean metric (weighting each dimension by the variance in that color)
    Inputs:
            dataFlux = observed fluxes, array of size (#objects,#filters)
            dDataFlux = flux uncertainties, array of size (#objects,#filters)
            modelFlux = fluxes of models, array of size (#models,#filters)
            
    Output:
            NumPy array of size (#objects,3)
            Columns [index of model with min chi^2, scale factor, chi^2 value]
    '''
    modelColors = modelFlux[:, 1:] / modelFlux[:, :-1]
    dataColors = dataFlux[:, 1:] / dataFlux[:, :-1]
    dDataColors = np.sqrt( (1./dataFlux[:,:-1])**2 * (dDataFlux[:,1:])**2 \
                + (dataFlux[:,1:]/dataFlux[:,:-1]**2)**2 * (dDataFlux[:,:-1])**2)
    results = np.array([]).reshape(0, 3)
    for i in range(len(dataFlux)):
        tree = nn(n_neighbors=1,
                  algorithm='ball_tree',
                  metric='seuclidean',
                  metric_params={'V': dDataColors[i]**2})
        tree.fit(modelColors)
        query = tree.kneighbors(dataColors[i].reshape(1, -1), 1)
        n, chi2 = query[1][0][0], query[0][0][0]**2.
        s = Scale(modelFlux[n], dataFlux[i], dDataFlux[i])
        results = np.r_[results, [[n, s, chi2]]]
    return results
Example #4
def train_model(data, k=K):
    start = time.time()
    model = nn()
    model.set_params(n_neighbors=k, algorithm='auto', metric='cosine')
    model.fit(data)
    end = time.time()
    print("training took {}".format(end - start))
    return model
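A hedged usage sketch for train_model (it assumes K, the time module, and the data matrix are defined in the original script; the toy matrix below is made up):

import numpy as np
data = np.random.rand(200, 16)
model = train_model(data, k=5)
# query the fitted cosine-distance index for the first three rows
distances, indices = model.kneighbors(data[:3], n_neighbors=5)
print(indices.shape)  # expected: (3, 5)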
Example #5
def fuse_densities_direct(vec_A, vec_B, m, k, n_particles):
    # based on Hanebeck paper

    w = np.zeros(n_particles)
    nbrs_a = nn(n_neighbors=m).fit(vec_B)
    nbrs_b = nn(n_neighbors=k + 1).fit(
        vec_B)  # +1 because self is also counted
    dists_a, inds_a = nbrs_a.kneighbors(vec_A)  # m closest in B for A
    dists_b, inds_b = nbrs_b.kneighbors(vec_B)  # k closest in B for itself

    for i in range(n_particles):
        K_b = 0
        for j in inds_a[i]:
            farthest = vec_B[inds_b[j, -1]]
            # V = 8.0 * np.pi**2 * norm(farthest - vec_B[j, 2:])**5 / 15.0
            # f_b = k / V
            # tau = (V / k)**(1.0/5.0) / np.sqrt(2*np.pi)
            V = 8.0 * np.pi**2 * norm(farthest - vec_B[j])**5 / 15.0
            f_b = k / V
            tau = (V / k)**(1.0 / 5.0) / np.sqrt(2 * np.pi)
            K_b += f_b * np.exp(-0.5 * np.dot(vec_A[i] - vec_B[j],
                                              vec_A[i] - vec_B[j]) / tau**2)
        w[i] = K_b
    w /= np.sum(w)

    # n = 4
    # bins = np.linspace(np.amin(vec_B[:, n]), np.amax(vec_B[:, n]), 20)
    # plt.hist(vec_B[:, n], bins, alpha=0.5)
    # bins = np.linspace(np.amin(vec_A[:, n]), np.amax(vec_A[:, n]), 20)
    # plt.hist(vec_A[:, n], bins, alpha=0.5)#, weights=w)

    # plt.scatter(vec_A[:, 4], vec_A[:, 3], c='blue', alpha=0.2)
    # plt.scatter(vec_B[:, 4], vec_B[:, 3], c='red', alpha=0.2)

    vec_res = np.sum(w[:, None] * vec_A[:], axis=0)
    # plt.plot(vec_res[n-2], 100, 'o')
    # plt.show()
    # plt.scatter(vec_res[2], vec_res[1], c='green')
    # plt.show()
    m_res = vec_res[:2]
    Res = np.array([[vec_res[2], vec_res[3]], [vec_res[3], vec_res[4]]])
    Res = np.dot(Res, Res)

    return m_res, Res
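A hypothetical usage sketch for fuse_densities_direct (it assumes numpy, NearestNeighbors as nn, and norm, e.g. numpy.linalg.norm, are imported in the source module; the particle layout [mx, my, s11, s12, s22] is inferred from how vec_res is unpacked above):

import numpy as np
rng = np.random.default_rng(0)
n_particles, m, k = 100, 10, 5
vec_A = rng.normal(loc=0.0, scale=1.0, size=(n_particles, 5))  # made-up particle cloud A
vec_B = rng.normal(loc=0.5, scale=1.0, size=(n_particles, 5))  # made-up particle cloud B
m_res, Res = fuse_densities_direct(vec_A, vec_B, m, k, n_particles)
print(m_res.shape, Res.shape)  # expected: (2,) (2, 2)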
Example #6
    def compute_within_class_matrix_whitening(self):

        for x in np.unique(self.train.labels):
            who_cl = np.where(self.train.labels == x)[0]
            self.data_intra = self.train.mat[:, who_cl]
            knn = nn().fit(self.data_intra.transpose())
            self.dintra, self.indintra = knn.kneighbors(
                self.data_intra.transpose())
            self.dintra = self.dintra.transpose()
            self.indintra = self.indintra.transpose()
            # MATLAB-style "A(K,:) = []" row deletion; use np.delete in NumPy
            self.dintra = np.delete(self.dintra, self.K, axis=0)
            self.indintra = np.delete(self.indintra, self.K, axis=0)
            self.valIn[:, who_cl] = self.dintra[1, :]
            self.indIn[:, who_cl] = who_cl[self.indintra[1, :]]
Example #7
def _knn(t, min_freq, save=False):
    if save:
        nbrs = nn()
        parameters = {
            'n_neighbors': [1],
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
        }
        clf = GridSearchCV(nbrs, parameters, cv=5)
        clf.fit(records, labels)
        save_classifier(clf.best_estimator_, t, 'knn', min_freq)
        return ('knn', clf.best_estimator_)
    else:
        clf = load_classifier(t, 'knn', min_freq)
        return ('knn', clf)
Example #8
def simulate(X, num_iters=10, n_neighbors=10, **merpCfg):
    # normalize input data
    X = normalization(X)

    # the merp instance
    rp = Merp(merpCfg)

    # split train and test set
    X_train, X_test = train_test_split(X,
                                       test_size=0.05,
                                       train_size=0.95,
                                       random_state=23)

    # generate ground truth
    true_neigh = nn(n_neighbors=n_neighbors)
    true_neigh.fit(X_train)

    recall = []
    for i in range(num_iters):
        rp.regenerate_omega()
        X_train_rp = rp.transform(X_train)
        X_test_rp = rp.transform(X_test)

        # generate predictions
        rp_neigh = nn(n_neighbors=n_neighbors)
        rp_neigh.fit(X_train_rp)

        # query and calculate recall rate
        true_neighbors = true_neigh.kneighbors(X_test, return_distance=False)
        pred_neighbors = rp_neigh.kneighbors(X_test_rp, return_distance=False)
        curr_recall = np.asarray([
            np.intersect1d(true_neighbors[i], pred_neighbors[i]).size
            for i in range(X_test.shape[0])
        ]) / n_neighbors
        recall.append(curr_recall.mean())
    return recall
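The recall computed above is the fraction of true neighbour indices that the projected search recovers; a self-contained toy illustration (the index values are made up):

import numpy as np
true_neighbors = np.array([[0, 1, 2], [3, 4, 5]])
pred_neighbors = np.array([[0, 2, 7], [3, 9, 5]])
n_neighbors = 3
recall = np.asarray([
    np.intersect1d(true_neighbors[i], pred_neighbors[i]).size
    for i in range(true_neighbors.shape[0])
]) / n_neighbors
print(recall.mean())  # 2 of 3 neighbours recovered per query -> 0.666...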
Example #9
    def compute_bet_class_cluster_dist(self):

        for x in np.unique(self.train.labels):

            who_cl = np.where(self.train.labels == x)[0]
            who_notcl = np.where((self.train.labels != x))[0]
            self.data_intra = self.Wtr[:, who_cl]
            self.data_extra = self.Wtr[:, who_notcl]

            knn = nn().fit(self.data_extra.transpose())
            self.dextra, self.indextra = knn.kneighbors(
                self.data_intra.transpose())
            self.dextra = self.dextra.transpose()
            self.indextra = self.indextra.transpose()
            self.indEx[:, who_cl] = who_notcl[self.indextra[1, :]]
            self.valEx[:, who_cl] = self.dextra[1, :]
Example #10
def SpectralClustering(data, num_clusters=2, affinity='rbf', gamma=1.0, num_neighbors=1):
    if(affinity == 'rbf'):
        sim_matrix = rbf(data,data,gamma)
    elif(affinity == 'knn'):
        nearest_neighbor = nn(n_neighbors=num_neighbors)
        nearest_neighbor.fit(data)
        sim_matrix = nearest_neighbor.kneighbors_graph(data, mode='connectivity').toarray()
        
    deg_matrix = np.diag(np.sum(sim_matrix, axis=1))
    laplace_matrix = deg_matrix - sim_matrix
    asym_laplace_matrix = np.dot(np.linalg.inv(deg_matrix),laplace_matrix) 
    eig_values,eig_vectors = np.linalg.eig(asym_laplace_matrix)
    idx = np.real(eig_values).argsort()[:num_clusters]
    eig_vectors = np.real(eig_vectors[:,idx])
    rows_norm = np.linalg.norm(eig_vectors, axis=1)
    normalized_eig_vectors = (eig_vectors.T / rows_norm).T
    centroids,clusters,error,iter_num = Kmeans(normalized_eig_vectors, num_clusters=num_clusters)
    return normalized_eig_vectors,centroids,clusters
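A minimal, self-contained sketch of the 'knn' affinity branch above (the toy points are made up; the rbf and Kmeans helpers from the original module are not needed here):

import numpy as np
from sklearn.neighbors import NearestNeighbors as nn
data = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0]])
nbrs = nn(n_neighbors=2).fit(data)
sim_matrix = nbrs.kneighbors_graph(data, mode='connectivity').toarray()
deg_matrix = np.diag(np.sum(sim_matrix, axis=1))
laplace_matrix = deg_matrix - sim_matrix  # unnormalized graph Laplacian
print(laplace_matrix)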
Example #11
def genPlaylist():
    #read in dataframes
    df = pd.read_csv("spotify_data/cleaned_df.csv")
    df.drop("Unnamed: 0", axis=1, inplace=True)
    sdf = pd.read_csv("spotify_data/scaled_song_data.csv")
    sdf.drop("Unnamed: 0", axis=1, inplace=True)
    #read in knn model from joblib
    # nghbr = joblib.load("spotify_data/knn_jblib.pkl")
    #read user inputted song + selected features
    feats = request.form['features']
    #clean feature array from json
    feats = feats.strip('[').strip(']').split(',')
    feats = [e.strip('"') for e in feats]
    id = feats[-1]
    feats = feats[:-1]
    # get audio features from scaled csv
    audioVals = sdf[sdf['id'] == id][feats]
    #make custom training data for each features
    custom_df = sdf[feats]
    #build specific knn using user selected features
    neighbor = nn(n_neighbors=10,
                  algorithm='kd_tree',
                  metric='euclidean',
                  n_jobs=-1)
    neighbor.fit(custom_df)
    distances, indices = neighbor.kneighbors(X=audioVals, n_neighbors=10)
    #store the generated playlist in a dataframe
    rdf = pd.DataFrame(columns=df.columns)
    for i in indices[0]:
        rdf = rdf.append(df.iloc[i], ignore_index=True)
    rdf['distance'] = pd.Series(distances[0].round(3), index=rdf.index)
    rdf.drop(['explicit', 'release_date', 'duration_ms'], axis=1, inplace=True)
    #store as JSON object
    rdf = rdf.to_json()

    return rdf
Example #12
def get_recommendations(data, target):
    print('------------------ Processing ------------------')
    knn_model = nn(n_neighbors=10, metric='cosine')
    knn_model.fit(data)
    k_neighbors = knn_model.kneighbors(np.asarray(target).reshape(1, -1))
    return k_neighbors[1][0]
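A hedged usage sketch for get_recommendations (it assumes numpy and NearestNeighbors as nn are imported in the source module; the rating matrix below is made up):

import numpy as np
data = np.random.rand(50, 8)   # 50 items, 8 made-up features
target = data[0]               # query item
top10 = get_recommendations(data, target)
print(top10)                   # indices of the 10 nearest items by cosine distance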
Example #13
def fuse_densities_direct_sym(vec_A, vec_B, m, k, n_particles):
    # based on Hanebeck paper

    w = np.zeros(n_particles * 2)
    nbrs_ba = nn(n_neighbors=m).fit(vec_B)
    nbrs_bb = nn(n_neighbors=k + 1).fit(
        vec_B)  # +1 because self is also counted
    dists_ba, inds_ba = nbrs_ba.kneighbors(vec_A)  # m closest in B for A
    dists_bb, inds_bb = nbrs_bb.kneighbors(vec_B)  # k closest in B for itself
    nbrs_ab = nn(n_neighbors=m).fit(vec_A)
    nbrs_aa = nn(n_neighbors=k + 1).fit(
        vec_A)  # +1 because self is also counted
    dists_ab, inds_ab = nbrs_ab.kneighbors(vec_B)  # m closest in A for B
    dists_aa, inds_aa = nbrs_aa.kneighbors(vec_A)  # k closest in A for itself

    for i in range(n_particles):
        farthest = vec_A[inds_aa[i, -1]]
        V = 8.0 * np.pi**2 * norm(farthest - vec_A[i])**5 / 15.0
        f_aa = k / V
        farthest = vec_B[inds_ba[i, k - 1]]
        V = 8.0 * np.pi**2 * norm(farthest - vec_A[i])**5 / 15.0
        f_ba = k / V
        if f_aa > f_ba:
            K_b = 0
            for j in inds_ba[i]:
                farthest = vec_B[inds_bb[j, -1]]
                # V = 8.0 * np.pi**2 * norm(farthest - vec_B[j, 2:])**5 / 15.0
                # f_b = k / V
                # tau = (V / k)**(1.0/5.0) / np.sqrt(2*np.pi)
                # 5-ball volume, consistent with fuse_densities_direct above
                V = 8.0 * np.pi**2 * norm(farthest - vec_B[j])**5 / 15.0
                f_b = k / V
                tau = (V / k)**(1.0 / 5.0) / np.sqrt(2 * np.pi)
                K_b += f_b * np.exp(-0.5 * np.dot(
                    vec_A[i] - vec_B[j], vec_A[i] - vec_B[j]) / tau**2)
            w[i] = K_b

    for i in range(n_particles):
        farthest = vec_B[inds_bb[i, -1]]
        V = 8.0 * np.pi**2 * norm(farthest - vec_B[i])**5 / 15.0
        f_bb = k / V
        farthest = vec_A[inds_ab[i, k - 1]]
        V = 8.0 * np.pi**2 * norm(farthest - vec_B[i])**5 / 15.0
        f_ab = k / V
        if f_bb > f_ab:
            K_a = 0
            for j in inds_ab[i]:
                farthest = vec_A[inds_aa[j, -1]]
                # V = 8.0 * np.pi**2 * norm(farthest - vec_A[j, 2:])**5 / 15.0
                # f_a = k / V
                # tau = (V / k)**(1.0/5.0) / np.sqrt(2*np.pi)
                # 5-ball volume, consistent with fuse_densities_direct above
                V = 8.0 * np.pi**2 * norm(farthest - vec_A[j])**5 / 15.0
                f_a = k / V
                tau = (V / k)**(1.0 / 5.0) / np.sqrt(2 * np.pi)
                K_a += f_a * np.exp(-0.5 * np.dot(
                    vec_B[i] - vec_A[j], vec_B[i] - vec_A[j]) / tau**2)
            w[i + n_particles] = K_a
    w /= np.sum(w)

    vec_res = np.sum(w[:n_particles, None] * vec_A, axis=0)
    vec_res += np.sum(w[n_particles:, None] * vec_B, axis=0)
    m_res = vec_res[:2]
    Res = np.array([[vec_res[2], vec_res[3]], [vec_res[3], vec_res[4]]])
    Res = np.dot(Res, Res)

    return m_res, Res
Example #14
def ne_scikitalgo(ne_df):

# def ne_scikitalgo():       
	try:
		uri ='mongodb://*****:*****@ds035315-a0.mongolab.com:35315,ds035315-a1.mongolab.com:35315/newttobackground?replicaSet=rs-ds035315'
		client = MongoClient(uri)
		
		newttobackground = client.newttobackground
		
		logging.info('connected')
	
		ne_cursor = newttobackground.ttoopvalcoll.find({"route":"NEWARK-EDISON"})
		netestData = []
		netrainData = []
		nerealTime = []
		time = []
		
		

		for doc in ne_cursor:
			netrainData.append([float(doc['Zone']),float(doc['Temparature'][0]),float(doc['Temparature'][1]),float(doc['CodedWeather'][0]),float(doc['CodedWeather'][1]),float(doc['CodedDay'])])
			
			nerealTime.append(doc['realTime'])

		for z,t1,t2,w1,w2,c,d in zip(ne_df['Zone'],ne_df['Temparature1'],ne_df['Temparature2'],ne_df['CodedWeather1'],ne_df['CodedWeather2'],ne_df['CodedDay'],ne_df['Date']):
			netestData.append([float(z),float(t1),float(t2),float(w1),float(w2),c])
			time.append(d)

		logging.info("netrainData length %d"%(len(netrainData)))
		logging.info("netestData length %d"%(len(netestData)))

		

		neigh = nn(n_neighbors=5, algorithm='auto', metric='euclidean')
		neigh.fit(netrainData)


		distances = []
		indexs = []
		data = []

		for i in netestData:
			data.append(neigh.kneighbors([i]))

		for i in range(len(data)):
			distances.append(data[i][0])
			indexs.append(data[i][1])


		predicted_ind = []
		predicted_val = []  # we are considering the realTime in this case



		for i in range(len(indexs)):
			predicted_ind.append(indexs[i][0])


		new_predicted_ind = list(chain.from_iterable(predicted_ind))
		
		for k in new_predicted_ind:
			predicted_val.append(nerealTime[k])  # nerealTime is the list where training set realTime values stored



		# separating them into lists of five, one per query's 5 neighbors

		listoffive = []

		for i in range(0,len(predicted_val),5):
			listoffive.append(predicted_val[i:i+5])
		
		prediction = []
		for i in range(len(listoffive)):
			prediction.append(listoffive[i][0])
		
		predictioninmins = []
		for i in prediction:
			predictioninmins.append(float(i)/60.0)

		docCount = newttobackground.ttoresultcoll.find({"route":"NEWARK-EDISON"}).count()
		logging.info('NE -> before adding new results docCount %d'%(docCount))
		
		
		'''for testing purposes I'm closing this for now; I will come back to it'''
		lowleveldelList = [] # for the below 6hrs range
		highleveldelList = [] # for the regular update delete purpose
		newarkedison_time = datetime.datetime.now(pytz.timezone('US/Eastern')) 
		newarkedison_dayname = newarkedison_time.strftime("%A")
		newarkedison_hour = int(newarkedison_time.strftime("%H"))	
		newarkedison_minute = int(newarkedison_time.strftime("%M"))
		newarkedison_second = int(newarkedison_time.strftime("%S"))
		newarkedison_year 	= int(newarkedison_time.strftime("%Y"))
		newarkedison_month	= int(newarkedison_time.strftime("%m"))
		newarkedison_day	= int(newarkedison_time.strftime("%d"))
		presentTime = datetime.datetime(newarkedison_year,newarkedison_month,newarkedison_day,newarkedison_hour,newarkedison_minute,newarkedison_second)
		
		sixhrLimit = presentTime-datetime.timedelta(hours=6)
		logging.info("ne six hours back time %s"%(str(sixhrLimit)))

		highleveldelCursor = newttobackground.ttoresultcoll.find({"route":"NEWARK-EDISON","time" :{ "$gt":presentTime}})
		lowleveldelCursor = newttobackground.ttoresultcoll.find({"route":"NEWARK-EDISON","time" :{ "$lt":sixhrLimit}})
		
		for docid in highleveldelCursor:
			highleveldelList.append(docid['_id'])
		for docid in lowleveldelCursor:
			lowleveldelList.append(docid['_id'])
		combinedDelList = []
		combinedDelList.extend(lowleveldelList)
		combinedDelList.extend(highleveldelList)
		
		logging.info("ne docs before sixhourslimit %d"%(len(lowleveldelList)))
		logging.info("ne regular update doc length %d"%(len(highleveldelList)))	
		
		

		newttobackground.ttoresultcoll.remove({'_id':{"$in":combinedDelList}}) # Dangerous line
			


		
		for i in range(len(time)):
			doc = {
					"route":"NEWARK-EDISON",

					"time":time[i],
					"predictioninsecs":prediction[i],
					"predictioninmins":predictioninmins[i]
							}
			docid = newttobackground.ttoresultcoll.insert_one(doc)
			del doc
		
		docCount = newttobackground.ttoresultcoll.find({"route":"NEWARK-EDISON"}).count()
		logging.info('NE -> after adding new results docCount %d'%(docCount))
		
		return True	
	except Exception as e:
		logging.error("The exception occured in ne_scikit %s,%s"%(e,type(e)))
		return False	
Example #15
    classification_count = 0

    #construction of K-coefficients of each image in the training set
    Kcoeff_train = []
    for i in raw_matrix:
        Kcoeff_train.append(np.dot(i, np.transpose(Eigen_faces[:K])))

    Kcoeff_test = []
    for i in test_matrix:
        Kcoeff_test.append(np.dot(i, np.transpose(Eigen_faces[:K])))

    Kcoeff_test = array(Kcoeff_test)
    Kcoeff_train = array(Kcoeff_train)

    for i in range(Kcoeff_test.shape[0]):
        neigh = nn(n_neighbors=1)
        neigh.fit(Kcoeff_train)
        classification_index = neigh.kneighbors(Kcoeff_test[i].reshape(1,
                                                                       -1))[1]
        if classification_index // 8 == i // 2:
            classification_count += 1

    percentage_of_error = (Kcoeff_test.shape[0] -
                           classification_count) / Kcoeff_test.shape[0]
    print("Classification Error over test sample for K={} coefficients is {}".
          format(K, 100 * percentage_of_error))

    #Step7: Take some random image from test set and compute K coefficients
    random_test_image = np.array(
        Image.open(testdata_path +
                   test_list[np.random.randint(0, len(test_list))])).flatten()
Example #16
print dgts_data.shape
print dgts_data

dgts_lbl = pd.read_csv("abcd_l.csv",index_col=0)
print dgts_lbl.head()
print dgts_lbl.shape
dgts_lbl = np.array(dgts_lbl)
print dgts_lbl.shape
print dgts_lbl

#train a KNN and see how does it perform. Keep 50000 for training and 10000 for validation and 10000 for final test.

gen_k_sets = StratifiedShuffleSplit(dgts_lbl, n_iter=1, test_size=0.20)
mdl = SVC()
mdl = rfc()
dst_mdl = nn(n_neighbors=100)

for train_index, test_index in gen_k_sets:   
    train_data, test_data = dgts_data[train_index], dgts_data[test_index]
    train_class, test_class = dgts_lbl[train_index], dgts_lbl[test_index]
    #test_data= test_data[:1000,]
    #test_class = test_class[:1000]
    #print g
    
    dst_mdl.fit(train_data)
    #print mdl.score(train_data,train_class)
    print train_data.shape
    j = 0
    for i,td in enumerate(test_data):
        td = np.array(td)
        tst_class_act=test_class[i]
Example #17
q = range(0, 101, 10)
perc = np.reshape(np.percentile(dgts_data, q, axis=1), (dgts_data.shape[0], len(q)))
# dgts_data = np.hstack((dgts_data,perc))

dgts_lbl = pd.read_csv("abcd_l.csv", index_col=0)
# print dgts_lbl.head()
print dgts_lbl.shape
dgts_lbl = np.array(dgts_lbl)
print dgts_lbl.shape
# print dgts_lbl

# train a KNN and see how does it perform. Keep 50000 for training and 10000 for validation and 10000 for final test.

gen_k_sets = StratifiedShuffleSplit(dgts_lbl, n_iter=1, test_size=0.15)
mdl = knn(n_neighbors=9)
dst_mdl = nn(n_neighbors=9)

for train_index, test_index in gen_k_sets:
    train_data, test_data = dgts_data[train_index], dgts_data[test_index]
    train_class, test_class = dgts_lbl[train_index], dgts_lbl[test_index]
    train_class = np.reshape(np.ravel(train_class), (train_data.shape[0], 1))
    print train_class.shape

    # test_data= test_data[:1000,]
    # test_class = np.reshape(test_class[:1000],(1000,1))
    print test_class.shape
    print train_data.shape

    mdl.fit(train_data, train_class)
    # print mdl.score(train_data,train_class)
Example #18

def getInput(input):
    # take the input data and normalize it
    for idx, c in enumerate(input):
        input[idx] = mt.cvlct(c)
    while len(input) < maxNum:
        input.append(zero)
    return [input]


def printResult(indices, data):
    # convert the result from normalized form back to readable data
    for idx in indices:
        for c in data[idx]:
            print(mt.rvlct(c) if (c != zero) else "", end="")
        print(" ", end="")
    print("")


with open('Test2.csv', newline='\n') as csvfile:
    data = list(csv.reader(csvfile))

print("Number of words: %d" % len(data))
print("Max length: %d" % maxNum)
normData(data)
while True:
    word = getInput(input=list(input("Enter string: ").lower()))
    nb = nn(n_neighbors=nbNum, algorithm='brute', metric=mt.compare).fit(data)
    distance, indices = nb.kneighbors(word)
    printResult(indices=indices[0], data=data)
Example #19
def main(n, CI, testFile):

    trainFile = 'fitData.csv'
    '''
    trainfile = 'complete.csv'
    testFile = 'incomplete.csv'
    '''

    comp_id, comp_date, comp_data2d = getData(trainFile, CI)
    numbers = nn(n_neighbors=n, algorithm='auto').fit(comp_data2d)

    if CI == True:
        x = '_CI_only'
    else:
        x = ''

    with open(testFile, 'r') as f, open(
        ("pred" + str(n) + x + testFile), 'w', newline=''
    ) as wr1:  #open (("padd" + x + testFile), 'w', newline= '') as wrP:
        inp = csv.reader(f,
                         skipinitialspace=True,
                         delimiter=',',
                         quotechar='|')
        out1 = csv.writer(wr1, delimiter=",", quotechar='|')
        #outP = csv.writer(wrP, delimiter=",", quotechar='|')
        out = [out1]

        fn = next(inp)
        if CI == True:
            d = fn.index('SavingAcnt')
            del fn[d:]
        for o in out:
            o.writerow(fn)

        test = []
        date = []
        id = []
        x = 0

        for row in inp:
            if CI == True:
                del row[d:]
            if x == 0:
                date.append(int(float(row.pop(0))))
                id.append(row.pop(0))
                x = id[len(id) - 1]
                test.append(row)
            elif x == row[1]:
                date.append(int(float(row.pop(0))))
                id.append(row.pop(0))
                test.append(row)
            else:
                output = []
                CusID = id[0]
                missing_m = missingM(date)
                output = impute(date, id, test, numbers, comp_id, comp_data2d,
                                fn, CusID, missing_m)
                #output.append(padding(CusID, missing_m, date, test, fn))
                for i in range(len(output)):
                    for ans in output[i]:
                        out[i].writerow(ans)

                date, id, test = [], [], []
                date.append(int(float(row.pop(0))))
                id.append(row.pop(0))
                test.append(row)
                x = id[len(id) - 1]

        CusID = id[0]
        missing_m = missingM(date)
        output = []
        output = impute(date, id, test, numbers, comp_id, comp_data2d, fn,
                        CusID, missing_m)
        #output.append(padding(CusID, missing_m, date, test, fn))
        for i in range(len(output)):
            for ans in output[i]:
                out[i].writerow(ans)
Example #20
def test_dimension(iter_steps, smallX, targetNeighbors):
    recallT = []
    recallF = []
    recallTU = []
    recallFU = []
    recallTL = []
    recallFL = []
    for k in range(800, 825, 5):
        rpb = Merp([128, 128], k, rand_type='g', target='col', tensor=False)
        X_train, X_test = train_test_split(smallX,
                                           test_size=0.05,
                                           train_size=0.95,
                                           random_state=23)
        true_neigh = nn(n_neighbors=targetNeighbors)
        true_neigh.fit(X_train)
        recall = 0
        recalllist = []
        for i in range(iter_steps):
            rpb.regenerate_omega()
            X_train_rp = rpb.transform(X_train)
            X_test_rp = rpb.transform(X_test)
            # generate predictions
            rp_neigh = nn(n_neighbors=targetNeighbors)
            rp_neigh.fit(X_train_rp)
            # query and calculate recall rate
            true_distances, true_indices = true_neigh.kneighbors(
                X_test, targetNeighbors)
            pred_distances, pred_indices = rp_neigh.kneighbors(
                X_test_rp, targetNeighbors)

            curr_recall = np.asarray([
                np.intersect1d(true_indices[u], pred_indices[u]).size
                for u in range(X_test.shape[0])
            ]) / targetNeighbors
            recall = recall + curr_recall.mean()
            recalllist.append(curr_recall.mean())
        recallF.append(recall / iter_steps)
        recallFU.append(np.percentile(recalllist, 97.5))
        recallFL.append(np.percentile(recalllist, 2.5))

    for k in range(800, 825, 5):
        rpb = Merp([128, 128], k, rand_type='g', target='col', tensor=True)
        X_train, X_test = train_test_split(smallX,
                                           test_size=0.05,
                                           train_size=0.95,
                                           random_state=23)
        true_neigh = nn(n_neighbors=targetNeighbors)
        true_neigh.fit(X_train)
        recall = 0
        recalllist = []
        for i in range(iter_steps):
            rpb.regenerate_omega()
            X_train_rp = rpb.transform(X_train)
            X_test_rp = rpb.transform(X_test)
            # generate predictions
            rp_neigh = nn(n_neighbors=targetNeighbors)
            rp_neigh.fit(X_train_rp)
            # query and calculate recall rate
            true_distances, true_indices = true_neigh.kneighbors(
                X_test, targetNeighbors)
            pred_distances, pred_indices = rp_neigh.kneighbors(
                X_test_rp, targetNeighbors)
            # print('hello')
            # print(len(true_neighbors))
            # print(len(pred_neighbors))
            # print(len(X_test.shape))
            curr_recall = np.asarray([
                np.intersect1d(true_indices[u], pred_indices[u]).size
                for u in range(X_test.shape[0])
            ]) / targetNeighbors
            recall = recall + curr_recall.mean()
            recalllist.append(curr_recall.mean())
        recallT.append(recall / iter_steps)
        recallTU.append(np.percentile(recalllist, 97.5))
        recallTL.append(np.percentile(recalllist, 2.5))
    return [recallF, recallFU, recallFL, recallT, recallTU, recallTL]
Example #21
from sklearn.neighbors import NearestNeighbors as nn
import numpy as np

# Data set
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])

# Query set
Y = np.array([[0, 1], [2, 3]])

nbrs = nn(n_neighbors=2, algorithm='ball_tree').fit(X)
distances, indices = nbrs.kneighbors(Y)

print(indices)
print(distances)
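For reference, with the X and Y arrays above this script should print, up to floating-point formatting:

# indices   -> [[3 4]
#               [5 4]]
# distances -> [[1.         2.        ]
#               [1.41421356 2.        ]]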
Example #22
def bd_scikitalgo(bd_df):
	try:
		uri ='mongodb://*****:*****@ds035315-a0.mongolab.com:35315,ds035315-a1.mongolab.com:35315/newttobackground?replicaSet=rs-ds035315'
		client = MongoClient(uri)
		
		newttobackground = client.newttobackground
		
		logging.info('connected')
	
		bd_cursor = newttobackground.ttoopvalcoll.find({"route":"BROOKLYN-DENVILLE"})
		bdtestData = []
		bdtrainData = []
		bdrealTime = []
		time = []

		
		for doc in bd_cursor:
			bdtrainData.append([float(doc['Zone']),float(doc['Temparature'][0]),float(doc['Temparature'][1]),float(doc['CodedWeather'][0]),float(doc['CodedWeather'][1]),float(doc['CodedDay'])])
			bdrealTime.append(doc['realTime'])
	
		for z,t1,t2,w1,w2,c,d in zip(bd_df['Zone'],bd_df['Temparature1'],bd_df['Temparature2'],bd_df['CodedWeather1'],bd_df['CodedWeather2'],bd_df['CodedDay'],bd_df['Date']):
			bdtestData.append([float(z),float(t1),float(t2),float(w1),float(w2),c])
			time.append(d)
	 	
		logging.info("bdtrainData length %d"%(len(bdtrainData)))
		logging.info("bdtestData length %d"%(len(bdtestData)))

		neigh = nn(n_neighbors=5, algorithm='auto', metric='euclidean')
		neigh.fit(bdtrainData)


		distances = []
		indexs = []
		data = []

		for i in bdtestData:
			data.append(neigh.kneighbors([i]))

		for i in range(len(data)):
			distances.append(data[i][0])
			indexs.append(data[i][1])


		predicted_ind = []
		predicted_val = []  # we are considering the realTime in this case



		for i in range(len(indexs)):
			predicted_ind.append(indexs[i][0])



		new_predicted_ind = list(chain.from_iterable(predicted_ind))
			

		# extracting realTime of predictions

		for k in new_predicted_ind:
			predicted_val.append(bdrealTime[k])  # bdrealTime is the list where training set realTime values stored



		# separating them into lists of five, one per query's 5 neighbors

		listoffive = []

		for i in range(0,len(predicted_val),5):
			listoffive.append(predicted_val[i:i+5])
		
		prediction = []
		for i in range(len(listoffive)):
			prediction.append(listoffive[i][0])
		
		predictioninmins = []
		for i in prediction:
			predictioninmins.append(float(i)/60.0)

		
		
		docCount = newttobackground.ttoresultcoll.find({"route":"BROOKLYN-DENVILLE"}).count()
		logging.info('BD -> before adding new results docCount %d'%(docCount))

		
		lowleveldelList = [] # for the below 6hrs range
		highleveldelList = [] # for the regular update delete purpose
		brooklyndenville_time = datetime.datetime.now(pytz.timezone('US/Eastern'))
			
		brooklyndenville_dayname = brooklyndenville_time.strftime("%A")
		brooklyndenville_hour = int(brooklyndenville_time.strftime("%H"))	
		brooklyndenville_minute = int(brooklyndenville_time.strftime("%M"))
		brooklyndenville_second = int(brooklyndenville_time.strftime("%S"))
		brooklyndenville_year = int(brooklyndenville_time.strftime("%Y"))
		brooklyndenville_month	= int(brooklyndenville_time.strftime("%m"))
		brooklyndenville_day	= int(brooklyndenville_time.strftime("%d")) 
		presentTime = datetime.datetime(brooklyndenville_year,brooklyndenville_month,brooklyndenville_day,brooklyndenville_hour,brooklyndenville_minute,brooklyndenville_second)
				
		sixhrLimit = presentTime-datetime.timedelta(hours=6)
		logging.info("bd six hours back time %s"%(str(sixhrLimit)))

		highleveldelCursor = newttobackground.ttoresultcoll.find({"route":"BROOKLYN-DENVILLE","time" :{ "$gt":presentTime}})
		lowleveldelCursor = newttobackground.ttoresultcoll.find({"route":"BROOKLYN-DENVILLE","time" :{ "$lt":sixhrLimit}})
		
		for docid in highleveldelCursor:
			highleveldelList.append(docid['_id'])
		for docid in lowleveldelCursor:
			lowleveldelList.append(docid['_id'])
		combinedDelList = []
		combinedDelList.extend(lowleveldelList)
		combinedDelList.extend(highleveldelList)
		
		logging.info("bd docs before sixhourslimit %d"%(len(lowleveldelList)))
		logging.info("bd regular update doc length %d"%(len(highleveldelList)))	
		

		
		newttobackground.ttoresultcoll.remove({'_id':{"$in":combinedDelList}}) # Dangerous line
			




		for i in range(len(time)):
			doc = {
					"route":"BROOKLYN-DENVILLE",

					"time":time[i],
					"predictioninsecs":prediction[i],
					"predictioninmins":predictioninmins[i]
							}
			docid = newttobackground.ttoresultcoll.insert_one(doc)
			del doc

		docCount = newttobackground.ttoresultcoll.find({"route":"BROOKLYN-DENVILLE"}).count()
		logging.info('BD -> after adding new results docCount %d'%(docCount))			
			
		return True	
	except Exception as e:
		logging.error("The exception occured in bd_scikit %s,%s"%(e,type(e)))
		return False
Example #23
# for readability of the names - replace any such html with an empty string
df['nazwa'] = [re.sub("<.*?>", "", text) for text in df['nazwa']]

print(df)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

ax.set_xlabel('Ocena przez kadre akademicka')
ax.set_ylabel('Losy absolwentow')
ax.set_zlabel('Oceny parametryczna')

ax.scatter(df['opka'], df['ela'], df['op'])

for index, row in df.iterrows():
    ax.text(row['opka'], row['ela'], row['op'], row['nazwa'])

#plt.show()

X = df.iloc[:, 1:4]  #iloc - integer location
y = df.iloc[:, 0]

nbrs = nn(n_neighbors=2)
model = nbrs.fit(X)

#dist, idx = model.kneighbors(X)

recommendations = model.kneighbors([[100, 100, 100]], 5, return_distance=False)

for idx in recommendations:
    print(y[idx])
Example #24
def sf_scikitalgo(sf_df):
	try:
		uri ='mongodb://*****:*****@ds035315-a0.mongolab.com:35315,ds035315-a1.mongolab.com:35315/newttobackground?replicaSet=rs-ds035315'
		client = MongoClient(uri)
		
		newttobackground = client.newttobackground
		
		logging.info('connected')
	
		sf_cursor = newttobackground.ttoopvalcoll.find({"route":"MOUNTZION RADIOLOGY CENTER-SF GENERAL HOSPITAL"})
		sftestData = []
		sftrainData = []
		sfrealTime = []
		time =[]

		
		for doc in sf_cursor:
			sftrainData.append([float(doc['Zone']),float(doc['Temparature'][0]),float(doc['CodedWeather'][0]),float(doc['CodedDay'])])
			sfrealTime.append(doc['realTime'])

		for z,t1,w1,c,d in zip(sf_df['Zone'],sf_df['Temparature1'],sf_df['CodedWeather1'],sf_df['CodedDay'],sf_df['Date']):
			sftestData.append([float(z),float(t1),float(w1),c])
			time.append(d)	
	
			
					

		logging.info("sftrainData length %d"%(len(sftrainData)))
		logging.info("sftestData length %d"%(len(sftestData)))

		neigh = nn(n_neighbors=5, algorithm='auto', metric='euclidean')
		neigh.fit(sftrainData)


		distances = []
		indexs = []
		data = []

		for i in sftestData:
			data.append(neigh.kneighbors([i]))

		for i in range(len(data)):
			distances.append(data[i][0])
			indexs.append(data[i][1])


		predicted_ind = []
		predicted_val = []  # we are considering the realTime in this case
		


		for i in range(len(indexs)):
			predicted_ind.append(indexs[i][0])


		new_predicted_ind = list(chain.from_iterable(predicted_ind))
		
		
		# extracting realTime of predictions
		for k in new_predicted_ind:
			predicted_val.append(sfrealTime[k])  # sfrealTime is the list where training set realTime values stored

		# separating them into lists of five, one per query's 5 neighbors

		listoffive = []

		for i in range(0,len(predicted_val),5):
			listoffive.append(predicted_val[i:i+5])
		
		prediction = []
		for i in range(len(listoffive)):
			prediction.append(listoffive[i][0])
		
		predictioninmins = []
		for i in prediction:
			predictioninmins.append(float(i)/60.0)	


		
		docCount = newttobackground.ttoresultcoll.find({"route":"MOUNTZION RADIOLOGY CENTER-SF GENERAL HOSPITAL"}).count()	
		logging.info('SF -> before adding new results docCount %d'%(docCount))


		
		lowleveldelList = [] # for the below 6hrs range
		highleveldelList = [] # for the regular update delete purpose

		sanfrancisco_time = datetime.datetime.now(pytz.timezone('US/Pacific'))
		sanfrancisco_dayname = sanfrancisco_time.strftime("%A")
		sanfrancisco_hour = int(sanfrancisco_time.strftime("%H"))	
		sanfrancisco_minute = int(sanfrancisco_time.strftime("%M"))
		sanfrancisco_second = int(sanfrancisco_time.strftime("%S"))
		sanfrancisco_year = int(sanfrancisco_time.strftime("%Y"))
		sanfrancisco_month	=int(sanfrancisco_time.strftime("%m"))
		sanfrancisco_day	= int(sanfrancisco_time.strftime("%d")) 
		presentTime = datetime.datetime(sanfrancisco_year,sanfrancisco_month,sanfrancisco_day,sanfrancisco_hour,sanfrancisco_minute,sanfrancisco_second)
				
		sixhrLimit = presentTime-datetime.timedelta(hours=6)
		logging.info("sf six hours back time %s"%(str(sixhrLimit)))

		highleveldelCursor = newttobackground.ttoresultcoll.find({"route":"MOUNTZION RADIOLOGY CENTER-SF GENERAL HOSPITAL","time" :{ "$gt":presentTime}})
		lowleveldelCursor = newttobackground.ttoresultcoll.find({"route":"MOUNTZION RADIOLOGY CENTER-SF GENERAL HOSPITAL","time" :{ "$lt":sixhrLimit}})
		
		for docid in highleveldelCursor:
			highleveldelList.append(docid['_id'])
		for docid in lowleveldelCursor:
			lowleveldelList.append(docid['_id'])

		logging.info("sf docs before sixhourslimit %d"%(len(lowleveldelList)))
		logging.info("sf regular update doc length %d"%(len(highleveldelList)))	

		combinedDelList = []
		combinedDelList.extend(lowleveldelList)
		combinedDelList.extend(highleveldelList)
		
		newttobackground.ttoresultcoll.remove({'_id':{"$in":combinedDelList}}) # Dangerous line
			

		for i in range(len(time)):
			doc = {
					"route":"MOUNTZION RADIOLOGY CENTER-SF GENERAL HOSPITAL",

					"time":time[i],
					"predictioninsecs":prediction[i],
					"predictioninmins":predictioninmins[i]
							}
			docid = newttobackground.ttoresultcoll.insert_one(doc)
			del doc

		docCount = newttobackground.ttoresultcoll.find({"route":"MOUNTZION RADIOLOGY CENTER-SF GENERAL HOSPITAL"}).count()	
		logging.info('SF -> after adding new results docCount%d'%(docCount))
		
			
		return True		

	except Exception as e:
		logging.error("The exception for sf_scikit %s,%s"(e,type(e)))
		return False
Example #25
from sklearn.neighbors import NearestNeighbors as nn
import numpy as np
from sklearn.externals.joblib import dump
data = np.load('findata.npy')
nbrs = nn(n_neighbors=2).fit(data)
dump(nbrs, 'nbrs.joblib')
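A hypothetical companion snippet for loading the persisted index (on recent scikit-learn versions dump/load come from the standalone joblib package rather than sklearn.externals):

from sklearn.externals.joblib import load  # or: from joblib import load
import numpy as np
nbrs = load('nbrs.joblib')
data = np.load('findata.npy')
distances, indices = nbrs.kneighbors(data[:1])  # query with the first row
print(indices)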