def ChiTreeFlux(dataFlux, dDataFlux, modelFlux):
    '''
    Finds the model that minimizes chi^2 distance for each object in flux space
    using a ball_tree search with seuclidean metric (weighting each dimension by
    the variance in that flux)

    Inputs:
        dataFlux  = observed fluxes, array of size (#objects, #filters)
        dDataFlux = flux uncertainties, array of size (#objects, #filters)
        modelFlux = fluxes of models, array of size (#models, #filters)

    Output:
        NumPy array of size (#objects, 3)
        Columns: [index of model with min chi^2, scale factor, chi^2 value]
    '''
    results = np.array([]).reshape(0, 3)
    for i in range(len(dataFlux)):
        scales = fit_tools.Scale(modelFlux, dataFlux[i], dDataFlux[i])
        scaledModelFlux = (modelFlux.transpose() * scales.transpose()).transpose()
        tree = nn(n_neighbors=1, algorithm='ball_tree', metric='seuclidean',
                  metric_params={'V': dDataFlux[i]**2})
        tree.fit(scaledModelFlux)
        query = tree.kneighbors(dataFlux[i].reshape(1, -1), 1)
        n, chi2 = query[1][0][0], query[0][0][0]**2.
        s = scales[int(n)]
        results = np.r_[results, [[n, s, chi2]]]
    return(results)
def ChiTree(dataFlux, dDataFlux, modelFlux):
    '''
    Finds the model that minimizes chi^2 distance for each object in color space
    using a ball_tree search with seuclidean metric (weighting each dimension by
    the variance in that color)

    Inputs:
        dataFlux  = observed fluxes, array of size (#objects, #filters)
        dDataFlux = flux uncertainties, array of size (#objects, #filters)
        modelFlux = fluxes of models, array of size (#models, #filters)

    Output:
        NumPy array of size (#objects, 3)
        Columns: [index of model with min chi^2, scale factor, chi^2 value]
    '''
    # Adjacent-filter flux ratios ("colors") for models and data, with the
    # color uncertainties propagated from the flux uncertainties.
    modelColors = modelFlux[:, 1:] / modelFlux[:, :-1]
    dataColors = dataFlux[:, 1:] / dataFlux[:, :-1]
    dDataColors = np.sqrt(
        (1. / dataFlux[:, :-1])**2 * dDataFlux[:, 1:]**2
        + (dataFlux[:, 1:] / dataFlux[:, :-1]**2)**2 * dDataFlux[:, :-1]**2)
    results = np.array([]).reshape(0, 3)
    for i in range(len(dataFlux)):
        tree = nn(n_neighbors=1, algorithm='ball_tree', metric='seuclidean',
                  metric_params={'V': dDataColors[i]**2})
        tree.fit(modelColors)
        query = tree.kneighbors(dataColors[i].reshape(1, -1), 1)
        n, chi2 = query[1][0][0], query[0][0][0]**2.
        s = Scale(modelFlux[n], dataFlux[i], dDataFlux[i])
        results = np.r_[results, [[n, s, chi2]]]
    return(results)
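# Both ChiTree functions above call a Scale helper (fit_tools.Scale / Scale) that
# is not defined in these snippets. Below is a minimal sketch of what such a
# helper might look like, assuming it returns the chi^2-minimizing scale factor
# s = sum(d*m/sigma^2) / sum(m^2/sigma^2) for each model; the name, signature,
# and behavior here are assumptions for illustration, not the original code.
import numpy as np

def Scale(modelFlux, dataFlux, dDataFlux):
    """Hypothetical chi^2-minimizing scale factor(s) for one or more models."""
    modelFlux = np.atleast_2d(modelFlux)        # (#models, #filters)
    weights = 1.0 / np.asarray(dDataFlux)**2    # 1/sigma^2 per filter
    num = np.sum(weights * dataFlux * modelFlux, axis=1)
    den = np.sum(weights * modelFlux**2, axis=1)
    scales = num / den
    return scales if scales.size > 1 else scales[0]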
def train_model(data, k=K):
    start = time.time()
    model = nn()
    model.set_params(n_neighbors=k, algorithm='auto', metric='cosine')
    model.fit(data)
    end = time.time()
    print("training took {}".format(end - start))
    return model
def fuse_densities_direct(vec_A, vec_B, m, k, n_particles):
    # based on Hanebeck paper
    w = np.zeros(n_particles)
    nbrs_a = nn(n_neighbors=m).fit(vec_B)
    nbrs_b = nn(n_neighbors=k + 1).fit(vec_B)  # +1 because self is also counted
    dists_a, inds_a = nbrs_a.kneighbors(vec_A)  # m closest in B for A
    dists_b, inds_b = nbrs_b.kneighbors(vec_B)  # k closest in B for itself
    for i in range(n_particles):
        K_b = 0
        for j in inds_a[i]:
            farthest = vec_B[inds_b[j, -1]]
            # V = 8.0 * np.pi**2 * norm(farthest - vec_B[j, 2:])**5 / 15.0
            # f_b = k / V
            # tau = (V / k)**(1.0/5.0) / np.sqrt(2*np.pi)
            V = 8.0 * np.pi**2 * norm(farthest - vec_B[j])**5 / 15.0
            f_b = k / V
            tau = (V / k)**(1.0 / 5.0) / np.sqrt(2 * np.pi)
            K_b += f_b * np.exp(-0.5 * np.dot(vec_A[i] - vec_B[j],
                                              vec_A[i] - vec_B[j]) / tau**2)
        w[i] = K_b
    w /= np.sum(w)
    # n = 4
    # bins = np.linspace(np.amin(vec_B[:, n]), np.amax(vec_B[:, n]), 20)
    # plt.hist(vec_B[:, n], bins, alpha=0.5)
    # bins = np.linspace(np.amin(vec_A[:, n]), np.amax(vec_A[:, n]), 20)
    # plt.hist(vec_A[:, n], bins, alpha=0.5)  # , weights=w)
    # plt.scatter(vec_A[:, 4], vec_A[:, 3], c='blue', alpha=0.2)
    # plt.scatter(vec_B[:, 4], vec_B[:, 3], c='red', alpha=0.2)
    vec_res = np.sum(w[:, None] * vec_A[:], axis=0)
    # plt.plot(vec_res[n-2], 100, 'o')
    # plt.show()
    # plt.scatter(vec_res[2], vec_res[1], c='green')
    # plt.show()
    m_res = vec_res[:2]
    Res = np.array([[vec_res[2], vec_res[3]],
                    [vec_res[3], vec_res[4]]])
    Res = np.dot(Res, Res)
    return m_res, Res
def compute_within_class_matrix_whitening(self):
    for x in np.unique(self.train.labels):
        who_cl = np.where(self.train.labels == x)[0]
        self.data_intra = self.train.mat[:, who_cl]
        knn = nn().fit(self.data_intra.transpose())
        self.dintra, self.indintra = knn.kneighbors(self.data_intra.transpose())
        self.dintra = self.dintra.transpose()
        self.indintra = self.indintra.transpose()
        # NumPy equivalent of MATLAB-style row deletion "dintra(K,:) = []".
        self.dintra = np.delete(self.dintra, self.K, axis=0)
        self.indintra = np.delete(self.indintra, self.K, axis=0)
        self.valIn[:, who_cl] = self.dintra[1, :]
        self.indIn[:, who_cl] = who_cl[self.indintra[1, :]]
def _knn(t, min_freq, save=False):
    if save:
        nbrs = nn()
        parameters = {
            'n_neighbors': [1],
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
        }
        clf = GridSearchCV(nbrs, parameters, cv=5)
        clf.fit(records, labels)
        save_classifier(clf.best_estimator_, t, 'knn', min_freq)
        return ('knn', clf.best_estimator_)
    else:
        clf = load_classifier(t, 'knn', min_freq)
        return ('knn', clf)
def simulate(X, num_iters=10, n_neighbors=10, **merpCfg):
    # normalize input data
    X = normalization(X)
    # the merp instance
    rp = Merp(merpCfg)
    # split train and test set
    X_train, X_test = train_test_split(X, test_size=0.05, train_size=0.95,
                                       random_state=23)
    # generate ground truth
    true_neigh = nn(n_neighbors=10)
    true_neigh.fit(X_train)
    recall = []
    for i in range(num_iters):
        rp.regenerate_omega()
        X_train_rp = rp.transform(X_train)
        X_test_rp = rp.transform(X_test)
        # generate predictions
        rp_neigh = nn(n_neighbors=n_neighbors)
        rp_neigh.fit(X_train_rp)
        # query and calculate recall rate (indices only, no distances)
        true_neighbors = true_neigh.kneighbors(X_test, return_distance=False)
        pred_neighbors = rp_neigh.kneighbors(X_test_rp, return_distance=False)
        curr_recall = np.asarray([
            np.intersect1d(true_neighbors[i], pred_neighbors[i]).size
            for i in range(X_test.shape[0])
        ]) / n_neighbors
        recall.append(curr_recall.mean())
    return recall
def compute_bet_class_cluster_dist(self):
    for x in np.unique(self.train.labels):
        who_cl = np.where(self.train.labels == x)[0]
        who_notcl = np.where((self.train.labels != x))[0]
        self.data_intra = self.Wtr[:, who_cl]
        self.data_extra = self.Wtr[:, who_notcl]
        knn = nn().fit(self.data_extra.transpose())
        self.dextra, self.indextra = knn.kneighbors(self.data_intra.transpose())
        self.dextra = self.dextra.transpose()
        self.indextra = self.indextra.transpose()
        self.indEx[:, who_cl] = who_notcl[self.indextra[1, :]]
        self.valEx[:, who_cl] = self.dextra[1, :]
def SpectralClustering(data, num_clusters=2, affinity='rbf', gamma=1.0, num_neighbors=1):
    if affinity == 'rbf':
        sim_matrix = rbf(data, data, gamma)
    elif affinity == 'knn':
        nearest_neighbor = nn(n_neighbors=num_neighbors)
        nearest_neighbor.fit(data)
        sim_matrix = nearest_neighbor.kneighbors_graph(data, mode='connectivity').toarray()
    deg_matrix = np.diag(np.sum(sim_matrix, axis=1))
    laplace_matrix = deg_matrix - sim_matrix
    asym_laplace_matrix = np.dot(np.linalg.inv(deg_matrix), laplace_matrix)
    eig_values, eig_vectors = np.linalg.eig(asym_laplace_matrix)
    idx = np.real(eig_values).argsort()[:num_clusters]
    eig_vectors = np.real(eig_vectors[:, idx])
    rows_norm = np.linalg.norm(eig_vectors, axis=1)
    normalized_eig_vectors = (eig_vectors.T / rows_norm).T
    centroids, clusters, error, iter_num = Kmeans(normalized_eig_vectors,
                                                  num_clusters=num_clusters)
    return normalized_eig_vectors, centroids, clusters
def genPlaylist():
    # read in dataframes
    df = pd.read_csv("spotify_data/cleaned_df.csv")
    df.drop("Unnamed: 0", axis=1, inplace=True)
    sdf = pd.read_csv("spotify_data/scaled_song_data.csv")
    sdf.drop("Unnamed: 0", axis=1, inplace=True)
    # read in knn model from joblib
    # nghbr = joblib.load("spotify_data/knn_jblib.pkl")
    # read user inputted song + selected features
    feats = request.form['features']
    # clean feature array from json
    feats = feats.strip('[').strip(']').split(',')
    feats = [e.strip('"') for e in feats]
    id = feats[-1]
    feats = feats[:-1]
    # get audio features from scaled csv
    audioVals = sdf[sdf['id'] == id][feats]
    # make custom training data for the selected features
    custom_df = sdf[feats]
    # build specific knn using user selected features
    neighbor = nn(n_neighbors=10, algorithm='kd_tree', metric='euclidean', n_jobs=-1)
    neighbor.fit(custom_df)
    distances, indicies = neighbor.kneighbors(X=audioVals, n_neighbors=10)
    # store the generated playlist in a dataframe
    rdf = pd.DataFrame(columns=df.columns)
    for i in indicies[0]:
        rdf = rdf.append(df.iloc[i], ignore_index=True)
    rdf['distance'] = pd.Series(distances[0].round(3), index=rdf.index)
    rdf.drop(['explicit', 'release_date', 'duration_ms'], axis=1, inplace=True)
    # store as JSON object
    rdf = rdf.to_json()
    return rdf
def get_recommendations(data, target):
    print('------------------ Processing ------------------')
    knn_model = nn(n_neighbors=10, metric='cosine')
    knn_model.fit(data)
    k_neighbors = knn_model.kneighbors(np.asarray(target).reshape(1, -1))
    return k_neighbors[1][0]
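# A minimal usage sketch for get_recommendations above, with an illustrative
# random feature matrix (not data from the original project): each row is an
# item, and the function returns the indices of the 10 items closest to the
# target vector under cosine distance.
import numpy as np

items = np.random.RandomState(0).rand(50, 8)   # toy item/feature matrix
target = items[0]                              # recommend items similar to item 0
print(get_recommendations(items, target))      # indices of the 10 nearest items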
def fuse_densities_direct_sym(vec_A, vec_B, m, k, n_particles):
    # based on Hanebeck paper
    w = np.zeros(n_particles * 2)
    nbrs_ba = nn(n_neighbors=m).fit(vec_B)
    nbrs_bb = nn(n_neighbors=k + 1).fit(vec_B)  # +1 because self is also counted
    dists_ba, inds_ba = nbrs_ba.kneighbors(vec_A)  # m closest in B for A
    dists_bb, inds_bb = nbrs_bb.kneighbors(vec_B)  # k closest in B for itself
    nbrs_ab = nn(n_neighbors=m).fit(vec_A)
    nbrs_aa = nn(n_neighbors=k + 1).fit(vec_A)  # +1 because self is also counted
    dists_ab, inds_ab = nbrs_ab.kneighbors(vec_B)  # m closest in A for B
    dists_aa, inds_aa = nbrs_aa.kneighbors(vec_A)  # k closest in A for itself
    for i in range(n_particles):
        farthest = vec_A[inds_aa[i, -1]]
        V = 8.0 * np.pi**2 * norm(farthest - vec_A[i])**5 / 15.0
        f_aa = k / V
        farthest = vec_B[inds_ba[i, k - 1]]
        V = 8.0 * np.pi**2 * norm(farthest - vec_A[i])**5 / 15.0
        f_ba = k / V
        if f_aa > f_ba:
            K_b = 0
            for j in inds_ba[i]:
                farthest = vec_B[inds_bb[j, -1]]
                # V = 8.0 * np.pi**2 * norm(farthest - vec_B[j, 2:])**5 / 15.0
                # f_b = k / V
                # tau = (V / k)**(1.0/5.0) / np.sqrt(2*np.pi)
                V = 8.0 * np.pi * norm(farthest - vec_B[j])**3 / 15.0
                f_b = k / V
                tau = (V / k)**(1.0 / 5.0) / np.sqrt(2 * np.pi)
                K_b += f_b * np.exp(-0.5 * np.dot(vec_A[i] - vec_B[j],
                                                  vec_A[i] - vec_B[j]) / tau**2)
            w[i] = K_b
    for i in range(n_particles):
        farthest = vec_B[inds_bb[i, -1]]
        V = 8.0 * np.pi**2 * norm(farthest - vec_B[i])**5 / 15.0
        f_bb = k / V
        farthest = vec_A[inds_ab[i, k - 1]]
        V = 8.0 * np.pi**2 * norm(farthest - vec_B[i])**5 / 15.0
        f_ab = k / V
        if f_bb > f_ab:
            K_a = 0
            for j in inds_ab[i]:
                farthest = vec_A[inds_aa[j, -1]]
                # V = 8.0 * np.pi**2 * norm(farthest - vec_A[j, 2:])**5 / 15.0
                # f_a = k / V
                # tau = (V / k)**(1.0/5.0) / np.sqrt(2*np.pi)
                V = 8.0 * np.pi * norm(farthest - vec_A[j])**5 / 15.0
                f_a = k / V
                tau = (V / k)**(1.0 / 5.0) / np.sqrt(2 * np.pi)
                K_a += f_a * np.exp(-0.5 * np.dot(vec_B[i] - vec_A[j],
                                                  vec_B[i] - vec_A[j]) / tau**2)
            w[i + n_particles] = K_a
    w /= np.sum(w)
    vec_res = np.sum(w[:n_particles, None] * vec_A, axis=0)
    vec_res += np.sum(w[n_particles:, None] * vec_B, axis=0)
    m_res = vec_res[:2]
    Res = np.array([[vec_res[2], vec_res[3]],
                    [vec_res[3], vec_res[4]]])
    Res = np.dot(Res, Res)
    return m_res, Res
def ne_scikitalgo(ne_df):
    # def ne_scikitalgo():
    try:
        uri = 'mongodb://*****:*****@ds035315-a0.mongolab.com:35315,ds035315-a1.mongolab.com:35315/newttobackground?replicaSet=rs-ds035315'
        client = MongoClient(uri)
        newttobackground = client.newttobackground
        logging.info('connected')
        ne_cursor = newttobackground.ttoopvalcoll.find({"route": "NEWARK-EDISON"})
        netestData = []
        netrainData = []
        nerealTime = []
        time = []
        for doc in ne_cursor:
            netrainData.append([float(doc['Zone']), float(doc['Temparature'][0]), float(doc['Temparature'][1]),
                                float(doc['CodedWeather'][0]), float(doc['CodedWeather'][1]), float(doc['CodedDay'])])
            nerealTime.append(doc['realTime'])
        for z, t1, t2, w1, w2, c, d in zip(ne_df['Zone'], ne_df['Temparature1'], ne_df['Temparature2'],
                                           ne_df['CodedWeather1'], ne_df['CodedWeather2'], ne_df['CodedDay'], ne_df['Date']):
            netestData.append([float(z), float(t1), float(t2), float(w1), float(w2), c])
            time.append(d)
        logging.info("netrainData length %d" % (len(netrainData)))
        logging.info("netestData length %d" % (len(netestData)))
        neigh = nn(n_neighbors=5, algorithm='auto', metric='euclidean')
        neigh.fit(netrainData)
        distances = []
        indexs = []
        data = []
        for i in netestData:
            data.append(neigh.kneighbors([i]))
        for i in range(len(data)):
            distances.append(data[i][0])
            indexs.append(data[i][1])
        predicted_ind = []
        predicted_val = []  # we are considering the realTime in this case
        for i in range(len(indexs)):
            predicted_ind.append(indexs[i][0])
        new_predicted_ind = list(chain.from_iterable(predicted_ind))
        for k in new_predicted_ind:
            predicted_val.append(nerealTime[k])  # nerealTime is the list where training set realTime values are stored
        # separating them as lists of five for the individual 5 neighbors
        listoffive = []
        for i in range(0, len(predicted_val), 5):
            listoffive.append(predicted_val[i:i + 5])
        prediction = []
        for i in range(len(listoffive)):
            prediction.append(listoffive[i][0])
        predictioninmins = []
        for i in prediction:
            predictioninmins.append(float(i) / 60.0)
        docCount = newttobackground.ttoresultcoll.find({"route": "NEWARK-EDISON"}).count()
        logging.info('NE -> before adding new results docCount %d' % (docCount))
        '''for testing purposes I'm closing it, I will come back again'''
        lowleveldelList = []   # for the below 6hrs range
        highleveldelList = []  # for the regular update delete purpose
        newarkedison_time = datetime.datetime.now(pytz.timezone('US/Eastern'))
        newarkedison_dayname = newarkedison_time.strftime("%A")
        newarkedison_hour = int(newarkedison_time.strftime("%H"))
        newarkedison_minute = int(newarkedison_time.strftime("%M"))
        newarkedison_second = int(newarkedison_time.strftime("%S"))
        newarkedison_year = int(newarkedison_time.strftime("%Y"))
        newarkedison_month = int(newarkedison_time.strftime("%m"))
        newarkedison_day = int(newarkedison_time.strftime("%d"))
        presentTime = datetime.datetime(newarkedison_year, newarkedison_month, newarkedison_day,
                                        newarkedison_hour, newarkedison_minute, newarkedison_second)
        sixhrLimit = presentTime - datetime.timedelta(hours=6)
        logging.info("ne six hours back time %s" % (str(sixhrLimit)))
        highleveldelCursor = newttobackground.ttoresultcoll.find({"route": "NEWARK-EDISON", "time": {"$gt": presentTime}})
        lowleveldelCursor = newttobackground.ttoresultcoll.find({"route": "NEWARK-EDISON", "time": {"$lt": sixhrLimit}})
        for docid in highleveldelCursor:
            highleveldelList.append(docid['_id'])
        for docid in lowleveldelCursor:
            lowleveldelList.append(docid['_id'])
        combinedDelList = []
        combinedDelList.extend(lowleveldelList)
        combinedDelList.extend(highleveldelList)
        logging.info("ne docs before sixhourslimit %d" % (len(lowleveldelList)))
        logging.info("ne regular update doc length %d" % (len(highleveldelList)))
        newttobackground.ttoresultcoll.remove({'_id': {"$in": combinedDelList}})  # Dangerous line
        for i in range(len(time)):
            doc = {
                "route": "NEWARK-EDISON",
                "time": time[i],
                "predictioninsecs": prediction[i],
                "predictioninmins": predictioninmins[i]
            }
            docid = newttobackground.ttoresultcoll.insert_one(doc)
            del doc
        docCount = newttobackground.ttoresultcoll.find({"route": "NEWARK-EDISON"}).count()
        logging.info('NE -> after adding new results docCount %d' % (docCount))
        return True
    except Exception as e:
        logging.error("The exception occurred in ne_scikit %s,%s" % (e, type(e)))
        return False
classification_count = 0
# construction of K-coefficients of each image in the training set
Kcoeff_train = []
for i in raw_matrix:
    Kcoeff_train.append(np.dot(i, np.transpose(Eigen_faces[:K])))
Kcoeff_test = []
for i in test_matrix:
    Kcoeff_test.append(np.dot(i, np.transpose(Eigen_faces[:K])))
Kcoeff_test = np.array(Kcoeff_test)
Kcoeff_train = np.array(Kcoeff_train)
for i in range(Kcoeff_test.shape[0]):
    neigh = nn(n_neighbors=1)
    neigh.fit(Kcoeff_train)
    classification_index = neigh.kneighbors(Kcoeff_test[i].reshape(1, -1))[1]
    if classification_index // 8 == i // 2:
        classification_count += 1
percentage_of_error = (Kcoeff_test.shape[0] - classification_count) / Kcoeff_test.shape[0]
print("Classification Error over test sample for K={} coefficients is {}".format(K, 100 * percentage_of_error))

# Step 7: Take some random image from the test set and compute K coefficients
random_test_image = np.array(
    Image.open(testdata_path + test_list[np.random.randint(0, len(test_list))])).flatten()
print dgts_data.shape
print dgts_data
dgts_lbl = pd.read_csv("abcd_l.csv", index_col=0)
print dgts_lbl.head()
print dgts_lbl.shape
dgts_lbl = np.array(dgts_lbl)
print dgts_lbl.shape
print dgts_lbl

# train a KNN and see how it performs. Keep 50000 for training, 10000 for validation and 10000 for the final test.
gen_k_sets = StratifiedShuffleSplit(dgts_lbl, n_iter=1, test_size=0.20)
mdl = SVC()
mdl = rfc()
dst_mdl = nn(n_neighbors=100)
for train_index, test_index in gen_k_sets:
    train_data, test_data = dgts_data[train_index], dgts_data[test_index]
    train_class, test_class = dgts_lbl[train_index], dgts_lbl[test_index]
    # test_data = test_data[:1000,]
    # test_class = test_class[:1000]
    # print g
    dst_mdl.fit(train_data)
    # print mdl.score(train_data,train_class)
    print train_data.shape
    j = 0
    for i, td in enumerate(test_data):
        td = np.array(td)
        tst_class_act = test_class[i]
q = range(0, 101, 10)
perc = np.reshape(np.percentile(dgts_data, q, axis=1), (dgts_data.shape[0], len(q)))
# dgts_data = np.hstack((dgts_data,perc))
dgts_lbl = pd.read_csv("abcd_l.csv", index_col=0)
# print dgts_lbl.head()
print dgts_lbl.shape
dgts_lbl = np.array(dgts_lbl)
print dgts_lbl.shape
# print dgts_lbl

# train a KNN and see how it performs. Keep 50000 for training, 10000 for validation and 10000 for the final test.
gen_k_sets = StratifiedShuffleSplit(dgts_lbl, n_iter=1, test_size=0.15)
mdl = knn(n_neighbors=9)
dst_mdl = nn(n_neighbors=9)
for train_index, test_index in gen_k_sets:
    train_data, test_data = dgts_data[train_index], dgts_data[test_index]
    train_class, test_class = dgts_lbl[train_index], dgts_lbl[test_index]
    train_class = np.reshape(np.ravel(train_class), (train_data.shape[0], 1))
    print train_class.shape
    # test_data = test_data[:1000,]
    # test_class = np.reshape(test_class[:1000],(1000,1))
    print test_class.shape
    print train_data.shape
    mdl.fit(train_data, train_class)
    # print mdl.score(train_data,train_class)
def getInput(input):
    # get the input data, then normalize it
    for idx, c in enumerate(input):
        input[idx] = mt.cvlct(c)
    while len(input) < maxNum:
        input.append(zero)
    return [input]


def printResult(indices, data):
    # convert the result from its normalized form back into readable text
    for idx in indices:
        for c in data[idx]:
            print(mt.rvlct(c) if (c != zero) else "", end="")
        print(" ", end="")
    print("")


with open('Test2.csv', newline='\n') as csvfile:
    data = list(csv.reader(csvfile))
print("Number of words: %d" % len(data))
print("Max length: %d" % maxNum)
normData(data)
while True:
    word = getInput(input=list(input("Enter string: ").lower()))
    nb = nn(n_neighbors=nbNum, algorithm='brute', metric=mt.compare).fit(data)
    distance, indices = nb.kneighbors(word)
    printResult(indices=indices[0], data=data)
def main(n, CI, testFile):
    trainFile = 'fitData.csv'
    '''
    trainfile = 'complete.csv'
    testFile = 'incomplete.csv'
    '''
    comp_id, comp_date, comp_data2d = getData(trainFile, CI)
    numbers = nn(n_neighbors=n, algorithm='auto').fit(comp_data2d)
    if CI == True:
        x = '_CI_only'
    else:
        x = ''
    with open(testFile, 'r') as f, open(("pred" + str(n) + x + testFile), 'w', newline='') as wr1:  # open(("padd" + x + testFile), 'w', newline='') as wrP:
        inp = csv.reader(f, skipinitialspace=True, delimiter=',', quotechar='|')
        out1 = csv.writer(wr1, delimiter=",", quotechar='|')
        # outP = csv.writer(wrP, delimiter=",", quotechar='|')
        out = [out1]
        fn = next(inp)
        if CI == True:
            d = fn.index('SavingAcnt')
            del fn[d:]
        for o in out:
            o.writerow(fn)
        test = []
        date = []
        id = []
        x = 0
        for row in inp:
            if CI == True:
                del row[d:]
            if x == 0:
                date.append(int(float(row.pop(0))))
                id.append(row.pop(0))
                x = id[len(id) - 1]
                test.append(row)
            elif x == row[1]:
                date.append(int(float(row.pop(0))))
                id.append(row.pop(0))
                test.append(row)
            else:
                output = []
                CusID = id[0]
                missing_m = missingM(date)
                output = impute(date, id, test, numbers, comp_id, comp_data2d, fn, CusID, missing_m)
                # output.append(padding(CusID, missing_m, date, test, fn))
                for i in range(len(output)):
                    for ans in output[i]:
                        out[i].writerow(ans)
                date, id, test = [], [], []
                date.append(int(float(row.pop(0))))
                id.append(row.pop(0))
                test.append(row)
                x = id[len(id) - 1]
        CusID = id[0]
        missing_m = missingM(date)
        output = []
        output = impute(date, id, test, numbers, comp_id, comp_data2d, fn, CusID, missing_m)
        # output.append(padding(CusID, missing_m, date, test, fn))
        for i in range(len(output)):
            for ans in output[i]:
                out[i].writerow(ans)
def test_dimension(iter_steps, smallX, targetNeighbors):
    recallT = []
    recallF = []
    recallTU = []
    recallFU = []
    recallTL = []
    recallFL = []
    for k in range(800, 825, 5):
        rpb = Merp([128, 128], k, rand_type='g', target='col', tensor=False)
        X_train, X_test = train_test_split(smallX, test_size=0.05, train_size=0.95, random_state=23)
        true_neigh = nn(n_neighbors=targetNeighbors)
        true_neigh.fit(X_train)
        recall = 0
        recalllist = []
        for i in range(iter_steps):
            rpb.regenerate_omega()
            X_train_rp = rpb.transform(X_train)
            X_test_rp = rpb.transform(X_test)
            # generate predictions
            rp_neigh = nn(n_neighbors=targetNeighbors)
            rp_neigh.fit(X_train_rp)
            # query and calculate recall rate
            true_distances, true_indices = true_neigh.kneighbors(X_test, targetNeighbors)
            pred_distances, pred_indices = rp_neigh.kneighbors(X_test_rp, targetNeighbors)
            curr_recall = np.asarray([
                np.intersect1d(true_indices[u], pred_indices[u]).size
                for u in range(X_test.shape[0])
            ]) / targetNeighbors
            recall = recall + curr_recall.mean()
            recalllist.append(curr_recall.mean())
        recallF.append(recall / iter_steps)
        recallFU.append(np.percentile(recalllist, 97.5))
        recallFL.append(np.percentile(recalllist, 2.5))
    for k in range(800, 825, 5):
        rpb = Merp([128, 128], k, rand_type='g', target='col', tensor=True)
        X_train, X_test = train_test_split(smallX, test_size=0.05, train_size=0.95, random_state=23)
        true_neigh = nn(n_neighbors=targetNeighbors)
        true_neigh.fit(X_train)
        recall = 0
        recalllist = []
        for i in range(iter_steps):
            rpb.regenerate_omega()
            X_train_rp = rpb.transform(X_train)
            X_test_rp = rpb.transform(X_test)
            # generate predictions
            rp_neigh = nn(n_neighbors=targetNeighbors)
            rp_neigh.fit(X_train_rp)
            # query and calculate recall rate
            true_distances, true_indices = true_neigh.kneighbors(X_test, targetNeighbors)
            pred_distances, pred_indices = rp_neigh.kneighbors(X_test_rp, targetNeighbors)
            # print('hello')
            # print(len(true_neighbors))
            # print(len(pred_neighbors))
            # print(len(X_test.shape))
            curr_recall = np.asarray([
                np.intersect1d(true_indices[u], pred_indices[u]).size
                for u in range(X_test.shape[0])
            ]) / targetNeighbors
            recall = recall + curr_recall.mean()
            recalllist.append(curr_recall.mean())
        recallT.append(recall / iter_steps)
        recallTU.append(np.percentile(recalllist, 97.5))
        recallTL.append(np.percentile(recalllist, 2.5))
    return [recallF, recallFU, recallFL, recallT, recallTU, recallTL]
from sklearn.neighbors import NearestNeighbors as nn
import numpy as np

# Data set
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
# Query set
Y = np.array([[0, 1], [2, 3]])

nbrs = nn(n_neighbors=2, algorithm='ball_tree').fit(X)
distances, indices = nbrs.kneighbors(Y)
print(indices)
print(distances)
def bd_scikitalgo(bd_df):
    try:
        uri = 'mongodb://*****:*****@ds035315-a0.mongolab.com:35315,ds035315-a1.mongolab.com:35315/newttobackground?replicaSet=rs-ds035315'
        client = MongoClient(uri)
        newttobackground = client.newttobackground
        logging.info('connected')
        bd_cursor = newttobackground.ttoopvalcoll.find({"route": "BROOKLYN-DENVILLE"})
        bdtestData = []
        bdtrainData = []
        bdrealTime = []
        time = []
        for doc in bd_cursor:
            bdtrainData.append([float(doc['Zone']), float(doc['Temparature'][0]), float(doc['Temparature'][1]),
                                float(doc['CodedWeather'][0]), float(doc['CodedWeather'][1]), float(doc['CodedDay'])])
            bdrealTime.append(doc['realTime'])
        for z, t1, t2, w1, w2, c, d in zip(bd_df['Zone'], bd_df['Temparature1'], bd_df['Temparature2'],
                                           bd_df['CodedWeather1'], bd_df['CodedWeather2'], bd_df['CodedDay'], bd_df['Date']):
            bdtestData.append([float(z), float(t1), float(t2), float(w1), float(w2), c])
            time.append(d)
        logging.info("bdtrainData length %d" % (len(bdtrainData)))
        logging.info("bdtestData length %d" % (len(bdtestData)))
        neigh = nn(n_neighbors=5, algorithm='auto', metric='euclidean')
        neigh.fit(bdtrainData)
        distances = []
        indexs = []
        data = []
        for i in bdtestData:
            data.append(neigh.kneighbors([i]))
        for i in range(len(data)):
            distances.append(data[i][0])
            indexs.append(data[i][1])
        predicted_ind = []
        predicted_val = []  # we are considering the realTime in this case
        for i in range(len(indexs)):
            predicted_ind.append(indexs[i][0])
        new_predicted_ind = list(chain.from_iterable(predicted_ind))
        # extracting realTime of predictions
        for k in new_predicted_ind:
            predicted_val.append(bdrealTime[k])  # bdrealTime is the list where training set realTime values are stored
        # separating them as lists of five for the individual 5 neighbors
        listoffive = []
        for i in range(0, len(predicted_val), 5):
            listoffive.append(predicted_val[i:i + 5])
        prediction = []
        for i in range(len(listoffive)):
            prediction.append(listoffive[i][0])
        predictioninmins = []
        for i in prediction:
            predictioninmins.append(float(i) / 60.0)
        docCount = newttobackground.ttoresultcoll.find({"route": "BROOKLYN-DENVILLE"}).count()
        logging.info('BD -> before adding new results docCount %d' % (docCount))
        lowleveldelList = []   # for the below 6hrs range
        highleveldelList = []  # for the regular update delete purpose
        brooklyndenville_time = datetime.datetime.now(pytz.timezone('US/Eastern'))
        brooklyndenville_dayname = brooklyndenville_time.strftime("%A")
        brooklyndenville_hour = int(brooklyndenville_time.strftime("%H"))
        brooklyndenville_minute = int(brooklyndenville_time.strftime("%M"))
        brooklyndenville_second = int(brooklyndenville_time.strftime("%S"))
        brooklyndenville_year = int(brooklyndenville_time.strftime("%Y"))
        brooklyndenville_month = int(brooklyndenville_time.strftime("%m"))
        brooklyndenville_day = int(brooklyndenville_time.strftime("%d"))
        presentTime = datetime.datetime(brooklyndenville_year, brooklyndenville_month, brooklyndenville_day,
                                        brooklyndenville_hour, brooklyndenville_minute, brooklyndenville_second)
        sixhrLimit = presentTime - datetime.timedelta(hours=6)
        logging.info("bd six hours back time %s" % (str(sixhrLimit)))
        highleveldelCursor = newttobackground.ttoresultcoll.find({"route": "BROOKLYN-DENVILLE", "time": {"$gt": presentTime}})
        lowleveldelCursor = newttobackground.ttoresultcoll.find({"route": "BROOKLYN-DENVILLE", "time": {"$lt": sixhrLimit}})
        for docid in highleveldelCursor:
            highleveldelList.append(docid['_id'])
        for docid in lowleveldelCursor:
            lowleveldelList.append(docid['_id'])
        combinedDelList = []
        combinedDelList.extend(lowleveldelList)
        combinedDelList.extend(highleveldelList)
        logging.info("bd docs before sixhourslimit %d" % (len(lowleveldelList)))
        logging.info("bd regular update doc length %d" % (len(highleveldelList)))
        newttobackground.ttoresultcoll.remove({'_id': {"$in": combinedDelList}})  # Dangerous line
        for i in range(len(time)):
            doc = {
                "route": "BROOKLYN-DENVILLE",
                "time": time[i],
                "predictioninsecs": prediction[i],
                "predictioninmins": predictioninmins[i]
            }
            docid = newttobackground.ttoresultcoll.insert_one(doc)
            del doc
        docCount = newttobackground.ttoresultcoll.find({"route": "BROOKLYN-DENVILLE"}).count()
        logging.info('BD -> after adding new results docCount %d' % (docCount))
        return True
    except Exception as e:
        logging.error("The exception occurred in bd_scikit %s,%s" % (e, type(e)))
        return False
# for readability of the names, strip any such HTML tags down to an empty string
df['nazwa'] = [re.sub("<.*?>", "", text) for text in df['nazwa']]
print(df)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.set_xlabel('Ocena przez kadre akademicka')
ax.set_ylabel('Losy absolwentow')
ax.set_zlabel('Oceny parametryczna')
ax.scatter(df['opka'], df['ela'], df['op'])
for index, row in df.iterrows():
    ax.text(row['opka'], row['ela'], row['op'], row['nazwa'])
# plt.show()

X = df.iloc[:, 1:4]  # iloc - integer location
y = df.iloc[:, 0]
nbrs = nn(n_neighbors=2)
model = nbrs.fit(X)
# dist, idx = model.kneighbors(X)
recommendations = model.kneighbors([[100, 100, 100]], 5, return_distance=False)
for idx in recommendations:
    print(y[idx])
def sf_scikitalgo(sf_df):
    try:
        uri = 'mongodb://*****:*****@ds035315-a0.mongolab.com:35315,ds035315-a1.mongolab.com:35315/newttobackground?replicaSet=rs-ds035315'
        client = MongoClient(uri)
        newttobackground = client.newttobackground
        logging.info('connected')
        sf_cursor = newttobackground.ttoopvalcoll.find({"route": "MOUNTZION RADIOLOGY CENTER-SF GENERAL HOSPITAL"})
        sftestData = []
        sftrainData = []
        sfrealTime = []
        time = []
        for doc in sf_cursor:
            sftrainData.append([float(doc['Zone']), float(doc['Temparature'][0]),
                                float(doc['CodedWeather'][0]), float(doc['CodedDay'])])
            sfrealTime.append(doc['realTime'])
        for z, t1, w1, c, d in zip(sf_df['Zone'], sf_df['Temparature1'], sf_df['CodedWeather1'],
                                   sf_df['CodedDay'], sf_df['Date']):
            sftestData.append([float(z), float(t1), float(w1), c])
            time.append(d)
        logging.info("sftrainData length %d" % (len(sftrainData)))
        logging.info("sftestData length %d" % (len(sftestData)))
        neigh = nn(n_neighbors=5, algorithm='auto', metric='euclidean')
        neigh.fit(sftrainData)
        distances = []
        indexs = []
        data = []
        for i in sftestData:
            data.append(neigh.kneighbors([i]))
        for i in range(len(data)):
            distances.append(data[i][0])
            indexs.append(data[i][1])
        predicted_ind = []
        predicted_val = []  # we are considering the realTime in this case
        for i in range(len(indexs)):
            predicted_ind.append(indexs[i][0])
        new_predicted_ind = list(chain.from_iterable(predicted_ind))
        # extracting realTime of predictions
        for k in new_predicted_ind:
            predicted_val.append(sfrealTime[k])  # sfrealTime is the list where training set realTime values are stored
        # separating them as lists of five for the individual 5 neighbors
        listoffive = []
        for i in range(0, len(predicted_val), 5):
            listoffive.append(predicted_val[i:i + 5])
        prediction = []
        for i in range(len(listoffive)):
            prediction.append(listoffive[i][0])
        predictioninmins = []
        for i in prediction:
            predictioninmins.append(float(i) / 60.0)
        docCount = newttobackground.ttoresultcoll.find({"route": "MOUNTZION RADIOLOGY CENTER-SF GENERAL HOSPITAL"}).count()
        logging.info('SF -> before adding new results docCount %d' % (docCount))
        lowleveldelList = []   # for the below 6hrs range
        highleveldelList = []  # for the regular update delete purpose
        sanfrancisco_time = datetime.datetime.now(pytz.timezone('US/Pacific'))
        sanfrancisco_dayname = sanfrancisco_time.strftime("%A")
        sanfrancisco_hour = int(sanfrancisco_time.strftime("%H"))
        sanfrancisco_minute = int(sanfrancisco_time.strftime("%M"))
        sanfrancisco_second = int(sanfrancisco_time.strftime("%S"))
        sanfrancisco_year = int(sanfrancisco_time.strftime("%Y"))
        sanfrancisco_month = int(sanfrancisco_time.strftime("%m"))
        sanfrancisco_day = int(sanfrancisco_time.strftime("%d"))
        presentTime = datetime.datetime(sanfrancisco_year, sanfrancisco_month, sanfrancisco_day,
                                        sanfrancisco_hour, sanfrancisco_minute, sanfrancisco_second)
        sixhrLimit = presentTime - datetime.timedelta(hours=6)
        logging.info("sf six hours back time %s" % (str(sixhrLimit)))
        highleveldelCursor = newttobackground.ttoresultcoll.find({"route": "MOUNTZION RADIOLOGY CENTER-SF GENERAL HOSPITAL", "time": {"$gt": presentTime}})
        lowleveldelCursor = newttobackground.ttoresultcoll.find({"route": "MOUNTZION RADIOLOGY CENTER-SF GENERAL HOSPITAL", "time": {"$lt": sixhrLimit}})
        for docid in highleveldelCursor:
            highleveldelList.append(docid['_id'])
        for docid in lowleveldelCursor:
            lowleveldelList.append(docid['_id'])
        logging.info("sf docs before sixhourslimit %d" % (len(lowleveldelList)))
        logging.info("sf regular update doc length %d" % (len(highleveldelList)))
        combinedDelList = []
        combinedDelList.extend(lowleveldelList)
        combinedDelList.extend(highleveldelList)
        newttobackground.ttoresultcoll.remove({'_id': {"$in": combinedDelList}})  # Dangerous line
        for i in range(len(time)):
            doc = {
                "route": "MOUNTZION RADIOLOGY CENTER-SF GENERAL HOSPITAL",
                "time": time[i],
                "predictioninsecs": prediction[i],
                "predictioninmins": predictioninmins[i]
            }
            docid = newttobackground.ttoresultcoll.insert_one(doc)
            del doc
        docCount = newttobackground.ttoresultcoll.find({"route": "MOUNTZION RADIOLOGY CENTER-SF GENERAL HOSPITAL"}).count()
        logging.info('SF -> after adding new results docCount %d' % (docCount))
        return True
    except Exception as e:
        logging.error("The exception for sf_scikit %s,%s" % (e, type(e)))
        return False
from sklearn.neighbors import NearestNeighbors as nn
import numpy as np
from sklearn.externals.joblib import dump

data = np.load('findata.npy')
nbrs = nn(n_neighbors=2).fit(data)
dump(nbrs, 'nbrs.joblib')
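# A short companion sketch showing how the persisted index above could be loaded
# back and queried; the file names match the snippet above, and the query row is
# illustrative only (newer versions import load directly from joblib rather than
# sklearn.externals.joblib).
from sklearn.externals.joblib import load
import numpy as np

nbrs = load('nbrs.joblib')                      # reload the fitted NearestNeighbors index
data = np.load('findata.npy')
distances, indices = nbrs.kneighbors(data[:1])  # query with the first row
print(indices)    # indices of the 2 nearest rows in the fitted data
print(distances)  # corresponding distances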