def build_vectors(articles, weights):
    """
    Build weighted vector representations for a list of articles.
    """
    pub_vecs, bow_vecs, con_vecs = [], [], []
    for a in articles:
        pub_vecs.append(np.array([a.published]))
        bow_vecs.append(vectorize(a.text))
        con_vecs.append(concept_vectorize([c.slug for c in a.concepts]))

    pub_vecs = normalize(csr_matrix(pub_vecs), copy=False)
    bow_vecs = normalize(csr_matrix(bow_vecs), copy=False)
    con_vecs = normalize(csr_matrix(con_vecs), copy=False)

    # Merge vectors.
    vecs = hstack([pub_vecs, bow_vecs, con_vecs])

    # Convert to a scipy.sparse.lil_matrix because it is subscriptable.
    vecs = vecs.tolil()

    # Apply weights to the proper columns:
    # col 0 = pub, cols 1-100 = bow, 101+ = concepts
    # weights = [pub, bow, concept]
    vecs[:, 0] *= weights[0]
    vecs[:, 1:101] *= weights[1]
    vecs[:, 101:] *= weights[2]

    return vecs.toarray()
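# A minimal, hedged sketch of the column-weighting idea used by build_vectors
# above, with made-up block widths (1 publication column, 5 bag-of-words
# columns, 3 concept columns) rather than the real feature sizes.
import numpy as np
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import normalize

rng = np.random.RandomState(0)
pub = normalize(csr_matrix(rng.rand(4, 1)))
bow = normalize(csr_matrix(rng.rand(4, 5)))
con = normalize(csr_matrix(rng.rand(4, 3)))
weights = [0.2, 0.5, 0.3]
vecs = hstack([pub, bow, con]).tolil()   # LIL supports column-slice assignment
vecs[:, 0] *= weights[0]
vecs[:, 1:6] *= weights[1]
vecs[:, 6:] *= weights[2]
print(vecs.toarray().shape)              # (4, 9)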
def create_word_count(node_info):
    ''' Create word_mat matrix (num_nodes x num_unique_words) that contains the number of occurrences of each word in each abstract '''
    all_abstract = np.array(' '.join(node_info.abstract.values).split())
    unique_words = np.unique(all_abstract)
    ind_to_words_dict = dict(zip(range(len(unique_words)), unique_words))
    words_to_ind_dict = dict(zip(unique_words, range(len(unique_words))))
    word_mat = lil_matrix((len(node_info), len(unique_words)), dtype=np.int32)
    assert all(node_info.index == range(len(node_info)))

    # Fill matrix iteratively by looping on abstracts
    for (ind, abstract) in node_info.abstract.items():
        if ind % 200 == 0:
            print('[Creating Word Matrix] ind={ind}'.format(ind=ind))
        for word in abstract.split():
            word_mat[ind, words_to_ind_dict[word]] += 1

    # Normalise word_mat
    # 0: word occurrence in corpus; 1: number of words in abstract
    word_count = word_mat.sum(0)  # (by total number of occurrences of each word)
    word_mat_norm = normalize(normalize(word_mat, norm='l1', axis=0))  # (both)
    #word_mat_norm = normalize(normalize(word_mat, norm='l1', axis=0), norm='l1', axis=1)  # 0: word occurrence; 1: number of words
    return word_mat, word_mat_norm
def load():
    iris = load_iris()
    n_samples, n_features = iris.data.shape
    indices = np.arange(n_samples)
    np.random.shuffle(indices)
    X = iris.data[indices]
    Y = iris.target[indices]

    split = (n_samples * 4) // 5  # integer split index for an 80/20 split
    X_train, X_test = X[:split], X[split:]
    y_train, y_test = Y[:split], Y[split:]

    X_train = preprocessing.normalize(X_train)
    X_test = preprocessing.normalize(X_test)

    # One-hot encode the class labels.
    Y_train = []
    for label in y_train:
        if label == 0:
            Y_train.append([1, 0, 0])
        elif label == 1:
            Y_train.append([0, 1, 0])
        elif label == 2:
            Y_train.append([0, 0, 1])
    Y_test = []
    for label in y_test:
        if label == 0:
            Y_test.append([1, 0, 0])
        elif label == 1:
            Y_test.append([0, 1, 0])
        elif label == 2:
            Y_test.append([0, 0, 1])
    return X_train, Y_train, X_test, Y_test
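# A hedged alternative to the one-hot loops in load() above: indexing an
# identity matrix with the integer labels yields the same [1,0,0]/[0,1,0]/[0,0,1]
# rows in a single expression. Purely illustrative.
import numpy as np

labels = np.array([0, 2, 1, 0])
print(np.eye(3, dtype=int)[labels])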
def __init__(self, hps, example_list, dqn_batch_size, use_state_prime = False, max_art_oovs = 0): """ Args: hps: seq2seq model parameters example_list: list of experiences dqn_batch_size: DDQN batch size use_state_prime: whether to use the next decoder state to make the batch or the current one max_art_oovs: number of OOV tokens in current batch Properties: _x: The input to DDQN model for training, this is basically the decoder output (dqn_batch_size, dqn_input_feature_len) _y: The Q-estimation (dqn_batch_size, vocab_size) _y_extended: The Q-estimation (dqn_batch_size, vocab_size + max_art_oovs) """ self._x = np.zeros((dqn_batch_size, hps.dqn_input_feature_len)) self._y = np.zeros((dqn_batch_size, hps.vocab_size)) self._y_extended = np.zeros((dqn_batch_size, hps.vocab_size + max_art_oovs)) for i,e in enumerate(example_list): if use_state_prime: self._x[i,:]=e.state_prime else: self._x[i,:]=e.state self._y[i,:]=normalize([e.q_value[0:hps.vocab_size]], axis=1, norm='l1') if max_art_oovs == 0: self._y_extended[i,:] = normalize([e.q_value[0:hps.vocab_size]], axis=1, norm='l1') else: self._y_extended[i,:] = e.q_value
def func(A, B):  # comparematrices(A, B)
    colA = A.shape[1]
    colB = B.shape[1]
    # method 1 - n is small dim, m is larger, matnew is new comparison matrix
    if colA == colB and colA != 1:
        Aprime = normalize(A, axis=1, norm='l2')
        Bprime = normalize(B, axis=1, norm='l2')
        if colA == 1:
            dist = np.linalg.norm(Aprime - Bprime)  # L2 norm (vectors)
        else:
            dist = np.linalg.norm(Aprime - Bprime, 2)  # spectral (2-)norm (matrices)
    else:
        if colA < colB:
            n = colA
            m = colB
            big = B
            small = A
        else:
            n = colB
            m = colA
            big = A
            small = B
        matnew = np.identity(m)
        matnew[0:n, 0:n] = small
        bigprime = normalize(big, axis=1, norm='l2')
        matnewprime = normalize(matnew, axis=1, norm='l2')
        dist = np.linalg.norm(matnewprime - bigprime, 2)
    print(dist)
def classify(dummy_train, dummy_test, feature_pkl, output_file):
    # Train classifier, iterating over subsets
    # Load Features
    print('Loading features...')
    featureIndex, trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(feature_pkl)
    trainTargets = np.array(trainTargets)
    testItemIds = np.array(testItemIds)
    predicted_ids = []
    predicted_scores = []
    # SGD Logistic Regression per sample
    clf = SGDClassifier(alpha=3.16227766017e-08, class_weight='auto', epsilon=0.1, eta0=0.0,
                        fit_intercept=True, l1_ratio=0.15, learning_rate='optimal', loss='log',
                        n_iter=5, n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None,
                        shuffle=False, verbose=0, warm_start=False)
    for col in range(np.shape(dummy_train)[1]):
        # Get nonzero dummy indices as array
        idx_train = dummy_train[:, col].astype('bool').T.toarray()[0]
        print('Training subset {} of {}...'.format(col, np.shape(dummy_train)[1]))
        sub_train = normalize(trainFeatures.tocsr()[idx_train, :], norm='l2', axis=0)
        clf.fit(sub_train, trainTargets[idx_train])
        # Use probabilities instead of binary class prediction in order to generate a ranking
        idx_test = dummy_test[:, col].astype('bool').T.toarray()[0]
        sub_test = normalize(testFeatures.tocsr()[idx_test, :], norm='l2', axis=0)
        predicted_scores += clf.predict_proba(sub_test).T[1].tolist()
        predicted_ids += testItemIds[idx_test].tolist()
    with open(os.path.splitext(feature_pkl)[0] + '_' + output_file, 'w') as out_fid:
        out_fid.write("id\n")
        for pred_score, item_id in sorted(zip(predicted_scores, predicted_ids), reverse=True):
            # only writes item_id per output spec, but may want to look at predicted_scores
            out_fid.write("%d\n" % (item_id))
def split_and_build_class(X, y): X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) print X_train.shape print X_test.shape # Normalize the input data. imp = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0) fixed_X_train = X_train[:, 1:] imp.fit(fixed_X_train) fixed_X_train = imp.transform(fixed_X_train) preprocessing.normalize(fixed_X_train, copy=False) X_train[:, 1:] = fixed_X_train fixed_X_test = X_test[:, 1:] imp.fit(fixed_X_test) fixed_X_test = imp.transform(fixed_X_test) preprocessing.normalize(fixed_X_test, copy=False) X_test[:, 1:] = fixed_X_test train_data = read_dataset.microData() train_data.get_data(X_train) y_train = train_data.set_output(y_train) test_data = read_dataset.microData() test_data.get_data(X_test) y_test = test_data.set_output(y_test) return [X_train, X_test, y_train, y_test, train_data, test_data]
def __init__(self, epsilon_init, userNum, itemNum,k, feature_dim, tau=0.1, lambda_=0.1, init='zero', learning_rate='decay'): self.reward = 0 self.userNum = userNum self.itemNum = itemNum # self.R = np.zeros((userNum, itemNum)) self.S = np.zeros((userNum, itemNum)) self.time = 1 self.tau = tau #SGD Learning rate self.tau_init = 1 # decay self.learning_rate = learning_rate self.epsilon_init = epsilon_init if (init == 'random'): self.U = np.random.rand(userNum,k) self.V = np.random.rand(itemNum,k) else: self.U = np.zeros((userNum,k)) self.V = np.zeros((itemNum,k)) self.lambda_ = lambda_ # self.feature_dim = feature_dim #add normalization self.U = normalize(self.U, axis=1, norm='l1') self.V = normalize(self.V, axis=1, norm='l1') self.k = k # print k self.CanEstimateUserPreference = False self.CanEstimateCoUserPreference = True self.CanEstimateW = False
def read_dataset(train_size, scale=False, normalize=False):
    logging.info('fetching the dataset')
    # d = sklearn.datasets.load_diabetes()  # diabetes dataset
    #d = sklearn.datasets.load_boston()  # Boston housing prices
    # data = d['data'].astype(np.float32)
    target = d['target'].astype(np.float32).reshape(len(d['target']), 1)
    # Chainer's mnist.py reportedly writes this as below, but it does not work when the minibatch size is 2 or more
    #target = diabetes['target'].astype(np.float32)
    # Ideally, standardization/normalization should be fit on the training data and those parameters applied to the test data
    if normalize and scale:
        raise Exception('both normalize and scale can not be True')
    if normalize:
        data = preprocessing.normalize(data)
        target = preprocessing.normalize(target)
    if scale:
        data = preprocessing.scale(data)
        target = preprocessing.scale(target)
    # Split
    x_train, x_test = np.split(data, [train_size])
    y_train, y_test = np.split(target, [train_size])
    assert len(x_train) == len(y_train)
    assert len(x_test) == len(y_test)
    return ((x_train, y_train), (x_test, y_test),
            {"SHAPE_TRAIN_X": x_train.shape,
             "SHAPE_TRAIN_Y": y_train.shape,
             "SHAPE_TEST_X": x_test.shape,
             "SHAPE_TEST_Y": y_test.shape,
             })
def find_most_similar(self, bids, K_sim=14): """Return the bid of the most similar book to parameter bid except the given bid.""" termv = sparse.csc_matrix((self.M, 1), dtype=int) for bid in bids: col_num = self.bid_to_col.get(str(bid)) if col_num is not None: termv = termv + self.term_bid_matrix.getcol(col_num) if termv.nnz == 0: return () termva = termv.toarray() # Generate a vector for terms stop_words_removed = np.logical_and(termva, self.stop_words) nonzero = stop_words_removed.nonzero()[0] # Nonzero indices rest_term_rows = self.term_bid_matrix_csr[nonzero] docs = np.zeros(self.N, dtype=bool) for row in rest_term_rows: np.logical_or(docs, row.toarray()[0], docs) cols = docs.nonzero()[0] matched_matrix = self.term_bid_matrix[:,cols] termv.data = self.tf(termv.data) * np.array([self.idf(self.row_to_term[row]) for row in termv.indices]) termv = normalize(termv.T, axis=1, copy=False) matched_matrix.data = self.tf(matched_matrix.data) matched_matrix = normalize(matched_matrix.T, axis=1, copy=False).T cos_sims = termv.dot(matched_matrix).toarray()[0] found_bids = (self.col_to_bid[col] for col in cols) return islice((int(r[1]) for r in heapq.nlargest(K_sim, zip(cos_sims, found_bids)) if int(r[1]) not in bids), 9)
def normalize(self):
    """ L2-normalize the train and test feature matrices. """
    print('Normalization')
    self.tr = normalize(self.tr)
    self.te = normalize(self.te)
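# A hedged side note on the normalize() method above: sklearn's row-wise
# normalize is stateless, so applying it to train and test sets independently
# does not leak information across the split (unlike scalers that must be fit
# on the training data only). The shapes below are illustrative.
import numpy as np
from sklearn.preprocessing import normalize, Normalizer

rng = np.random.RandomState(0)
tr, te = rng.rand(10, 4), rng.rand(5, 4)
print(np.allclose(normalize(te), Normalizer().fit(tr).transform(te)))  # True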
def make_clouds(files, n_words=20): # set locations base_model_name = os.path.splitext(os.path.basename(files.model))[0] output_d = '../browser/clouds/' + base_model_name + '/' if not os.path.exists(output_d): os.makedirs(output_d) # create wordcloud generator wc = WordCloud(width=1000, height=500, background_color='white') print('Loading model') model = LdaModel.load(files.model) beta = model.expElogbeta print('Normalizing by topics, and by words') pTW = normalize(beta, axis=0) pWT = normalize(beta, axis=1) # load bug<->id map, then invert to id<-> bug bug_to_id = json.loads(open(files.replacements).read()) id_to_bug = {v: k for k, v in bug_to_id.items() if "." not in k} for i in range(len(beta)): # compute RAR t_rar = np.sqrt(pTW[i] * pWT[i]) top_word_ids = t_rar.argsort()[:-1 - n_words:-1] top_words = [model.id2word.id2token[wordid] for wordid in top_word_ids] top_words = [id_to_bug[word] if word in id_to_bug else word for word in top_words] wc.fit_words(zip(top_words, t_rar[top_word_ids])) wc.to_file(output_d + str(i) + '.png')
def normaliser(x, option):
    # normalize by the L2 norm of the row
    if option == 'norm':
        # print('normalize by the norm of the row')
        from sklearn.preprocessing import normalize
        x_norma = normalize(x, norm='l2')
    # normalize by the sum of the row (each row of the normalized matrix sums to 1)
    elif option == 'sum':
        # normalize sum to 1:
        #print('normalize by the sum of the row')
        from sklearn.preprocessing import normalize
        x_norma = normalize(x, norm='l1')
    # normalize each row by z-score: (x - mean) / std
    elif option == 'zscore':
        from scipy import stats
        x_norma = stats.zscore(x, axis=1)
        # set the nan to 0
        x_norma[np.isnan(x_norma)] = 0
    elif option == 'none':
        # print('no normalization')
        x_norma = x
    return x_norma
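# A hedged usage sketch for normaliser() above; the random matrix is
# illustrative only.
import numpy as np

rng = np.random.RandomState(0)
x = rng.rand(4, 6)
print(np.allclose(np.linalg.norm(normaliser(x, 'norm'), axis=1), 1.0))  # unit L2 rows
print(np.allclose(normaliser(x, 'sum').sum(axis=1), 1.0))               # rows sum to 1
print(np.allclose(normaliser(x, 'zscore').mean(axis=1), 0.0))           # zero-mean rows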
def relational_retrofit(word_vecs, sparse_relations, iterations=5, verbose=True, orig_weight=1): orig_vecs = normalize(word_vecs, norm='l2', copy=False) orig_vecs *= orig_weight arbitrary_value = next(iter(sparse_relations.values())) M, k = orig_vecs.shape N = arbitrary_value.shape[0] vecs = np.zeros(shape=(N, k), dtype='f') vecs[:M] = orig_vecs sparse_list = sorted(sparse_relations.items(), key=itemgetter(0)) for iteration in range(iterations): rel_array = dense_relation_array(word_vecs, sparse_relations) next_vecs = np.zeros(shape=vecs.shape, dtype='f') for i in range(len(sparse_list)): name = sparse_list[i][0] if verbose: print('Iteration %d of %d: %s' % (iteration + 1, iterations, name)) sparse = sparse_list[i][1] dense = rel_array[i] next_vecs += sparse.dot(vecs.dot(dense.T)) normalize(next_vecs, norm='l2', copy=False) next_vecs[:M] += orig_vecs next_vecs[:M] /= 1+orig_weight vecs = next_vecs del next_vecs return vecs
def remove_outliers(image,mask): #taking the mask part to image to check the presence of bee im = cv2.bitwise_and(image,image,mask=mask); ldp_image,_,_ = ldp.ldp(im); test_Y = ldp_image.reshape((ldp_image.shape[0] * ldp_image.shape[1], ldp_image.shape[2])); test_rgb = im.reshape((im.shape[0] * im.shape[1], im.shape[2])); test = np.concatenate((test_Y,test_rgb),axis=1); mask_not = cv2.bitwise_not(mask); ret1, mask_not = cv2.threshold (mask_not,np.mean(mask_not), 255, cv2.THRESH_BINARY); im = cv2.bitwise_and(image,image,mask=mask_not); ldp_image,_,_ = ldp.ldp(im); data_ldp = ldp_image.reshape((ldp_image.shape[0] * ldp_image.shape[1], ldp_image.shape[2])); data_rgb = im.reshape((im.shape[0] * im.shape[1], im.shape[2])); data = np.concatenate((data_rgb,data_ldp),axis=1); data = data[np.any(data!=0,axis=1)]; print data.shape; data = data.astype('float64'); data = preprocessing.normalize(data,axis=0); ss = StandardScaler(); data = ss.fit_transform(data); clf = svm.OneClassSVM(nu=0.8, kernel="rbf", gamma=0.1) clf.fit(data); test = test.astype('float64'); test = preprocessing.normalize(test,axis=0); print test.shape; test = ss.fit_transform(test); test = clf.predict(test); test = test.reshape((image.shape[0] , image.shape[1])); test[test==-1] = 0; test[test==1] = 255; test = test.astype('uint8'); im = cv2.bitwise_and(image,image,mask=test); im = cv2.bitwise_and(im,im,mask=mask); #print test[:,0],test[:,1]; return(im,test);
def main(): parser = argparse.ArgumentParser() parser.add_argument("bof_histogram") args = parser.parse_args(sys.argv[1:]) print("loading bof histogram") with gzip.open(args.bof_histogram, "rb") as f: obj_hists = pickle.load(f) target_names = jsk_apc2015_common.get_object_list() # create train and test data X, y = [], [] for i, obj_name in enumerate(target_names): X.append(obj_hists[obj_name]) y += [i] * len(obj_hists[obj_name]) X = np.vstack(X) normalize(X, copy=False) y = np.array(y) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=np.random.randint(1234)) # train and test lgr = LogisticRegression() print("fitting LogisticRegression") lgr.fit(X_train, y_train) with gzip.open("lgr.pkl.gz", "wb") as f: pickle.dump(lgr, f) y_pred = lgr.predict(X_test) print("score lgr: {}".format(accuracy_score(y_test, y_pred))) print(classification_report(y_test, y_pred, target_names=target_names))
def rw_overlap_kernel(C1, C2): #l = 1.0/np.exp(1.0) l = 0.5 k = 0 c = 0 # reshape rows into kernel matrices M1 = np.reshape(C1, (90, 90)) M2 = np.reshape(C2, (90, 90)) # normalise so rows sum to 1 M1_norm = normalize(M1, axis=1, norm='l1') M2_norm = normalize(M2, axis=1, norm='l1') for i in range(1, 101) : M1_exp = np.linalg.matrix_power(M1_norm, i) M2_exp = np.linalg.matrix_power(M2_norm, i) #overlap = np.sum(np.minimum(M1_exp, M2_exp)) overlap = np.sum(np.sqrt(np.multiply(M1_exp, M2_exp))) #k = k + ((np.exp(-i) ) * overlap) #c = c + ((np.exp(-i)) * 90) k = k + ((l ** i) * overlap) c = c + ((l ** i) * 90) return k/c
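# A hedged usage sketch for rw_overlap_kernel above: build a small Gram matrix
# from flattened 90x90 connectivity matrices. The random non-negative data is
# purely illustrative.
import numpy as np

rng = np.random.RandomState(0)
conn = np.abs(rng.randn(2, 90 * 90))  # two subjects, flattened connectivity
gram = np.zeros((2, 2))
for i in range(2):
    for j in range(2):
        gram[i, j] = rw_overlap_kernel(conn[i], conn[j])
print(gram)  # diagonal entries should be 1 (full overlap of a matrix with itself)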
def pagerank_undirected(graph,d): """ Args: graph - Unnormalized(or normalized) transition matrix in csr format d - page rank probability of taking the edge vs jumping to a new node Returns: page rank of each node in the graph """ normalize(graph,norm='l1',axis=1,copy=False) #normalize as transition probability per row(node) #create matrix for which to find principal eigenvector M = d*graph #weighted transition probability try: jmp_temp = (1-d)/graph.shape[0] #teleportation weighting J = np.ones(graph.shape) J = jmp_temp*J #teleportation probability matrix M = M+J except ValueError: pass #intialize normalized rankings R = np.matrix(np.random.random(graph.shape[0])).transpose() normalize(R,norm='l1',axis=0,copy=False) Rp = R*np.Inf #iterate until convergence while (np.square(R-Rp).sum() > 0.001): Rp = R R = M.dot(R) #convert R to numpy array instead of matrix before returning. return np.array(R).reshape(len(R),)
def compute_vector_features(name, series1, series2, tfidf_transform, svd, top_n_diff, results): X1 = tfidf_transform(series1) X2 = tfidf_transform(series2) dot = X1.multiply(X2).sum(axis=1) results['text_%s_dot' % name] = np.asarray(dot).reshape(-1) X1_norm = normalize(X1) X2_norm = normalize(X2) cosine = X1_norm.multiply(X2_norm).sum(axis=1) results['text_%s_cosine' % name] = np.asarray(cosine).reshape(-1) X1_svd = svd.transform(X1) X2_svd = svd.transform(X2) results['text_%s_dot_svd' % name] = (X1_svd * X2_svd).sum(axis=1) X_diff = X1_svd - X2_svd results['text_%s_euclidean_svd' % name] = (X_diff ** 2).sum(axis=1) results['text_%s_manhattan_svd' % name] = np.abs(X_diff).sum(axis=1) X1_svd = normalize(X1_svd) X2_svd = normalize(X2_svd) results['text_%s_cosine_svd' % name] = (X1_svd * X2_svd).sum(axis=1) for i in range(top_n_diff): results['text_%s_svd_diff_%d' % (name, i)] = np.abs(X_diff[:, i])
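# A hedged sanity check for the cosine feature above: L2-normalizing rows and
# taking the row-wise dot product should match the diagonal of sklearn's
# cosine_similarity. The random sparse matrices are illustrative.
import numpy as np
from scipy import sparse
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.RandomState(0)
A = sparse.csr_matrix(rng.rand(5, 20))
B = sparse.csr_matrix(rng.rand(5, 20))
manual = np.asarray(normalize(A).multiply(normalize(B)).sum(axis=1)).reshape(-1)
reference = np.diag(cosine_similarity(A, B))
print(np.allclose(manual, reference))  # True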
def _construct_edge_pairs(data, norm='l2', power_norm=True, dtype=np.float32): """ A tree is a list of edges, with each edge as the concatenation of the repr. of parent and child nodes. :param data: :return root, edges: """ r = data['tree'][1].astype(dtype=dtype) if power_norm: r = np.sign(r) * np.sqrt(np.abs(r)) r = preprocessing.normalize(r[np.newaxis,:], norm=norm) root = [np.squeeze(r), ] edges = [] for id in data['tree'].keys(): if id > 1: x_left = data['tree'][id].astype('float32') x_right = data['tree'][int(id/2.)].astype('float32') e = np.concatenate([x_left,x_right]) if power_norm: e = np.sign(e) * np.sqrt(np.abs(e)) e = preprocessing.normalize(e[np.newaxis,:], norm=norm) # if power_norm: # x_left = np.sign(x_left) * np.sqrt(np.abs(x_left)) # x_right = np.sign(x_right) * np.sqrt(np.abs(x_right)) # # e = (preprocessing.normalize(x_left[np.newaxis,:], norm=norm), preprocessing.normalize(x_right[np.newaxis,:], norm=norm) ) e = np.hstack(e) edges.append([np.squeeze(e),]) return root, edges
def nearest_binary_landmark(R, X): y = np.dot(R, X) y = np.transpose(y) normalize(y, copy=False) n = y.shape[0] c = y.shape[1] idx = np.argsort(y, axis=1) bs = [] ms = [] for i in range(n): b = np.zeros(c) s = 0 max_psi = 0 max_b = b m=1 for k in range(c-1, -1, -1): if y[i][idx[i][k]] <= 0: break b[idx[i][k]] = 1 s = s + y[i][idx[i][k]] psi = s / math.sqrt(c-k) if psi > max_psi: max_psi = psi max_b = np.copy(b) m = float(c-k) bs.append(max_b) ms.append(m) bs = np.array(bs) return bs, np.mean(ms), y
def difference_vectors(X, cluster_predictions, clusters_centers): PC = X.shape[1] K = len(clusters_centers) u_k = {} num_frms = X.shape[0] for frm in range(num_frms): frm_NN_cluster = cluster_predictions[frm] c = clusters_centers[frm_NN_cluster] diff = X[frm] - c if frm_NN_cluster not in u_k: u_k[frm_NN_cluster] = diff else: sum_k = u_k[frm_NN_cluster] sum_k += diff u_k[frm_NN_cluster] = sum_k vlad = u_k[0] vlad = vlad.reshape(1, vlad.shape[0]) for k in range(1, K): K_cluster = u_k[k] K_cluster = K_cluster.reshape(1, K_cluster.shape[0]) # Intra Normalization K_cluster = preprocessing.normalize(K_cluster, norm = 'l2') vlad = np.concatenate((vlad, K_cluster), axis = 1) # L2 Normalization vlad = preprocessing.normalize(vlad, norm = 'l2') return vlad
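# A hedged usage sketch for difference_vectors() above: build a KMeans
# vocabulary and VLAD-encode one set of local descriptors. The descriptor
# count (200) and dimensionality (64) are illustrative.
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
descriptors = rng.rand(200, 64)
kmeans = KMeans(n_clusters=8, n_init=10, random_state=0).fit(descriptors)
assignments = kmeans.predict(descriptors)
vlad = difference_vectors(descriptors, assignments, kmeans.cluster_centers_)
print(vlad.shape)  # (1, 8 * 64)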
def getData_NEC(): reader = JobDBCorpus() data = reader.read_sequence_list(target='TODO') np.seterr(all='ignore') train, test = reader.train_test_data(test_size=0.2) print("Reading chunks...") chunks_train = ChunkSet(dataset=train) chunks_test = ChunkSet(dataset=test) print("Building features...") idf = featNEC.IDFeatures(dataset=train, chunkset=chunks_train) idf.build_features() ############################################################################### print("Standarizing dataset...") X_train, Y_train = getStandart(chunks_train, idf) X_test, Y_test = getStandart(chunks_test, idf) # sparse representation and normalize X_train = sparse.csr_matrix(X_train) X_train = normalize(X_train, copy=False) X_test = sparse.csr_matrix(X_test) X_test = normalize(X_test, copy=False) return X_train, Y_train, X_test, Y_test, chunks_train
def SVM_vary_train(train_images, train_labels, test_images, test_labels, kernel_type, tune1, tune2, tune3, train_amm):
    # reshape the training/testing data into a classifiable form
    train_data_mat = np.reshape(train_images, (train_images.shape[0]*train_images.shape[1], train_images.shape[3]))
    test_data_mat = np.reshape(test_images, (test_images.shape[0]*test_images.shape[1], test_images.shape[3]))
    train_data_mat = 1.0*np.array(np.mat(train_data_mat).transpose())
    test_data_mat = 1.0*np.array(np.mat(test_data_mat).transpose())

    # normalize the data
    train_data_mat = preprocessing.normalize(train_data_mat, norm='l2')
    test_data_mat = preprocessing.normalize(test_data_mat, norm='l2')

    if kernel_type == "linear":
        classif_1vr = svm.SVC(kernel=kernel_type, C=tune1)
    elif kernel_type == "rbf":
        classif_1vr = svm.SVC(kernel=kernel_type, gamma=tune1)
    elif kernel_type == "sigmoid":
        classif_1vr = svm.SVC(kernel=kernel_type, gamma=tune1, coef0=tune2)
    elif kernel_type == "poly":
        classif_1vr = svm.SVC(kernel=kernel_type, gamma=tune1, coef0=tune2, degree=tune3)

    # fit the SVM to the training set
    classif_1vr.fit(train_data_mat[0:train_amm, :], train_labels[0][0:train_amm])
    targets = test_labels[0]

    # make prediction on the test data set
    predict = classif_1vr.predict(test_data_mat)

    # calculate the accuracy
    acc = calc_acc(targets, predict)

    return "kernel=" + str(kernel_type) + ", tune1=" + str(tune1) + ", tune2=" + str(tune2) + ", tune3=" + str(tune3) + ", train_amm=" + str(train_amm) + ", acc: " + str(acc) + "\n"
def classify(data_trn,lbl_trn,data_vld,lbl_vld,data_tst,lbl_tst): data_trn = normalize(data_trn,copy=False) data_vld = normalize(data_vld,copy=False) data_tst = normalize(data_tst,copy=False) # accuracy metric metric_obj = mean_squared_error ''' Train our model to predict labels for the dataset #1 ''' parameters = {'svr__gamma': 1.5, 'svr__probability': False, 'svr__epsilon': 0.4, 'svr__C': 1, 'svr__kernel': 'rbf'} cls = Pipeline([ #('feature_selection',LinearSVC()), ('svr', SVR()) ]) cls.set_params(**parameters) cls.fit(data_trn, lbl_trn) pred_vld = cls.predict(data_vld) pred_tst = cls.predict(data_tst) print ("Score for vld: %.6f" % (metric_obj(lbl_vld, pred_vld),)) print ("Score for tst: %.6f" % (metric_obj(lbl_tst, pred_tst),)) return pred_vld,pred_tst
def get_vector_sp(model, x_data, x_test): tmp_x = x_data[:, x_data.shape[1]-16:x_data.shape[1]-1] tmp_x = tmp_x.reshape(tmp_x.shape[0], tmp_x.shape[1]) tmp_x = tmp_x/tmp_x[:,0].reshape(tmp_x.shape[0],1) #tmp_x[:,1:] = tmp_x[:,1:] / tmp_x[:,0:tmp_x.shape[1]-1] tmp_x = preprocessing.normalize(tmp_x, norm='l2') preds = model.predict_proba(tmp_x.reshape(x_data.shape[0],15,1)) test_x = x_test[:, x_test.shape[1]-16:x_test.shape[1]-1] test_x = test_x.reshape(test_x.shape[0], test_x.shape[1]) test_x = test_x/test_x[0][0] test_x = preprocessing.normalize(test_x, norm='l2') pred_test = model.predict_proba(test_x.reshape(test_x.shape[0],15,1)) x_result = numpy.hstack((x_data.reshape(x_data.shape[0], x_data.shape[1]), preds)) x_result_test = numpy.hstack((x_test.reshape(x_test.shape[0], x_test.shape[1]), pred_test)) test_bdate = x_test[0][1] tmp_list = [] tmp_vec = x_result_test[0][x_result_test.shape[1]-20:] i = 0 for m in x_result: i = i + 1 dist = numpy.sqrt(numpy.sum(numpy.square(m[m.shape[0]-20:]- pred_test[0]))) tmp_list.append((m[1], dist)) sort_list = sorted(tmp_list, key = lambda x:x[1], reverse =False) return sort_list, test_bdate
def PCA_SVM(train_images, train_labels, test_images, test_labels, kernel_type, do_PCA, comps): # reshape the training/testing data into a classifiable form train_data_mat = np.reshape(train_images, (train_images.shape[0]*train_images.shape[1],train_images.shape[3])) test_data_mat = np.reshape(test_images, (test_images.shape[0]*test_images.shape[1],test_images.shape[3])) train_data_mat = np.array(np.mat(train_data_mat).transpose()) test_data_mat = np.array(np.mat(test_data_mat).transpose()) # normalize the data train_data_mat = preprocessing.normalize(train_data_mat, norm='l2') test_data_mat = preprocessing.normalize(test_data_mat, norm='l2') # do PCA if necessary if do_PCA: # learn the covariance pca = PCA(n_components=comps, whiten=True) pca.fit(train_data_mat) # use pca to reduce dimensionality of training data train_data_mat = pca.transform(train_data_mat) test_data_mat = pca.transform(test_data_mat) # fit svm to pca-reduced classif_1vr = svm.SVC(kernel=kernel_type) classif_1vr.fit(train_data_mat, train_labels[0]) targets = test_labels[0] # make prediction on the test data set predict = classif_1vr.predict(test_data_mat) # calculate the accuracy acc = calc_acc(targets, predict) return "PCA=" + str(do_PCA) + ", num_comps= " + str(comps) + ", kernel=" + str(kernel_type) + ", acc: " + str(acc) + "\n"
def rwr(transition,PT,r=0.7): """ :param transition: Get the spare Transition matrix :param PT: Intialization Vector :param r: restart probability :return: Numpy Matrix of predicted scores """ #Stop criteria stop = 1e-07 PO = PT Tr = transition while True: PX = (1-r)* Tr.T * PT + (r * PO) delta = spnorm(PX) - spnorm(PT) if delta < stop : break PT = PX #fMat = normalize(PT, norm='l1', axis=0) OM = PT[0:5080] OM = normalize(OM, norm='l1', axis=0) PP = PT[5080:15078] PP = normalize(PP, norm='l1', axis=0) CP = PT[15078:16904] CP = normalize(CP, norm='l1', axis=0) PAT = PT[16904:19435] PAT = normalize(PAT, norm='l1', axis=0) P = np.concatenate((OM,PP,CP,PAT),axis=0) return P
def getData_NEC(test=0.2, val=0.2, mode='by_sent',target='TODO'): print("Reading data...") train,test,val = getData(test=test, val=val, mode=mode,target=target) print("Reading chunks...") chunks_train = ChunkSet(dataset=train) chunks_test = ChunkSet(dataset=test) print("Building features...") idf = featNEC.IDFeatures(dataset = train, chunkset = chunks_train) idf.build_features() ############################################################################### print("Standarizing dataset...") X_train,Y_train = getStandart(chunks_train, idf) X_test,Y_test = getStandart(chunks_test, idf) # sparse representation and normalize X_train = sparse.csr_matrix(X_train) X_train = normalize(X_train, copy = False) X_test = sparse.csr_matrix(X_test) X_test = normalize(X_test, copy = False) return X_train,Y_train,X_test,Y_test, chunks_train
def classify(df):
    print(type(df))
    df = preprocessing.normalize(df.iloc[:, [1, 2, 3]], norm="l2")
    print(df)
    print(type(df))

    # load dataset
    dataset_train = pd.read_csv("training.csv", delimiter=",")
    X = dataset_train.iloc[:, [0, 1, 2]]
    X = X.values
    X = preprocessing.normalize(X, norm="l2")
    print(X)
    print(type(X))
    #X_normalised = preprocessing.normalize(X,norm="l2")

    # ground truth
    y = dataset_train.iloc[:, [3]]
    y = y.values
    y = y.ravel()
    print(y)
    print(type(y))
    #y = y.reshape((len(y),))

    n_neighbors = 3
    # create an instance of Neighbours Classifier and fit the data.
    knn = neighbors.KNeighborsClassifier(n_neighbors)
    knn.fit(X, y)
    #print df[:,0:4]
    return knn.predict(df[:, 0:4])
def frame_callback(vis, frame_idx, seq_info, viewer,angle): print("Processing frame %05d" % frame_idx) # Load image and generate detections. detections = create_detections( seq_info["detections"], frame_idx, min_detection_height) detections = [d for d in detections if d.confidence >= min_confidence] # Run non-maxima suppression. boxes = np.array([d.tlwh for d in detections]) scores = np.array([d.confidence for d in detections]) indices = preprocessing.non_max_suppression( boxes, nms_max_overlap, scores) detections = [detections[i] for i in indices] image = cv2.imread( seq_info["image_filenames"][frame_idx], cv2.IMREAD_COLOR) # Update tracker. tracker[angle-1].predict() tracker[angle-1].update(detections, image, myexactor, dic_feature, frame_idx, global_next_id, angle) id_dic = {} for index, track in enumerate(tracker[angle-1].tracks): str_id = str(track.track_id) if str_id in id_dic.keys(): track.track_id = global_next_id[0] global_next_id[0] += 1 else: id_dic[str_id] = (index, track.state) # Update visualization. if display: vis.set_image(image.copy(),viewer,angle) #print("deep_sort angle: "+str(angle)) vis.draw_detections(detections,viewer,angle) vis.draw_trackers(tracker[angle-1].tracks,viewer,angle) # Store results. for track in tracker[angle-1].tracks: if not track.is_confirmed() or track.time_since_update > 1: continue bbox = track.to_tlwh() print(angle) if angle == 1: print("angle1") results1.append([ frame_idx, track.track_id, bbox[0], bbox[1], bbox[2], bbox[3], track.sex, track.person_age]) if angle == 2: print("angle2") results2.append([ frame_idx, track.track_id, bbox[0], bbox[1], bbox[2], bbox[3], track.sex, track.person_age]) if angle == 3: print("angle3") results3.append([ frame_idx, track.track_id, bbox[0], bbox[1], bbox[2], bbox[3], track.sex, track.person_age]) ### save gallery if (frame_idx) % 4 == 0: for i in range(4): if bbox[i] < 0 : bbox[i] = 0 img = image[int(bbox[1]):int(bbox[1] + bbox[3]), int(bbox[0]):int(bbox[0] + bbox[2])] img = cv2.resize(img, (128, 256), interpolation=cv2.INTER_CUBIC) temp = img.copy() transform_test = T.Compose([ T.ToTensor(), T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), ]) img = transform_test(img) img = torch.unsqueeze(img, 0) ##sex age forward track_id = track.track_id img = img.cuda() sex_output, age_output = model_sex_age(img) pred1 = sex_output.data.max(1)[1] pred2 = age_output.data.max(1)[1] age = age_dict[str(int(pred2))] sex = sex_dict[str(int(pred1))] track.person_age = age track.sex = sex ##end f1 = myexactor(img) a1 = normalize(pool2d(f1[0], type='max')) if str(track_id) not in dic_feature.keys(): dic_feature[str(track_id)] = [] dic_feature[str(track_id)].append((a1, angle, (bbox[0]+bbox[2]*0.5, bbox[1]+bbox[3]))) else: if len(dic_feature[str(track_id)]) > 100: del(dic_feature[str(track_id)][0]) dic_feature[str(track_id)].append((a1, angle, (bbox[0]+bbox[2]*0.5, bbox[1]+bbox[3])))
# In[32]: from sklearn import preprocessing sim_user = [] #add the given user vector to a frequency list of all users fVectorOfGivenUser_listForm = fVectorOfGivenUser.tolist() sim_user.append(fVectorOfGivenUser_listForm) for first_user in most_sim_users: sim_user.append(freqList[userIds.index(first_user[1])]) #normalise the vectors using l2 norm normalizedSimilarUsers = preprocessing.normalize(sim_user, norm='l2') # # contributing term calculation # In[33]: top3Terms = [] print("The top 3 contributing terms for each match are: \n") for each_k in range(1, int(k) + 1): print("For match " + str(each_k) + ":") diffVector = [] for j in range(len(vocab)): if (normalizedSimilarUsers[0][j] != 0 and normalizedSimilarUsers[each_k][j] != 0): diffVector.append([
feature_file = sys.argv[1] output_file = sys.argv[3] cluster_num = int(sys.argv[2]) if feature_file.split('.')[1] == 'surf': feature_name = 'surf' print('SURF features') elif feature_file.split('.')[1] == 'cnn': feature_name = 'cnn' print('CNN features') else: raise (ValueError('Invalid data')) # Read data X = np.loadtxt(feature_file, delimiter=';') #X = np.genfromtxt(mfcc_csv_file, delimiter=";") print(X.shape) X = normalize(X, axis=1) # Model fit to data #kmeans = KMeans(n_clusters=cluster_num) kmeans = MiniBatchKMeans( n_clusters=cluster_num, batch_size=50, random_state=0, init_size=500) #Convential KMeans is too slow #Potential Memory Error kmeans.fit(X) # Save model pickle.dump(kmeans, open(output_file, 'wb')) print("K-means trained successfully!")
loader=np.load(filename) data=loader['data'] indptr=loader['indptr'] indices=loader['indices'] shape=loader['shape'] return csr_matrix((data,indices,indptr),shape) tf_idf=csr_load(filename) print tf_idf[0,:] tf_idf.shape fl=open('E:/Clustering and Retreival/Week 6/people_wiki_map_index_to_word.json') map_index_to_word=json.load(fl) fl.close() tf_idf=normalize(tf_idf) type(wiki.name) def bipartition(cluster,maxiter=400,num_runs=4,seed=None): dataframe=cluster['dataframe'] data_matrix=cluster['data_matrix'] km=KMeans(n_clusters=2,n_init=num_runs,max_iter=maxiter,random_state=seed) km.fit(data_matrix) centroids,cluster_ass=km.cluster_centers_,km.labels_ data_matrix_left_child,data_matrix_right_child=data_matrix[cluster_ass==0,:],data_matrix[cluster_ass==1,:] cluster_ass=pd.Series(cluster_ass) data_frame_left,data_frame_right=dataframe.loc[cluster_ass==0,:],dataframe.loc[cluster_ass==1,:] cluster_left={'dataframe':data_frame_left, 'data_matrix':data_matrix_left_child, 'centroids':centroids[0]}
def _spherical_kmeans_single_lloyd(X, n_clusters, max_iter=300, init='k-means++', verbose=False, x_squared_norms=None, sample_weights=None, random_state=None, tol=1e-4, precompute_distances=True): ''' Modified from sklearn.cluster.k_means_.k_means_single_lloyd. ''' random_state = check_random_state(random_state) best_labels, best_inertia, best_centers = None, None, None # init centers = _init_centroids(X, n_clusters, init, random_state=random_state, x_squared_norms=x_squared_norms) if verbose: print("Initialization complete") # Allocate memory to store the distances for each sample to its # closer center for reallocation in case of ties distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype) # iterations for i in range(max_iter): centers_old = centers.copy() # labels assignment # TODO: _labels_inertia should be done with cosine distance # since ||a - b|| = 2(1 - cos(a,b)) when a,b are unit normalized # this doesn't really matter. labels, inertia = \ _labels_inertia(X, sample_weights, x_squared_norms, centers, precompute_distances=precompute_distances, distances=distances) # computation of the means if sp.issparse(X): centers = _k_means._centers_sparse(X, sample_weights, labels, n_clusters, distances) else: centers = _k_means._centers_dense(X, sample_weights, labels, n_clusters, distances) # l2-normalize centers (this is the main contibution here) centers = normalize(centers) if verbose: print("Iteration %2d, inertia %.3f" % (i, inertia)) if best_inertia is None or inertia < best_inertia: best_labels = labels.copy() best_centers = centers.copy() best_inertia = inertia center_shift_total = squared_norm(centers_old - centers) if center_shift_total <= tol: if verbose: print("Converged at iteration %d: " "center shift %e within tolerance %e" % (i, center_shift_total, tol)) break if center_shift_total > 0: # rerun E-step in case of non-convergence so that predicted labels # match cluster centers best_labels, best_inertia = \ _labels_inertia(X, x_squared_norms, best_centers, precompute_distances=precompute_distances, distances=distances) return best_labels, best_inertia, best_centers, i + 1
X[neg_ind] = 0. spec_err[neg_ind] = 0. #%% Set zero fluxes to NaN X_nonan = X.copy() zero_ind = X == 0. X[zero_ind] = np.NaN #%% Set all zero and negative flux errors to NaN zero_err_ind = spec_err <= 0. spec_err[zero_err_ind] = np.NaN #%% Normalise spectrum X_normal, norm = preprocessing.normalize(X_nonan, return_norm=True) X_norm_zeros = np.copy(X_normal) #%% Plot an example spectrum in the data plt.figure() plt.plot(wavelengths, X_normal[4]) plt.show() #%% Set all zero normalised fluxes to nan zero_norm_ind = X_normal == 0. X_normal[zero_norm_ind] = np.NaN #%% Transform errors due to corresponding normalisation spec_err_T = np.transpose(spec_err) spec_err_norm_T = np.divide(spec_err_T, norm) spec_err_norm = np.transpose(spec_err_norm_T)
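# A hedged sanity check for the return_norm usage above: dividing each row of
# the original matrix by its returned norm reproduces the normalized rows.
# Toy data only.
import numpy as np
from sklearn import preprocessing

toy = np.array([[3.0, 4.0], [1.0, 1.0]])
toy_normal, toy_norm = preprocessing.normalize(toy, return_norm=True)
print(np.allclose(toy_normal, toy / toy_norm[:, np.newaxis]))  # True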
    return rmse(y_pred, y_actual) / (max(y_actual) - min(y_actual))


with open("1_train.txt", "r") as file:
    for line in file:
        newline = line.rstrip('\n')
        X_train.append([int(i) for i in newline.split(' ')])
        Y_train += [X_train[len(X_train) - 1].pop()]

with open("1_test.txt", "r") as file:
    for line in file:
        newline = line.rstrip('\n')
        X_test.append([int(i) for i in newline.split(' ')])
        Y_test += [X_test[len(X_test) - 1].pop()]

X_train = preprocessing.normalize(X_train)
X_test = preprocessing.normalize(X_test)
Y_train, Y_test = np.array(Y_train), np.array(Y_test)

# np.hstack returns a new array, so the bias column must be assigned back.
X_train = np.hstack((X_train, np.ones((X_train.shape[0], 1))))
X_test = np.hstack((X_test, np.ones((X_test.shape[0], 1))))

# modelLS = Ridge(alpha=0.5, solver='svd')
# modelLS.fit(X_train, Y_train)
# y = modelLS.predict(X_train)
# error1 = nrmse(y, Y_train)
# print(error1)
# y_pred = modelLS.predict(X_test)
# error2 = nrmse(y_pred, Y_test)
# print(error2)

# for i in range(100, 300, 50):
#     modelGD = SGDRegressor(shuffle=True, max_iter=i, penalty="elasticnet", alpha=0.01, learning_rate="invscaling", eta0=0.001, l1_ratio=0.6, power_t=0.3)
## TSNE from sklearn.manifold import TSNE ts = TSNE(n_components=2).fit_transform(data1) plt.scatter(ts[:, 0], ts[:, 1], c=kmeans_model.fit_predict(data1)) plt.show() ## percent labels = kmeans.labels_ percent_clas = [] for i in range(kmeans.n_clusters): percent_clas.append(sum(labels == i) / len(labels)) print(percent_clas) ## Hierachical Clustering from sklearn import preprocessing data1 = preprocessing.normalize(data1) from sklearn.cluster import AgglomerativeClustering import time start = time.time() hpre = AgglomerativeClustering(n_clusters=3).fit_predict(data1) end = time.time() print(end - start) hfit = AgglomerativeClustering(n_clusters=3).fit(data1[0:1000, :]) from sklearn.manifold import TSNE ts = TSNE(n_components=2).fit_transform(data1) plt.scatter(ts[:, 0], ts[:, 1], c=hpre) plt.show() from sklearn.manifold import MDS
116117885893.23831, 92005035565.86392, 48313784253.98602, 4578303196.450315, -28703495471.202255, -57784000679.98717, -78326916097.10924, -91205023418.17476 ] final = np.convolve(signal,a, 'same') print(final.shape) final = normalize([final],axis=1) print(final.shape) ax1.plot(signal) ax2.plot(final) print(np.max(final)) sp = np.fft.fft(final) freq = np.fft.fftfreq(final.shape[-1]) #ax2.plot(freq, sp.real) #wavfile.write('final.wav',44100,final)
def extract_features(): directories = os.listdir(path="dataset_splitted") dataset = [] kernels = [] for theta in range(4): theta = theta / 4. * np.pi for sigma in (1, 3): for frequency in (0.05, 0.25): kernel = np.real( gabor_kernel(frequency, theta=theta, sigma_x=sigma, sigma_y=sigma)) kernels.append(kernel) load_full_features = True if load_full_features == False: for type_name in ('/train/', '/validation/', '/test/'): i = 1 for dir in directories: for file in os.listdir("dataset_splitted/" + dir + type_name): descriptors = [] img = cv2.imread( "dataset_splitted/" + dir + type_name + file, cv2.IMREAD_GRAYSCALE) img_color = cv2.imread("dataset_splitted/" + dir + type_name + file) feature_extractor = LocalBinaryPatterns(128, 1) descriptor_template = feature_extractor.describe(img) descriptor_template_norm = preprocessing.normalize( np.array(descriptor_template).reshape(1, -1))[0] descriptors.append(descriptor_template_norm) feature_extractor_2 = Gradient_histogram(128) descriptor_template_2 = feature_extractor_2.describe(img) descriptor_template_2_norm = preprocessing.normalize( np.array(descriptor_template_2).reshape(1, -1))[0] descriptors.append(descriptor_template_2_norm) descriptor_template_3 = compute_feats(img, kernels) descriptor_template_3_norm = preprocessing.normalize( np.array(descriptor_template_3).reshape(1, -1))[0] descriptors.append(descriptor_template_3_norm) feature_extractor_4 = LocalBinaryPatterns(128, 2) descriptor_template_4 = feature_extractor_4.describe(img) descriptor_template_4_norm = preprocessing.normalize( np.array(descriptor_template_4).reshape(1, -1))[0] descriptors.append(descriptor_template_4_norm) for method in ('RGB', 'HSV', 'LAB'): for hist in hist_describe(img_color, method): descriptor_template_5_norm = preprocessing.normalize( np.array(hist).reshape(1, -1))[0] descriptors.append(descriptor_template_5_norm) d_filename = file descriptors.append([d_filename]) descriptors.append([i]) flat_descriptors = [] for sublist in descriptors: for item in sublist: flat_descriptors.append(item) dataset.append(flat_descriptors) print(i) i += 1 dataset_arr = np.array([np.array(xi) for xi in dataset], dtype=object) x = dataset_arr[:, :-2] y = dataset_arr[:, -1] # print(np.unique(y, return_counts=True)) names = dataset_arr[:, -2] np.savetxt('extracted_x.csv', x, fmt='%s', delimiter=',') np.savetxt('extracted_y.csv', y, fmt='%s', delimiter=',') np.savetxt('extracted_names.csv', names, fmt='%s', delimiter=',') else: x = np.loadtxt('extracted_x.csv', dtype='float', delimiter=',') y = np.loadtxt('extracted_y.csv', dtype='str', delimiter=',') names = np.loadtxt('extracted_names.csv', dtype='str', delimiter=',') # dtype = 'float' # df_X = pd.DataFrame.from_records(X) # Shape (1600, 546) # y = dataset_arr[:, -1] # x = dataset_arr[:, :-2] # names = dataset_arr[:, -2] print(x.shape) # pca = PCA(n_components=600).fit(x) # print(pca.explained_variance_ratio_) # x = pca.transform(x) # print(x) x_train = x[0:800, :] x_validation = x[800:1120, :] x_test = x[1120:1600, :] print(x_train.shape) y_train = y[0:800] y_validation = y[800:1120] y_test = y[1120:1600] classify = True if classify == True: predictions_val_arr = [] predictions_test_arr = [] # test и validation перепутаны from sklearn.neighbors import KNeighborsClassifier clf1 = KNeighborsClassifier(n_neighbors=13) from sklearn.tree import DecisionTreeClassifier clf2 = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=1, min_samples_split=2, max_leaf_nodes=150, random_state=0) from sklearn.ensemble import 
RandomForestClassifier clf3 = RandomForestClassifier(criterion='entropy', max_depth=20, min_samples_leaf=2, min_samples_split=4, random_state=0) from sklearn.ensemble import GradientBoostingClassifier clf4 = GradientBoostingClassifier(random_state=0) from sklearn.neural_network import MLPClassifier clf5 = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(128, ), random_state=1) from sklearn.ensemble import AdaBoostClassifier clf6 = AdaBoostClassifier(n_estimators=2000, random_state=0) classifiers = (clf1, clf2, clf3, clf4, clf5, clf6) # for clf in classifiers: # # clf.fit(x_train, y_train) # # predictions_val = clf.predict(x_test) # predictions_val_arr.append(predictions_val) # # predictions_test = clf.predict(x_validation) # predictions_test_arr.append(predictions_test) # # predictions_train = clf.predict(x_train) # # print(classification_report(y_train, predictions_train)) # print(classification_report(y_test, predictions_val)) from sklearn.ensemble import VotingClassifier voting_clf = VotingClassifier(estimators=[('1', clf1), ('2', clf2), ('3', clf3), ('4', clf4), ('5', clf5), ('6', clf6)], voting='hard') print("Created voting classifier") voting_clf.fit(x_train, y_train) predictions_val = voting_clf.predict(x_test) predictions_val_arr.append(predictions_val) predictions_test = voting_clf.predict(x_validation) predictions_test_arr.append(predictions_test) predictions_train = voting_clf.predict(x_train) # print(classification_report(y_train, predictions_train)) print(classification_report(y_test, predictions_val)) print(confusion_matrix(y_test, predictions_val)) print(predictions_val_arr) ml_preprocessed_val = [] ml_preprocessed_test = [] i = 0 for j in predictions_val_arr: for a in range(16): ml_preprocessed_val.append([]) for prediction in j: for a in range(16): ml_preprocessed_val[i * 16 + a].append( int(int(prediction) == a)) i += 1 i = 0 for j in predictions_test_arr: for a in range(16): ml_preprocessed_test.append([]) for prediction in j: for a in range(16): ml_preprocessed_test[i * 16 + a].append( int(int(prediction) == a)) i += 1 ml_preprocessed_val = np.array(ml_preprocessed_val) ml_preprocessed_test = np.array(ml_preprocessed_test) # print(ml_preprocessed_val) # print(ml_preprocessed_val.shape) # # print(ml_preprocessed_test) # print(ml_preprocessed_test.shape) np.savetxt('ml_preprocessed_val.csv', ml_preprocessed_val, fmt='%s', delimiter=',') np.savetxt('ml_preprocessed_test.csv', ml_preprocessed_test, fmt='%s', delimiter=',') else: ml_preprocessed_val = np.loadtxt('ml_preprocessed_val.csv', dtype='float', delimiter=',') ml_preprocessed_test = np.loadtxt('ml_preprocessed_test.csv', dtype='float', delimiter=',')
import string import matplotlib.pyplot as plt plt.style.use('fivethirtyeight') print(os.getcwd()) os.chdir(".\\data") df = pd.read_json("tweeps.json", "r", lines=True) data = df["text"] tf_idf_vectorizor = TfidfVectorizer( stop_words='english', #tokenizer = tokenize_and_stem, max_features=20000) tf_idf = tf_idf_vectorizor.fit_transform(data) tf_idf_norm = normalize(tf_idf) tf_idf_array = tf_idf_norm.toarray() pd.DataFrame(tf_idf_array, columns=tf_idf_vectorizor.get_feature_names()).head() class Kmeans: """ K Means Clustering Parameters ----------- k: int , number of clusters seed: int, will be randomly set if None
def fisher_vector(xx, gmm, normalization=True):
    """
    Computes the Fisher vector on a set of descriptors.
    code from : https://gist.github.com/danoneata/9927923

    Parameters
    ----------
    xx: array_like, shape (N, D) or (D, )
        The set of descriptors

    gmm: instance of sklearn mixture.GMM object
        Gaussian mixture model of the descriptors.

    Returns
    -------
    fv: array_like, shape (K + 2 * D * K, )
        Fisher vector (derivatives with respect to the mixing weights, means
        and variances) of the given descriptors.

    Reference
    ---------
    Sanchez, J., Perronnin, F., Mensink, T., & Verbeek, J. (2013).
    Image classification with the fisher vector: Theory and practice.
    International journal of computer vision, 105(3), 222-245.
    https://hal.inria.fr/hal-00830491/file/journal.pdf
    """
    xx = np.atleast_2d(xx)
    n_points = xx.shape[0]
    D = gmm.means_.shape[1]
    tiled_weights = np.tile(np.expand_dims(gmm.weights_, axis=-1), [1, D])

    #start = time.time()
    # Compute posterior probabilities.
    Q = gmm.predict_proba(xx)  # NxK
    #mid = time.time()
    #print("Computing the probabilities took ", str(mid-start))

    # Compute derivatives
    # Compute the sufficient statistics of descriptors.
    s0 = np.sum(Q, 0)[:, np.newaxis] / n_points
    s1 = np.dot(Q.T, xx) / n_points
    s2 = np.dot(Q.T, xx**2) / n_points

    d_pi = (s0.squeeze() - n_points * gmm.weights_) / np.sqrt(gmm.weights_)
    d_mu = (s1 - gmm.means_ * s0) / np.sqrt(tiled_weights * gmm.covariances_)
    d_sigma = (+s2 - 2 * s1 * gmm.means_ + s0 * gmm.means_**2 - s0 * gmm.covariances_) / (np.sqrt(2 * tiled_weights) * gmm.covariances_)

    # Power normalization
    alpha = 0.5
    d_pi = np.sign(d_pi) * np.power(np.absolute(d_pi), alpha)
    d_mu = np.sign(d_mu) * np.power(np.absolute(d_mu), alpha)
    d_sigma = np.sign(d_sigma) * np.power(np.absolute(d_sigma), alpha)

    if normalization == True:
        d_pi = normalize(d_pi[:, np.newaxis], axis=0).ravel()
        d_mu = normalize(d_mu, axis=0)
        d_sigma = normalize(d_sigma, axis=0)

    # Merge derivatives into a vector.
    #print("computing the derivatives took ", str(time.time()-mid))
    return np.hstack((d_pi, d_mu.flatten(), d_sigma.flatten()))
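# A hedged usage sketch for fisher_vector() above, assuming gmm is a
# diagonal-covariance sklearn GaussianMixture (which matches the elementwise
# use of gmm.covariances_). Descriptor sizes are illustrative.
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(0)
descriptors = rng.rand(500, 64)                       # N=500, D=64
gmm = GaussianMixture(n_components=5, covariance_type='diag',
                      random_state=0).fit(descriptors)
fv = fisher_vector(descriptors, gmm)
print(fv.shape)                                       # (K + 2*K*D,) = (645,)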
from sklearn.decomposition import PCA as PCA import scipy.io import numpy as np import inspect import sys sys.path.append('/home/ov/python/py_utils') import utils patient_nr = 10 scid = 'nsc' pwd = '/home/ov/preprocessed_waves/extracted/' pwd_means = pwd + 'nfpat' + str(patient_nr) + '.icp.' + scid + '_means' cluster_means = np.loadtxt(pwd_means) print(cluster_means.shape) # Normalization: from sklearn.preprocessing import normalize cluster_means = normalize(cluster_means, norm='max') fig = utils.plot_means_a4(cluster_means, title='PAT {} {} (pa)'.format( patient_nr, scid)) # pa := postapocalyptic
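# A hedged note on the norm='max' option used above: each row is divided by
# its maximum absolute value, so every row's peak becomes 1. Toy data only.
import numpy as np
from sklearn.preprocessing import normalize

toy = np.array([[1.0, 2.0, 4.0], [3.0, -6.0, 1.5]])
print(normalize(toy, norm='max'))
# [[ 0.25  0.5   1.  ]
#  [ 0.5  -1.    0.25]]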
drop_col_names = [] vifs = list(vif_df.VIF) predictors = list(vif_df.Ind_Var) for i in range(len(predictors)): if vifs[i] >= 10: drop_col_names.append(predictors[i]) df = df.drop(drop_col_names, 1) # this is the data frame with high VIF variables removed X = df.drop('G3', 1) # this is the design matrix y = list(df.G3) # this is the discrete response vector y_new = response_conv(y) # this is the multinomial response vector X_scale = preprocessing.scale(X) X_norm = preprocessing.normalize(X) ######################################################################################################################## X1_train, X1_test, y1_train, y1_test = train_test_split(X, y_new, test_size=0.33, random_state=42) X2_train, X2_test, y2_train, y2_test = train_test_split(X_scale, y_new, test_size=0.33, random_state=42) X3_train, X3_test, y3_train, y3_test = train_test_split(X_norm, y_new, test_size=0.33, random_state=42) log_reg1 = LogisticRegressionCV(cv=10, scoring='neg_log_loss',
def net(self, x, classify=False, miecorr=False, predict=False, train=False, show=False): import keras from keras.layers import Input, Dense from keras.optimizers import RMSprop, Adam, SGD from keras.models import model_from_json from keras.callbacks import ModelCheckpoint #import tensorflow as tf #import tensorflow.compat.v1 as tf #tf.disable_v2_behavior() """ TODO: UPDATE TO TENSORFLOW VERSION 2 """ #################################################################################################### # DETERMINE WICH MODEL PARAMETERS YOUN WNAT TO USE # CLASSIFY == TRUE GIVES THE MODEL TRAINED TO CLASSIFY ALL CELLUAR COMPONENTS BASED ON SPECTRA # BETWEEN 950-1800 WVN # # MIECORR == TRUE GIVES THE CORRESPONDING NEURAL NETWORK FOR PERFORMING EFFICIENT RMIE-CORRECTION # ON FFPE-BASED TISSUE SPECTRA # #################################################################################################### if classify == True: if x.shape[1] != 450: raise ValueError( 'This is a classification problem: Your spectral data needs 450 datapoints in WVN range of 950-1800 1/cm' ) json_file = open( os.path.join( str(MODELPATH) + '/model_weights_classification.json'), 'r') loaded_model_json = json_file.read() loaded_model = model_from_json(loaded_model_json) if show == True: print(loaded_model.summary()) loaded_model.load_weights( os.path.join( str(MODELPATH) + "/model_weights_classification.best.hdf5")) print("Loaded model from disk") model = loaded_model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy']) from sklearn.preprocessing import normalize trX = normalize(x, axis=1, norm='l2') return loaded_model.predict(trX), load_model if miecorr == True: if x.shape[1] != 909: raise ValueError( 'This is a regression problem: Your spectral data needs 909 datapoints in WVN range of 950-2300 1/cm' ) #################################################################################################### # THIS MODEL NEEDS THE FIRST 909 WVN. RANGE FROM 950-2300 WVN 1/cm # # # ####################################################################################################x json_file = open( os.path.join( str(MODELPATH) + '/model_weights_regression.json'), 'r') loaded_model_json = json_file.read() loaded_model = model_from_json(loaded_model_json) if show == True: print(loaded_model.summary()) loaded_model.load_weights( os.path.join( str(MODELPATH) + "/model_weights_regression.best.hdf5")) print("Loaded model from disk") loaded_model.compile(loss='mean_squared_error', optimizer='adam') from sklearn.preprocessing import normalize trX = normalize(x, axis=1, norm='l2') return loaded_model.predict(trX), load_model
counts = np.zeros((len(words), len(words))) file2 = nltk.data.path[0] + '/corpora/brown/brown_100.txt' with open(file2) as g: # starting_word = '<s>' for line in g: wdsEachLine = line[:-1].lower().split() # wdsEachLine = [starting_word] + wdsEachLine wdsEachLine.append('</s>') for idx, wd in enumerate(wdsEachLine): if idx != 0: counts[words[wdsEachLine[idx]]][words[wdsEachLine[idx - 1]]] += 1 counts += 0.1 from sklearn.preprocessing import normalize probs = normalize(counts, norm='l1', axis=0) # Write them into a file target = open('smooth_probs.txt', 'w') target.write('p(the|all) = ' + str(probs[words['the']][words['all']]) + '\n') target.write('p(jury|the) = ' + str(probs[words['jury']][words['the']]) + '\n') target.write('p(campaign|the) = ' + str(probs[words['campaign']][words['the']]) + '\n') target.write('p(calls|anonymous) = ' + str(probs[words['calls']][words['anonymous']]) + '\n') target.close() file3 = nltk.data.path[0] + '/corpora/brown/toy_corpus.txt' target1 = open('smoothed_eval.txt', 'w') with open(file3) as h: for line in h: wdsInSentence = line[:-1].lower().split() # wdsInSentence = ['<s>'] + wdsInSentence
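# A hedged sanity check for the column normalization above: with norm='l1' and
# axis=0, every column of probs sums to 1, i.e. the probabilities of the
# current word given a fixed previous word form a distribution. Toy counts only.
import numpy as np
from sklearn.preprocessing import normalize

toy_counts = np.array([[2.0, 0.0], [1.0, 3.0]]) + 0.1  # rows: current word, cols: previous word
toy_probs = normalize(toy_counts, norm='l1', axis=0)
print(toy_probs.sum(axis=0))  # [1. 1.]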
#Visualising the distributions of the different variables sns.pairplot(df_vpp_sd) # From the pair plot above, we can clearly see that many of the variables have either positively or negatively skewed distributions. There also appears to be many outliers in each distribution and the different parameters don't share a common scale. # # To remedy this and also to ensure sound clustering analysis, we are going to **standardise** the dataset. We may also perform some principal component analysis on the data and to allow for this, the data will need to be standardised anyway. # #### Standardising the features # In[39]: #Scaling the data array_vpp_scaled = StandardScaler().fit_transform(df_vpp_sd) #Normalizing the data array_vpp_norm = normalize(array_vpp_scaled) #Converting the standardised array back to a DataFrame df_vpp_sd = pd.DataFrame(array_vpp_norm, columns=df_vpp_sd.columns) # In[40]: df_vpp_sd.describe() # #### Add the group feature back in # Since the data is now fully processed and ready for clustering, we can add the *'group'* column back in as it will be needed later on. # In[41]: df_vpp_sd = pd.concat([df_vpp.iloc[:, 0], df_vpp_sd], axis=1)
def transfer(self, x, y, batch, train_epochs, add_l=[], classify=False,
             miecorr=False, trainable=False):
    """
    ALL PARTS OF THE TRANSFER-LEARNING NETWORKS ON FTIR SPECTROSCOPIC DATA
    """
    import keras
    from keras.models import Model
    from keras.optimizers import RMSprop, Adam, SGD
    from keras.models import model_from_json
    from keras.callbacks import ModelCheckpoint
    from keras.models import Sequential
    from datetime import datetime
    from sklearn.preprocessing import normalize

    trX = normalize(x, axis=1, norm='l2')

    def onehot(y):
        import keras
        from keras.utils import np_utils
        c = np.max(y) + 1
        y1hot = np_utils.to_categorical(y, num_classes=c)
        return (y1hot)

    def add_layer():
        from keras.utils import np_utils
        from keras.layers import Input, Dense
        from keras.models import Model
        from keras import models

        yoh = onehot(y)
        sm = int(yoh.shape[1])
        print("training on", sm, "classes")

        json_file = open(
            os.path.join(str(MODELPATH) + '/model_weights_classification.json'), 'r')
        loaded_model_json = json_file.read()
        loaded_model = model_from_json(loaded_model_json)
        loaded_model.load_weights(
            os.path.join(str(MODELPATH) + "/model_weights_classification.best.hdf5"))

        if trainable == False:
            for layer in loaded_model.layers:
                layer.trainable = False
        else:
            for layer in loaded_model.layers:
                layer.trainable = True

        if not add_l:
            preds = Dense(sm, name='newlast', activation='softmax')(
                loaded_model.layers[-3].output)
            model = Model(inputs=loaded_model.input, outputs=preds)
            model.compile(loss='categorical_crossentropy',
                          optimizer='rmsprop',
                          metrics=['accuracy'])
            history = model.fit(trX, yoh, batch_size=batch, epochs=train_epochs)
            print(model.summary())

        if add_l:
            def add_2_model(add_l):
                base = Model(inputs=loaded_model.input,
                             outputs=loaded_model.layers[-3].output)
                model = Sequential()
                model.add(base)
                model.add(Dense(add_l[0], input_dim=450, activation='relu'))
                for layer_size in add_l[1:]:
                    model.add(Dense(layer_size, activation='relu'))
                model.add(Dense(sm, activation='softmax'))
                return model

            model = add_2_model(add_l)
            model.compile(loss='categorical_crossentropy',
                          optimizer='rmsprop',
                          metrics=['accuracy'])
            history = model.fit(trX, yoh, batch_size=batch, epochs=train_epochs)
            print(model.summary())

        dtstr = datetime.now().strftime("%d-%m-%Y_%I-%M-%S_%p")
        model_json = model.to_json()
        with open("model_ptMLP_class_" + dtstr + ".json", "w") as json_file:
            json_file.write(model_json)
        model.save_weights("model_model_ptMLP_class_" + dtstr + ".h5")
        print("Saved model to disk to", "model_model_ptMLP_class_" + dtstr + ".json")
        print("and weights to")
        print("Saved model to disk to", "model_model_ptMLP_class_" + dtstr + ".h5")

        ########################### PLOTTING ##########################
        history_dict = history.history
        history_dict.keys()
        a = np.array(history_dict['acc'])
        print(a.shape)
        l = np.array(history_dict['loss'])
        e = range(1, len(a) + 1)
        plt.plot(e, a, 'bo', color='red', label='Acc Training')
        plt.plot(e, l, 'b', label='Loss Training')
        plt.xlabel('Epochs')
        plt.legend()
        plt.savefig('model.pdf')
        return (model, history_dict)

    def simple_val_of_data(x, y):
        from sklearn.model_selection import train_test_split
        from random import randrange
        from sklearn.preprocessing import normalize

        trX = normalize(x, axis=1, norm='l2')
        seed = randrange(999)
        print('used random seed was', seed)
        x_train, x_test, y_train, y_test = train_test_split(
            trX, y, test_size=0.4, random_state=seed)
        return x_train, x_test, y_train, y_test

    def train_layer():
        from keras.utils import np_utils
        from keras.layers import Input, Dense
        from keras.models import Model
        from keras import models

        sm = int(y.shape[1])

        json_filer = open(
            os.path.join(str(MODELPATH) + '/model_weights_regression.json'), 'r')
        loaded_model_jsonr = json_filer.read()
        loaded_modelr = model_from_json(loaded_model_jsonr)
        loaded_modelr.load_weights(
            os.path.join(str(MODELPATH) + "/model_weights_regression.best.hdf5"))

        if trainable == False:
            for layer in loaded_modelr.layers:
                layer.trainable = False
        else:
            for layer in loaded_modelr.layers:
                layer.trainable = True

        loaded_modelr.compile(loss='mean_squared_error', optimizer='adam')
        history = loaded_modelr.fit(x, y, batch_size=batch, epochs=train_epochs)

        dtstr = datetime.now().strftime("%d-%m-%Y_%I-%M-%S_%p")
        print(loaded_modelr.summary())
        model_json = loaded_modelr.to_json()
        with open("model_ptMLP_MieReg_" + dtstr + ".json", "w") as json_file:
            json_file.write(model_json)
        loaded_modelr.save_weights("model_model_ptMLP_MieReg_" + dtstr + ".h5")
        print("Saved model to disk to", "model_model_ptMLP_MieReg_" + dtstr + ".json")
        print("and weights to")
        print("Saved model to disk to", "model_model_ptMLP_MieReg_" + dtstr + ".h5")
        return

    if classify == True:
        if x.shape[1] != 450:
            raise ValueError(
                'This is a classification problem: x needs to be 450 datapoints in WVN range of 950-1800 1/cm')
        mod, h = add_layer()

    if miecorr == True:
        if y.shape[1] != x.shape[1]:
            raise ValueError(
                'This is a regression problem: x and y need 909 datapoints in WVN range of 950-2300 1/cm')
        train_layer()
    predicted = clf.predict(X_test)
    precision, recall, fscore, support = score(Y_test, predicted)
    p = p + precision
    r = r + recall
    f = f + fscore
    s = s + support
    ft = ft + np.array([fit_time])

scores['window'] = windows
scores['precision'] = p / cv_sets
scores['recall'] = r / cv_sets
scores['fscore'] = f / cv_sets
scores['support'] = normalize(s / cv_sets).reshape(3, )
scores['fit_time'] = normalize(ft / cv_sets).reshape(3, )
print("Done window %s..." % (str(windows)))

scores_table = pd.DataFrame.from_dict(scores, orient='columns')
final_list.append(scores_table)

final_table = pd.concat(final_list, ignore_index=True)
final_table.to_csv(output + str(kernel) + '.csv')

for clas in set(final_table['labels']):
    temp = final_table.loc[final_table['labels'] == clas]
    temp.plot(x='window', y=['precision', 'recall', 'fscore', 'support', 'fit_time'],
np.random.seed(42)

nrange = [100000000]
nnsize = [6, 110, 2]
num_data = 1000

for l in range(100):
    torch.manual_seed(l)

    def create_rand_params(h):
        if type(h) == nn.Linear:
            h.weight.data.uniform_(0, 1)

    model = ANN(size=nnsize)
    model.apply(create_rand_params)

    x, Y = make_blobs(num_data, n_features=6, centers=2, random_state=42)
    x = normalize(x)
    x_train, x_test, y_train, y_test = train_test_split(x, Y, test_size=0.4,
                                                        random_state=0)
    np.save('x.npy', x_train)
    np.save('Y.npy', y_train)

    # randomise the labels
    for i in range(int(0.2 * len(y_train))):
        y_train[i] = np.random.randint(0, 2)
    x_train, y_train = shuffle(x_train, y_train)

    x_train = torch.Tensor(x_train).double()
    y_train = torch.Tensor(y_train).long()
    x_test = torch.Tensor(x_test).double()
    y_test = torch.Tensor(y_test).long()

    criterion = nn.CrossEntropyLoss()
def normalize_l1(x):
    return preprocessing.normalize(x, norm='l1')
def get_spatial_pyramid_feats(image_paths, max_level, feature):
    """
    This function assumes that 'vocab_hog.npy' (for HoG) or 'vocab_sift.npy' (for SIFT)
    exists and contains an N x feature-vector-length matrix 'vocab' where each row is a
    kmeans centroid or visual word. This matrix is saved to disk rather than passed in
    as a parameter to avoid recomputing the vocabulary every run.

    :param image_paths: an N array of strings where each string is an image path,
    :param max_level: level of the pyramid,
    :param feature: name of the image feature representation.

    :return: an N x d matrix, where d is the dimensionality of the feature
        representation. In this case, d will equal the number of clusters, or
        equivalently the number of entries in each image's histogram ('vocab_size'),
        multiplied by (1 / 3) * (4 ^ (max_level + 1) - 1).
    """
    def _get_histogram_for_feature(img, vocab, feature, bins):
        features = feature_extraction(img, feature)
        try:
            dist = pdist(vocab, features)
            min_dist_index = dist.argmin(axis=0)
            hist, _ = np.histogram(min_dist_index, bins=bins)
            return hist
        except:
            hist, _ = np.histogram([], bins=bins)
            return hist

    def _spatial_pyramid_recursion(img, max_level, current_level, vocab, feature):
        if current_level > max_level:
            return np.zeros(vocab.shape[0])
        else:
            # Split the image into four quadrants.
            img1 = img[0:int(img.shape[0] / 2), 0:int(img.shape[1] / 2), :]
            img2 = img[0:int(img.shape[0] / 2), int(img.shape[1] / 2):int(img.shape[1]), :]
            img3 = img[int(img.shape[0] / 2):int(img.shape[0]), 0:int(img.shape[1] / 2), :]
            img4 = img[int(img.shape[0] / 2):int(img.shape[0]), int(img.shape[1] / 2):int(img.shape[1]), :]
            # NOTE: visual check of the division
            # cv2.imshow('img1', img1)
            # cv2.imshow('img2', img2)
            # cv2.imshow('img3', img3)
            # cv2.imshow('img4', img4)
            hist_img = _get_histogram_for_feature(img, vocab, feature, bins)
            current_weight = pow(2, current_level - max_level)
            all_histograms = np.array([
                current_weight * hist_img,
                _spatial_pyramid_recursion(img1, max_level, current_level + 1, vocab, feature),
                _spatial_pyramid_recursion(img2, max_level, current_level + 1, vocab, feature),
                _spatial_pyramid_recursion(img3, max_level, current_level + 1, vocab, feature),
                _spatial_pyramid_recursion(img4, max_level, current_level + 1, vocab, feature)])
            return all_histograms.sum(axis=0)

    if feature == 'HoG':
        vocab = np.load('vocab_hog.npy')
    elif feature == 'SIFT':
        vocab = np.load('vocab_sift.npy')

    vocab_size = vocab.shape[0]
    bins = range(-1, vocab_size)

    all_histograms = np.empty((0, vocab_size))
    i = 0
    for path in image_paths:
        img = cv2.imread(path)[:, :, ::-1]
        print('iter: ' + str(i))
        sp_histograms = _spatial_pyramid_recursion(img, max_level, 1, vocab, feature)
        sp_histograms = normalize(sp_histograms.reshape(1, -1), norm="l2")
        all_histograms = np.vstack((all_histograms, sp_histograms))
        i += 1

    return all_histograms
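# A quick check of the dimensionality formula in the docstring above: the pyramid has
# sum_{l=0..max_level} 4**l = (4**(max_level + 1) - 1) / 3 cells, e.g. 1 + 4 + 16 = 21
# cells for max_level = 2, so a hypothetical vocab_size of 200 would give a
# 200 * 21 = 4200-dimensional descriptor if the per-cell histograms were concatenated.
# Note that the recursion above sums the weighted histograms instead of concatenating
# them, so the matrix it actually returns is N x vocab_size.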
print('Computing gmm with ' + str(k) + ' centroids')
gmm = ynumpy.gmm_learn(np.float32(Desc), k)
io.save_object(gmm, 'gmm_NN_agg_features_max')

# Compute the fisher vectors of the training images
print('Computing fisher vectors')
fisher = np.zeros((len(Train_descriptors), k * 1 * 2), dtype=np.float32)
for i in xrange(len(Train_descriptors)):
    descriptor = Train_descriptors[i]
    # descriptor = np.float32(pca.transform(descriptor))
    aux = ynumpy.fisher(gmm, descriptor, include=['mu', 'sigma'])
    fisher[i, :] = np.reshape(aux, [1, aux.shape[0]])
    # L2 normalization - reshape to avoid deprecation warning; checked that the result is the same
    fisher[i, :] = preprocessing.normalize(fisher[i, :].reshape(1, -1), norm='l2')

# Train an SVM classifier
stdSlr = StandardScaler().fit(fisher)
D_scaled = stdSlr.transform(fisher)
print('Training the SVM classifier...')
clf = svm.SVC(kernel=kernels.intersection_kernel, C=C, probability=True).fit(D_scaled, train_labels)
io.save_object(clf, 'clf_NN_pca256')
# clf = io.load_object('clf_NN', ignore=False)

# Get all the test data and predict their labels
fisher_test = np.zeros((len(test_images_filenames), k * 1 * 2), dtype=np.float32)
for i in range(len(test_images_filenames)):
    img = image.load_img(test_images_filenames[i], target_size=(224, 224))
    x = image.img_to_array(img)
        cell_types.append(id_to_type[cell_id])
        ages.append(14.5)

    tprint('Found {} valid cells among all datasets'.format(len(valid_idx)))

    return valid_idx, np.array(cell_types), np.array(ages)


datasets, genes_list, n_cells = load_names(data_names, norm=False)
qc_idx, cell_types, ages = keep_valid(datasets)
datasets, genes = merge_datasets(datasets, genes_list)

X = vstack(datasets)
X = X[qc_idx]

if not os.path.isfile('data/dimred/{}_{}.txt'.format(DR_METHOD, NAMESPACE)):
    mkdir_p('data/dimred')
    tprint('Dimension reduction with {}...'.format(DR_METHOD))
    X_dimred = reduce_dimensionality(normalize(X), dim_red_k=DIMRED)
    tprint('Dimensionality = {}'.format(X_dimred.shape[1]))
    np.savetxt('data/dimred/{}_{}.txt'.format(DR_METHOD, NAMESPACE), X_dimred)
else:
    X_dimred = np.loadtxt('data/dimred/{}_{}.txt'.format(DR_METHOD, NAMESPACE))

dataset = AnnData(X)
dataset.var['gene_symbols'] = genes
dataset.obs['cell_types'] = ['mca_han_etal_fetal_' + l for l in cell_types]
dataset.obs['ages'] = ages
datasets = [dataset]
namespaces = [NAMESPACE]
# problem 2.6
x1 = [22, 1, 42, 10]
x2 = [20, 0, 36, 8]

euclidean = distance.euclidean(x1, x2)
manhattan = distance.cityblock(x1, x2)
minkowski = distance.minkowski(x1, x2, p=3)
supremum = distance.chebyshev(x1, x2)

# problem 2.8
x_exist = [[1.5, 1.7], [2, 1.9], [1.6, 1.8], [1.2, 1.5], [1.5, 1.0]]
x = [1.4, 1.6]

sim = np.zeros((5, 4))
for i in range(0, 5):
    sim[i, 0] = distance.euclidean(x_exist[i], x)
    sim[i, 1] = distance.cityblock(x_exist[i], x)
    sim[i, 2] = distance.chebyshev(x_exist[i], x)
    sim[i, 3] = 1 - distance.cosine(x_exist[i], x)  # cosine similarity
print(sim)

x = [[1.4, 1.6]]
x_norm = preprocessing.normalize(x, norm='l2', axis=1)
x_exist_norm = preprocessing.normalize(x_exist, norm='l2', axis=1)
print(x_norm)
print(x_exist_norm)

sim_norm = np.zeros((5, 1))
for i in range(0, 5):
    # x_norm has shape (1, 2), so take its single row for the 1-D distance function
    sim_norm[i, 0] = distance.euclidean(x_exist_norm[i], x_norm[0])
print(sim_norm)
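# Worked check of problem 2.6 (my own arithmetic, not part of the original exercise):
# the element-wise differences are [2, 1, 6, 2], so
#   euclidean        = sqrt(4 + 1 + 36 + 4) = sqrt(45)            ~= 6.708
#   manhattan        = 2 + 1 + 6 + 2                               = 11
#   minkowski (p=3)  = (8 + 1 + 216 + 8) ** (1/3) = 233 ** (1/3)  ~= 6.153
#   supremum         = max(2, 1, 6, 2)                             = 6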
def normalize_l2(x):
    return preprocessing.normalize(x)
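# A small illustration of the two helpers defined above (assumes numpy and
# sklearn.preprocessing are imported as in the surrounding snippets): L1 scales each row
# so its absolute values sum to 1, L2 scales each row to unit Euclidean length.
import numpy as np
v = np.array([[3.0, 4.0]])
print(normalize_l1(v))  # -> [[0.428..., 0.571...]]  (3/7, 4/7)
print(normalize_l2(v))  # -> [[0.6, 0.8]]            (3/5, 4/5)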
def naive_bayes_algo(self):
    X = []
    Y = []
    with open('../Data/full_table.csv', 'r') as file:
        for line in csv.reader(file, delimiter=','):
            if len(line) == 13:
                try:
                    zhvi = float(line[5])
                    property_type = line[6]
                    room_type = line[7]
                    accommodates = int(line[8])
                    bathrooms = float(line[9])
                    beds = int(line[10])
                    bed_type = line[11]
                    price = float(line[12])
                    x = {
                        'zhvi': zhvi,
                        'property_type': property_type,
                        'room_type': room_type,
                        'accommodates': accommodates,
                        'bathrooms': bathrooms,
                        'beds': beds,
                        'bed_type': bed_type
                    }
                    y = price
                    X.append(x)
                    Y.append(y)
                except:
                    # skip malformed rows
                    pass

    # The DictVectorizer converts data from a dictionary to an array
    vec = DictVectorizer()

    # Convert X to an array
    X = vec.fit_transform(X).toarray()

    # Normalize the data
    X = preprocessing.normalize(X)

    # Split X and Y into training and testing sets
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

    # Naive Bayes regression
    model = GaussianNB()
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    mse = mean_squared_error(Y_test, Y_pred)
    mae = mean_absolute_error(Y_test, Y_pred)
    r2 = r2_score(Y_test, Y_pred)
    print('Naive Bayes')
    print('Mean Squared Error: {0}'.format(mse))
    print('Mean Absolute Error: {0}'.format(mae))
    print('R2 Score: {0}'.format(r2))

    # With boosting
    model_boost = AdaBoostRegressor(GaussianNB())
    model_boost.fit(X_train, Y_train)
    Y_pred = model_boost.predict(X_test)
    mse = mean_squared_error(Y_test, Y_pred)
    mae = mean_absolute_error(Y_test, Y_pred)
    r2 = r2_score(Y_test, Y_pred)
    print('Naive Bayes (with AdaBoost)')
    print('Mean Squared Error: {0}'.format(mse))
    print('Mean Absolute Error: {0}'.format(mae))
    print('R2 Score: {0}'.format(r2))

    # With bagging
    model_bag = BaggingRegressor(GaussianNB())
    model_bag.fit(X_train, Y_train)
    Y_pred = model_bag.predict(X_test)
    mse = mean_squared_error(Y_test, Y_pred)
    mae = mean_absolute_error(Y_test, Y_pred)
    r2 = r2_score(Y_test, Y_pred)
    print('Naive Bayes (with Bagging)')
    print('Mean Squared Error: {0}'.format(mse))
    print('Mean Absolute Error: {0}'.format(mae))
    print('R2 Score: {0}'.format(r2))
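# A minimal sketch of what DictVectorizer does with records like the ones built above
# (toy values, not taken from full_table.csv): string fields become one-hot columns,
# numeric fields are passed through unchanged.
from sklearn.feature_extraction import DictVectorizer

toy = [
    {'room_type': 'Entire home/apt', 'beds': 2},
    {'room_type': 'Private room', 'beds': 1},
]
vec = DictVectorizer(sparse=False)
print(vec.fit_transform(toy))
# [[2. 1. 0.]
#  [1. 0. 1.]]
print(vec.get_feature_names_out())  # scikit-learn >= 1.0; older versions use get_feature_names()
# ['beds' 'room_type=Entire home/apt' 'room_type=Private room']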
def gen_vectors(nb, dim):
    vectors = [[random.random() for _ in range(dim)] for _ in range(nb)]
    vectors = preprocessing.normalize(vectors, axis=1, norm='l2')
    return vectors.tolist()
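# Example usage (my own check, assuming the random and sklearn.preprocessing imports
# used by gen_vectors are in scope): every generated row should have unit L2 norm.
import numpy as np
vecs = gen_vectors(nb=3, dim=4)
print(np.linalg.norm(vecs, axis=1))  # -> approximately [1. 1. 1.]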
def main(args):
    np.seterr(divide="ignore")  # POT has issues with divide-by-zero errors

    source_lang = args.source_lang
    target_lang = args.target_lang

    source_vectors_filename = args.source_vector
    target_vectors_filename = args.target_vector
    vectors_source = load_embeddings(source_vectors_filename)
    vectors_target = load_embeddings(target_vectors_filename)

    source_defs_filename = args.source_defs
    target_defs_filename = args.target_defs

    batch = args.batch
    input_mode = args.mode
    input_paradigm = args.paradigm

    run_method = list()
    run_paradigm = list()

    if input_paradigm == "all":
        run_paradigm.extend(("matching", "retrieval"))
    else:
        run_paradigm.append(input_paradigm)

    if input_mode == "all":
        run_method.extend(["wmd", "snk"])
    else:
        run_method.append(input_mode)

    defs_source = [
        line.rstrip("\n") for line in open(source_defs_filename, encoding="utf8")
    ]
    defs_target = [
        line.rstrip("\n") for line in open(target_defs_filename, encoding="utf8")
    ]

    clean_src_corpus, clean_src_vectors, src_keys = process_corpus(
        set(vectors_source.keys()), defs_source, vectors_source, source_lang)
    clean_target_corpus, clean_target_vectors, target_keys = process_corpus(
        set(vectors_target.keys()), defs_target, vectors_target, target_lang)

    take = args.instances
    common_keys = set(src_keys).intersection(set(target_keys))
    take = min(len(common_keys), take)  # you can't sample more than the length
    experiment_keys = random.sample(common_keys, take)
    instances = len(experiment_keys)

    clean_src_corpus = list(clean_src_corpus[experiment_keys])
    clean_target_corpus = list(clean_target_corpus[experiment_keys])

    del vectors_source, vectors_target, defs_source, defs_target

    vec = CountVectorizer().fit(clean_src_corpus + clean_target_corpus)
    common = [
        word for word in vec.get_feature_names()
        if word in clean_src_vectors or word in clean_target_vectors
    ]
    W_common = []
    for w in common:
        if w in clean_src_vectors:
            W_common.append(np.array(clean_src_vectors[w]))
        else:
            W_common.append(np.array(clean_target_vectors[w]))

    if not batch:
        print(
            f"{source_lang} - {target_lang}\n"
            + f" document sizes: {len(clean_src_corpus)}, {len(clean_target_corpus)}\n"
            + f" vocabulary size: {len(W_common)}")

    W_common = np.array(W_common)
    W_common = normalize(W_common)
    vect = TfidfVectorizer(vocabulary=common, dtype=np.double, norm=None)
    vect.fit(clean_src_corpus + clean_target_corpus)
    X_train_idf = vect.transform(clean_src_corpus)
    X_test_idf = vect.transform(clean_target_corpus)

    for paradigm in run_paradigm:
        WassersteinDriver = None
        if paradigm == "matching":
            WassersteinDriver = WassersteinMatcher
        else:
            WassersteinDriver = WassersteinRetriever

        for metric in run_method:
            if not batch:
                print(f"{paradigm} - {metric} on {source_lang} - {target_lang}")

            clf = WassersteinDriver(
                W_embed=W_common, n_neighbors=5, n_jobs=14, sinkhorn=(metric == "snk"))
            clf.fit(X_train_idf[:instances], np.ones(instances))
            p_at_one, percentage = clf.align(
                X_test_idf[:instances], n_neighbors=instances)

            if not batch:
                print(f"P @ 1: {p_at_one}\n{percentage}% {instances} definitions\n")
            else:
                fields = [
                    f"{source_lang}",
                    f"{target_lang}",
                    f"{instances}",
                    f"{p_at_one}",
                    f"{percentage}",
                ]
                with open(f"{metric}_{paradigm}_results.csv", "a") as f:
                    writer = csv.writer(f)
                    writer.writerow(fields)
y = y.drop(y.columns[0], axis=1)
print('Done.')

##########################################
# split data into training and testing set
##########################################
print('Reducing and splitting..')

# PCA on x
pca = decomposition.PCA(n_components=700)
x = pca.fit_transform(x)

# normalization
x = preprocessing.normalize(x)

# label encoding
le = preprocessing.LabelEncoder()
Y1 = y.apply(le.fit_transform)
y = le.fit_transform(Y1)  # complete label-encoded array

# splitting
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.15, random_state=42, shuffle=True)
# For example, if variable y is a binary categorical variable with values 0 and 1, and there are
# 25% zeros and 75% ones, stratify=y will make sure that your random split has 25% of 0's and 75% of 1's.

# feature selection through PCA
print('ready.')
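# The comment above describes stratify=y, but the split itself does not pass it. If a
# class-proportional split is actually wanted, it could be requested explicitly
# (a sketch, not part of the original script):
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.15, random_state=42, shuffle=True, stratify=y)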