def getNewClusterLabel(emb, initClusterlabel, NumberOfCluster):
    Clusterlabels = clustering(emb, num_clusters=NumberOfCluster)
    print('Clusterlabels: ', Counter(Clusterlabels))
    print('initClusterlabel: ', initClusterlabel)
    # If a cluster ends up with only one member, it needs to be adjusted here.
    C = Counter(Clusterlabels)
    # print(C)
    for idx, v in C.items():
        if v == 1:
            tTable = getOriginClusterLabel(initClusterlabel, Clusterlabels, idx)
            if tTable == -1:
                continue
            print('idx: ', idx, ', tTable: ', tTable)
            # Reassign the singleton cluster back to its original label.
            for tidx, k in enumerate(Clusterlabels):
                if Clusterlabels[tidx] == idx:
                    Clusterlabels[tidx] = tTable
            # One label was removed, so shift all later labels forward by one.
            for tidx, k in enumerate(Clusterlabels):
                if Clusterlabels[tidx] > idx:
                    Clusterlabels[tidx] = Clusterlabels[tidx] - 1
            NumberOfCluster = NumberOfCluster - 1
    # Clusterlabels = clustering(emb, num_clusters=NumberOfCluster)
    return NumberOfCluster, Clusterlabels
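# --- Hypothetical helper (not in the original snippet) -----------------------
# getNewClusterLabel() above calls getOriginClusterLabel(), which is defined
# elsewhere in the project. The sketch below is only an assumption of its
# behaviour: for a new cluster `idx` that contains a single point, return the
# initial cluster label of that point so the singleton can be merged back, or
# -1 if no such point is found.
def getOriginClusterLabel(initClusterlabel, Clusterlabels, idx):
    for pos, label in enumerate(Clusterlabels):
        if label == idx:
            return initClusterlabel[pos]
    return -1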
def test_clustering(n_clusters):
    # Load the cleaned DataFrame
    with open("pickles/profiles1.pkl", 'rb') as fp:
        data_frame = pickle.load(fp)

    clustered_df = clustering(
        data_frame,
        fn_vectorized_words=vectorized_words_count_vector,
        fn_algorithm_clustering=agglomerative_clustering,
        n_clusters=n_clusters)

    # Persist the clustered profiles both as CSV and as a pickle.
    with open("pickles/clustered_profiles.pkl", "wb") as wb:
        clustered_df.to_csv(r"csv/clustered_profiles.csv", index=False)
        pickle.dump(clustered_df, wb)
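# Hypothetical invocation (not in the original file); the cluster count below
# is an arbitrary example value.
if __name__ == "__main__":
    test_clustering(n_clusters=10)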
def Get_Cluster_Stocks():
    """Get clusters of stocks grouped by risk level, five stocks per risk cluster."""
    conn = connect_db()
    cur = conn.cursor()
    stocks = clustering(conn, cur)
    # cur.close()
    print(stocks)
    for risk_list in stocks:
        for idx, stock_id in enumerate(risk_list):
            # Parameterized query instead of string interpolation.
            cur.execute("SELECT name FROM stock WHERE id = %s;", (stock_id,))
            data = cur.fetchall()
            stock_name = data[0][0]  # first column of the first row
            # stock_name = get_stock_name(stock_id)
            risk_list[idx] = stock_name + '(' + str(stock_id) + ')'
    cur.close()
    low_risk_list = stocks[0]
    mid_risk_list = stocks[1]
    high_risk_list = stocks[2]
    return low_risk_list, mid_risk_list, high_risk_list
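# Hypothetical usage sketch (not part of the original source): consume the
# three risk buckets returned by Get_Cluster_Stocks().
if __name__ == "__main__":
    low, mid, high = Get_Cluster_Stocks()
    print("low risk:  ", low)
    print("mid risk:  ", mid)
    print("high risk: ", high)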
def train(self, adj_list, fea_list, y_train, y_val, y_test,
          train_mask, val_mask, test_mask, y_all, all_mask,
          rawlabels, needtSNE=False, rawFeature=[]):
    prec, rec, f1 = 0.0, 0.0, 0.0
    nb_nodes = fea_list[0].shape[0]
    ft_size = fea_list[0].shape[1]
    nb_classes = y_train.shape[1]
    # nb_classes = len(set(rawlabels))

    # adj = adj.todense()
    # features = features[np.newaxis]  # [1, nb_node, ft_size]
    # Add a leading batch dimension to every input.
    fea_list = [fea[np.newaxis] for fea in fea_list]
    adj_list = [adj[np.newaxis] for adj in adj_list]
    y_train = y_train[np.newaxis]
    y_val = y_val[np.newaxis]
    y_test = y_test[np.newaxis]
    y_all = y_all[np.newaxis]
    train_mask = train_mask[np.newaxis]
    val_mask = val_mask[np.newaxis]
    test_mask = test_mask[np.newaxis]
    all_mask = all_mask[np.newaxis]

    biases_list = [process.adj_to_bias(adj, [nb_nodes], nhood=1) for adj in adj_list]

    print('build graph...')
    with tf.Graph().as_default():
        with tf.name_scope('input'):
            metric_ftr_in = tf.placeholder(dtype=tf.float32,
                                           shape=(nb_nodes, ft_size),
                                           name='metric_ftr_in')
            ftr_in_list = [tf.placeholder(dtype=tf.float32,
                                          shape=(batch_size, nb_nodes, ft_size),
                                          name='ftr_in_{}'.format(i))
                           for i in range(len(fea_list))]
            bias_in_list = [tf.placeholder(dtype=tf.float32,
                                           shape=(batch_size, nb_nodes, nb_nodes),
                                           name='bias_in_{}'.format(i))
                            for i in range(len(biases_list))]
            lbl_in = tf.placeholder(dtype=tf.int32,
                                    shape=(batch_size, nb_nodes, nb_classes),
                                    name='lbl_in')
            msk_in = tf.placeholder(dtype=tf.int32,
                                    shape=(batch_size, nb_nodes),
                                    name='msk_in')
            attn_drop = tf.placeholder(dtype=tf.float32, shape=(), name='attn_drop')
            ffd_drop = tf.placeholder(dtype=tf.float32, shape=(), name='ffd_drop')
            is_train = tf.placeholder(dtype=tf.bool, shape=(), name='is_train')

        # forward
        logits, final_embedding, att_val, centers_embed, test_final_embeed = model.inference(
            ftr_in_list, nb_classes, nb_nodes, is_train, attn_drop, ffd_drop,
            bias_mat_list=bias_in_list, hid_units=hid_units, n_heads=n_heads,
            features=fea_list, labels=rawlabels, residual=residual,
            activation=nonlinearity, feature_size=ft_size)
        # final_embedding: Tensor("Sum:0", shape=(286, 64), dtype=float32)
        # logits: Tensor("ExpandDims_3:0", shape=(1, 286, 30), dtype=float32)

        # masked loss
        # lab_list = tf.placeholder(dtype=tf.float32, shape=(nb_nodes,), name='lab_list')
        # ftr_resh = tf.placeholder(dtype=tf.float32, shape=(nb_nodes, ft_size), name='ftr_resh')
        log_resh = tf.reshape(logits, [-1, nb_classes])
        lab_resh = tf.reshape(lbl_in, [-1, nb_classes])
        msk_resh = tf.reshape(msk_in, [-1])

        print("final_embedding: checkout", final_embedding)
        print("logits: checkout", logits)
        print("log_resh: checkout", log_resh)
        # print("ftr_resh: ", ftr_resh)
        print("lab_resh: ", lab_resh)
        print("fea_list: ", fea_list)
        print("centers_embed: ", centers_embed)
        print("batch_size, nb_nodes, nb_classes, ft_size",
              batch_size, nb_nodes, nb_classes, ft_size)

        osm_caa_loss = OSM_CAA_Loss(batch_size=nb_nodes)
        osm_loss = osm_caa_loss.forward
        # final_embedding: Tensor("Sum:0", shape=(286, 64), dtype=float32)
        # logits: Tensor("ExpandDims_3:0", shape=(1, 286, 30), dtype=float32)
        # log_resh: Tensor("Reshape:0", shape=(286, 30), dtype=float32)
        # ftr_resh: Tensor("ftr_resh:0", shape=(286, 100), dtype=float32)
        # lab_resh: Tensor("Reshape_1:0", shape=(286, 30), dtype=int32)

        osmLoss, checkvalue = osm_loss(final_embedding, rawlabels, centers_embed)
        # osmLoss, checkvalue = osm_loss(metric_ftr_in, rawlabels, centers_embed)
        SoftMaxloss = model.masked_softmax_cross_entropy(log_resh, lab_resh, msk_resh)

        loss = osmLoss  # why does the loss stay fixed?
        # loss = SoftMaxloss
        accuracy = model.masked_accuracy(log_resh, lab_resh, msk_resh)

        # optimize
        train_op = model.training(loss, lr, l2_coef)

        Path = 'pre_trained/{}/{}/{}'.format(dataset, dataset, self.name)
        self.mkdir(Path)
        checkpt_file = '{}/allMP_multi_{}_.ckpt'.format(Path, featype)
        print('model: {}'.format(checkpt_file))
        saver = tf.train.Saver()
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        vlss_mn = np.inf
        vacc_mx = 0.0
        curr_step = 0

        with tf.Session(config=config) as sess:
            sess.run(init_op)
            train_loss_avg = 0
            train_acc_avg = 0
            val_loss_avg = 0
            val_acc_avg = 0

            for epoch in range(nb_epochs):
                tr_step = 0
                tr_size = fea_list[0].shape[0]
                # ================ training ================
                while tr_step * batch_size < tr_size:
                    fd1 = {i: d[tr_step * batch_size:(tr_step + 1) * batch_size]
                           for i, d in zip(ftr_in_list, fea_list)}
                    fd2 = {i: d[tr_step * batch_size:(tr_step + 1) * batch_size]
                           for i, d in zip(bias_in_list, biases_list)}
                    fd3 = {lbl_in: y_train[tr_step * batch_size:(tr_step + 1) * batch_size],
                           msk_in: train_mask[tr_step * batch_size:(tr_step + 1) * batch_size],
                           metric_ftr_in: rawFeature,
                           is_train: True,
                           attn_drop: 0.6,
                           ffd_drop: 0.6}
                    fd = fd1
                    fd.update(fd2)
                    fd.update(fd3)
                    _, loss_value_tr, acc_tr, att_val_train = sess.run(
                        [train_op, loss, accuracy, att_val], feed_dict=fd)
                    test_check_value = sess.run(checkvalue, feed_dict=fd)
                    print("test_check_value: ", test_check_value)
                    train_loss_avg += loss_value_tr
                    train_acc_avg += acc_tr
                    tr_step += 1

                vl_step = 0
                vl_size = fea_list[0].shape[0]
                # ================ validation ================
                while vl_step * batch_size < vl_size:
                    # fd1 = {ftr_in: features[vl_step * batch_size:(vl_step + 1) * batch_size]}
                    fd1 = {i: d[vl_step * batch_size:(vl_step + 1) * batch_size]
                           for i, d in zip(ftr_in_list, fea_list)}
                    fd2 = {i: d[vl_step * batch_size:(vl_step + 1) * batch_size]
                           for i, d in zip(bias_in_list, biases_list)}
                    fd3 = {lbl_in: y_val[vl_step * batch_size:(vl_step + 1) * batch_size],
                           msk_in: val_mask[vl_step * batch_size:(vl_step + 1) * batch_size],
                           metric_ftr_in: rawFeature,
                           is_train: False,
                           attn_drop: 0.0,
                           ffd_drop: 0.0}
                    fd = fd1
                    fd.update(fd2)
                    fd.update(fd3)
                    loss_value_vl, acc_vl = sess.run([loss, accuracy], feed_dict=fd)
                    val_loss_avg += loss_value_vl
                    val_acc_avg += acc_vl
                    vl_step += 1

                # import pdb; pdb.set_trace()
                print('Epoch: {}, att_val: {}'.format(epoch, np.mean(att_val_train, axis=0)))
                print('Training: loss = %.5f, acc = %.5f | Val: loss = %.5f, acc = %.5f | vl_step: %d, tr_step: %d' %
                      (train_loss_avg / tr_step, train_acc_avg / tr_step,
                       val_loss_avg / vl_step, val_acc_avg / vl_step, vl_step, tr_step))

                # Early stopping on validation accuracy / loss.
                if val_acc_avg / vl_step >= vacc_mx or val_loss_avg / vl_step <= vlss_mn:
                    if val_acc_avg / vl_step >= vacc_mx and val_loss_avg / vl_step <= vlss_mn:
                        vacc_early_model = val_acc_avg / vl_step
                        vlss_early_model = val_loss_avg / vl_step
                        saver.save(sess, checkpt_file)
                    vacc_mx = np.max((val_acc_avg / vl_step, vacc_mx))
                    vlss_mn = np.min((val_loss_avg / vl_step, vlss_mn))
                    curr_step = 0
                else:
                    curr_step += 1
                    if curr_step == patience:
                        print('Early stop! Min loss: ', vlss_mn, ', Max accuracy: ', vacc_mx)
                        print('Early stop model validation loss: ', vlss_early_model,
                              ', accuracy: ', vacc_early_model)
                        break

                train_loss_avg = 0
                train_acc_avg = 0
                val_loss_avg = 0
                val_acc_avg = 0

            # check save
            saver.save(sess, checkpt_file)

            saver.restore(sess, checkpt_file)
            print('load model from : {}'.format(checkpt_file))

            ts_size = fea_list[0].shape[0]
            ts_step = 0
            ts_loss = 0.0
            ts_acc = 0.0
            while ts_step * batch_size < ts_size:
                fd1 = {i: d[ts_step * batch_size:(ts_step + 1) * batch_size]
                       for i, d in zip(ftr_in_list, fea_list)}
                fd2 = {i: d[ts_step * batch_size:(ts_step + 1) * batch_size]
                       for i, d in zip(bias_in_list, biases_list)}
                fd3 = {lbl_in: y_all[ts_step * batch_size:(ts_step + 1) * batch_size],
                       msk_in: all_mask[ts_step * batch_size:(ts_step + 1) * batch_size],
                       metric_ftr_in: rawFeature,
                       is_train: False,
                       attn_drop: 0.0,
                       ffd_drop: 0.0}
                fd = fd1
                fd.update(fd2)
                fd.update(fd3)
                loss_value_ts, acc_ts, jhy_final_embedding, test_final_embeed_check = sess.run(
                    [loss, accuracy, final_embedding, test_final_embeed], feed_dict=fd)
                ts_loss += loss_value_ts
                ts_acc += acc_ts
                ts_step += 1

            xx = np.expand_dims(jhy_final_embedding, axis=0)[all_mask]
            xx2 = np.expand_dims(test_final_embeed_check, axis=0)[all_mask]
            yy = y_all[all_mask]
            print("check fd")
            print('xx: {}, yy: {}, ts_size: {}, ts_step: {}, batch_size: {}'.format(
                xx.shape, yy.shape, ts_size, ts_step, batch_size))

            labels, numberofLabels = self.getLabel(yy)

            from utils import clustering, pairwise_precision_recall_f1
            clusters_pred = clustering(xx2, num_clusters=numberofLabels)
            prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, labels)
            print('prec: ', prec, ', rec: ', rec, ', f1: ', f1,
                  ', originNumberOfClusterlabels: ', numberofLabels)

            if needtSNE:
                tSNEAnanlyse(xx, labels, join(settings.PIC_DIR, "HAN", "rawReature_%s_final.png" % (self.name)))
                tSNEAnanlyse(rawFeature, labels, join(settings.PIC_DIR, "HAN", "rawReature_%s_features.png" % (self.name)))
                tSNEAnanlyse(xx2, labels, join(settings.PIC_DIR, "HAN", "rawReature_%s_xx2.png" % (self.name)))
                tSNEAnanlyse(xx, clusters_pred, join(settings.PIC_DIR, "HAN", "rawReature_%s_result_label.png" % (self.name)))

            sess.close()

    return prec, rec, f1, xx2
def MetricDebug(self, adj_list, fea_list, y_train, y_val, y_test,
                train_mask, val_mask, test_mask, y_all, all_mask,
                rawlabels, needtSNE=False, rawFeature=[]):
    prec, rec, f1 = 0.0, 0.0, 0.0
    nb_nodes = fea_list[0].shape[0]
    ft_size = fea_list[0].shape[1]
    nb_classes = y_train.shape[1]
    # nb_classes = len(set(rawlabels))

    # adj = adj.todense()
    # features = features[np.newaxis]  # [1, nb_node, ft_size]
    # Add a leading batch dimension to every input.
    fea_list = [fea[np.newaxis] for fea in fea_list]
    adj_list = [adj[np.newaxis] for adj in adj_list]
    y_train = y_train[np.newaxis]
    y_val = y_val[np.newaxis]
    y_test = y_test[np.newaxis]
    y_all = y_all[np.newaxis]
    train_mask = train_mask[np.newaxis]
    val_mask = val_mask[np.newaxis]
    test_mask = test_mask[np.newaxis]
    all_mask = all_mask[np.newaxis]

    biases_list = [process.adj_to_bias(adj, [nb_nodes], nhood=1) for adj in adj_list]

    print('build graph...')
    with tf.Graph().as_default():
        with tf.name_scope('input'):
            metric_ftr_in = tf.placeholder(dtype=tf.float32,
                                           shape=(nb_nodes, ft_size),
                                           name='metric_ftr_in')
            ftr_in_list = [tf.placeholder(dtype=tf.float32,
                                          shape=(batch_size, nb_nodes, ft_size),
                                          name='ftr_in_{}'.format(i))
                           for i in range(len(fea_list))]
            bias_in_list = [tf.placeholder(dtype=tf.float32,
                                           shape=(batch_size, nb_nodes, nb_nodes),
                                           name='bias_in_{}'.format(i))
                            for i in range(len(biases_list))]
            lbl_in = tf.placeholder(dtype=tf.int32,
                                    shape=(batch_size, nb_nodes, nb_classes),
                                    name='lbl_in')
            msk_in = tf.placeholder(dtype=tf.int32,
                                    shape=(batch_size, nb_nodes),
                                    name='msk_in')
            attn_drop = tf.placeholder(dtype=tf.float32, shape=(), name='attn_drop')
            ffd_drop = tf.placeholder(dtype=tf.float32, shape=(), name='ffd_drop')
            is_train = tf.placeholder(dtype=tf.bool, shape=(), name='is_train')

        # forward
        logits, final_embedding, att_val, centers_embed, test_final_embeed = model.inference(
            ftr_in_list, nb_classes, nb_nodes, is_train, attn_drop, ffd_drop,
            bias_mat_list=bias_in_list, hid_units=hid_units, n_heads=n_heads,
            features=fea_list, labels=rawlabels, residual=residual,
            activation=nonlinearity, feature_size=ft_size)

        log_resh = tf.reshape(logits, [-1, nb_classes])
        lab_resh = tf.reshape(lbl_in, [-1, nb_classes])
        msk_resh = tf.reshape(msk_in, [-1])

        osm_caa_loss = OSM_CAA_Loss(batch_size=nb_nodes)
        osm_loss = osm_caa_loss.forward
        osmLoss, checkvalue = osm_loss(final_embedding, rawlabels, centers_embed)
        SoftMaxloss = model.masked_softmax_cross_entropy(log_resh, lab_resh, msk_resh)
        loss = osmLoss
        accuracy = model.masked_accuracy(log_resh, lab_resh, msk_resh)

        # optimize
        train_op = model.training(loss, lr, l2_coef)

        Path = 'pre_trained/{}/{}/{}'.format(dataset, dataset, self.name)
        self.mkdir(Path)
        checkpt_file = '{}/allMP_multi_{}_.ckpt'.format(Path, featype)
        saver = tf.train.Saver()
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        ts_size = fea_list[0].shape[0]
        ts_step = 0
        ts_loss = 0.0
        ts_acc = 0.0

        with tf.Session(config=config) as sess:
            sess.run(init_op)
            saver.restore(sess, checkpt_file)

            while ts_step * batch_size < ts_size:
                fd1 = {i: d[ts_step * batch_size:(ts_step + 1) * batch_size]
                       for i, d in zip(ftr_in_list, fea_list)}
                fd2 = {i: d[ts_step * batch_size:(ts_step + 1) * batch_size]
                       for i, d in zip(bias_in_list, biases_list)}
                fd3 = {lbl_in: y_all[ts_step * batch_size:(ts_step + 1) * batch_size],
                       msk_in: all_mask[ts_step * batch_size:(ts_step + 1) * batch_size],
                       metric_ftr_in: rawFeature,
                       is_train: False,
                       attn_drop: 0.0,
                       ffd_drop: 0.0}
                fd = fd1
                fd.update(fd2)
                fd.update(fd3)
                # Fetch the tensor directly (not wrapped in a list) so the result
                # is the embedding array itself, as in train().
                test_final_embeed_check = sess.run(test_final_embeed, feed_dict=fd)
                ts_step += 1

            xx2 = np.expand_dims(test_final_embeed_check, axis=0)[all_mask]
            yy = y_all[all_mask]
            labels, numberofLabels = self.getLabel(yy)

            from utils import clustering, pairwise_precision_recall_f1
            clusters_pred = clustering(xx2, num_clusters=numberofLabels)
            prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, labels)
            print('prec: ', prec, ', rec: ', rec, ', f1: ', f1,
                  ', originNumberOfClusterlabels: ', numberofLabels)

            if needtSNE:
                tSNEAnanlyse(rawFeature, labels, join(settings.PIC_DIR, "MetricLearning", "rawReature_%s_features.png" % (self.name)))
                tSNEAnanlyse(xx2, labels, join(settings.PIC_DIR, "MetricLearning", "rawReature_%s_xx2.png" % (self.name)))
def train(name, needtSNE=False, savefile=True):
    adj, adj2, features, labels, Clusterlabels, Ids = load_local_data(name=name)
    initClusterlabel = Clusterlabels
    oneHotClusterLabels = toOneHot(Clusterlabels)
    num_logits = len(oneHotClusterLabels[0])
    # enc.transform([['Female', 1], ['Male', 4]]).toarray()
    print('debuging ', oneHotClusterLabels.shape)

    originClusterlabels = Clusterlabels
    n_clusters = len(set(labels))
    OldClusterlabels = Clusterlabels
    originNumberOfClusterlabels = len(set(Clusterlabels))

    num_nodes = adj.shape[0]
    input_feature_dim = features.shape[1]

    adj_norm, adj_label = NormalizedAdj(adj)
    adj_norm2, adj_label2 = NormalizedAdj(adj2)

    if FLAGS.is_sparse:  # TODO to test
        # features = sparse_to_tuple(features.tocoo())
        # features_nonzero = features[1].shape[0]
        features = features.todense()  # TODO
    else:
        features = normalize_vectors(features)

    # Define placeholders
    placeholders = {
        # 'features': tf.sparse_placeholder(tf.float32),
        'features': tf.placeholder(tf.float32, shape=(None, input_feature_dim)),
        'labels': tf.placeholder(tf.int64, shape=(None), name='labels'),
        'graph1': tf.sparse_placeholder(tf.float32),
        'graph2': tf.sparse_placeholder(tf.float32),
        'graph1_orig': tf.sparse_placeholder(tf.float32),
        'graph2_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=()),
        'epoch': tf.placeholder_with_default(0., shape=()),
        'clusterEpoch': tf.placeholder_with_default(0., shape=())
    }

    # pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()  # negative edges / positive edges
    # norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.nnz) * 2)

    def get_embs():
        feed_dict.update({placeholders['dropout']: 0})
        emb = sess.run(model.z_mean_1, feed_dict=feed_dict)  # z_mean is better
        return emb

    def getGraphDetail(adj):
        pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()  # negative edges / positive edges
        norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.nnz) * 2)
        return {'norm': norm, 'pos_weight': pos_weight}
        # return pos_weight, norm

    # loss1s = []
    # loss2s = []
    # loss3s = []

    n_clusters = len(set(labels))
    graph1 = getGraphDetail(adj)
    graph2 = getGraphDetail(adj2)

    # construct adj_orig
    graph1['labels'] = tf.reshape(
        tf.sparse_tensor_to_dense(placeholders['graph1_orig'], validate_indices=False), [-1])
    graph2['labels'] = tf.reshape(
        tf.sparse_tensor_to_dense(placeholders['graph2_orig'], validate_indices=False), [-1])

    # Train model
    for clusterepoch in range(FLAGS.clusterEpochs):
        print('cluster epoch: ', clusterepoch)
        # tf.reset_default_graph()

        model = BuildModel(placeholders, input_feature_dim, num_nodes,
                           name='model%d' % (clusterepoch), num_logits=num_logits)

        # Session
        # tf.reset_default_graph()
        # sess = tf.InteractiveSession()
        opt = OptimizerDualGCNAutoEncoder(model=model, num_nodes=num_nodes,
                                          z_label=Clusterlabels,
                                          name='model%d' % (clusterepoch),
                                          graph1=graph1, graph2=graph2)
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

        # Centers
        # centers = opt.centers

        for epoch in range(FLAGS.epochs):
            # print('epoch: ', epoch)
            # opt.epoch = epoch
            model.epoch = epoch

            # Construct feed dictionary
            feed_dict = construct_feed_dict(adj_norm, adj_label, adj_norm2, adj_label2,
                                            features, placeholders, Clusterlabels,
                                            epoch, clusterepoch + 1)
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})

            # Run single weight update
            # outs = sess.run([opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)
            outs = sess.run([opt.opt_op, opt.cost], feed_dict=feed_dict)
            # [Loss, softmax_loss, loss3, centerloss, reconstructloss] = sess.run([opt.cost, opt.softmax_loss, opt.loss3, opt.centerloss, opt.reconstructloss], feed_dict=feed_dict)
            # [Loss, loss3, centerloss, reconstructloss, L2loss] = sess.run([opt.cost, opt.loss3, opt.centerloss, opt.reconstructloss, opt.L2loss], feed_dict=feed_dict)
            [Loss, reconstructloss] = sess.run([opt.cost, opt.reconstructloss], feed_dict=feed_dict)
            # print('loss: ', Loss, ', loss1: ', loss1, ', loss2: ', loss2, ', centerloss: ', centerloss, ', acc: ', outs[2])
            print('epoch: ', epoch, ', loss: ', Loss, ', reconstructloss: ', reconstructloss)

        # if clusterepoch != FLAGS.clusterEpochs - 1:
        emb = get_embs()
        X_new = TSNE(learning_rate=100).fit_transform(emb)

        # Pick the number of clusters with the best silhouette score.
        tClusterLabels = []
        Maxscore = -10000
        NumberOfCluster = 0
        for nc in range(2, originNumberOfClusterlabels + 1, 1):
            TempLabels = clustering(X_new, nc)
            score = silhouette_score(X_new, TempLabels)
            print('nc: ', nc, ', score: ', score)
            if score > Maxscore:
                Maxscore = score
                tClusterLabels = TempLabels
                NumberOfCluster = nc

        print('NumberOfCluster: ', NumberOfCluster,
              ', originNumberOfClusterlabels: ', originNumberOfClusterlabels,
              ', Maxscore: ', Maxscore)

        if NumberOfCluster < 0 or NumberOfCluster > originNumberOfClusterlabels:
            continue

        # The cluster count is required to keep shrinking, so adopt the new
        # clustering result and update these parameters.
        Clusterlabels = tClusterLabels
        originNumberOfClusterlabels = NumberOfCluster

        prec, rec, f1 = pairwise_precision_recall_f1(Clusterlabels, labels)
        print('prec: ', prec, ', rec: ', rec, ', f1: ', f1,
              ', originNumberOfClusterlabels: ', originNumberOfClusterlabels)
        Cc = Counter(Clusterlabels)
        print(Cc)

        if needtSNE:
            sNEComparingAnanlyse(emb, OldClusterlabels, labels, Clusterlabels,
                                 savepath=join(settings.PIC_DIR, "%s_%s.png" % (name, clusterepoch)))
            # tSNEAnanlyse(emb, labels, join(settings.PIC_DIR, "%s.png" % (clusterepoch)))

        # tf.reset_default_graph()

    emb = get_embs()
    emb_norm = normalize_vectors(emb)
    clusters_pred = clustering(emb_norm, num_clusters=originNumberOfClusterlabels)
    prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, labels)
    print('prec: ', prec, ', rec: ', rec, ', f1: ', f1,
          ', originNumberOfClusterlabels: ', originNumberOfClusterlabels)

    # lossPrint(range(FLAGS.epochs), loss1s, loss2s, loss3s)
    if needtSNE:
        tSNEAnanlyse(emb, labels, join(settings.PIC_DIR, "%s_final.png" % (name)))

    tf.reset_default_graph()
    return [prec, rec, f1], num_nodes, n_clusters
    plt.savefig('NON_DEEP_mean_std_cls' + str(N_CLUSTERING))
    return y


if __name__ == '__main__':
    models = [
        'resnet101', 'resnet50', 'resnet18', 'vgg11', 'sobel', 'laplacian', 'HOG'
    ]
    datasets = ['ICDAR15', 'MSRA-TD500', 'MSRA-TD500.blur']
    num_of_clustering = 2
    dataset = datasets[1]
    model = models[6]
    image_root_path = os.path.join(os.path.expanduser('~'), 'Documents', dataset, 'trainim')
    root = '/home/litianjiao/codes/curriculum/extracted_feats'
    means, vars = main(model=model, dataset=dataset)
    y = nonDeepClustering(N_CLUSTERING=num_of_clustering, MEAN=means, VAR=vars)
    clustering(NUM_OF_CLUSTERING=num_of_clustering, IMG_ROOT=image_root_path,
               ROOT=root, ID=y, DATASET=dataset, MODEL=model)
with tf.Session() as sess:
    sess.run(init_op)
    fd = {ftr_input: features}
    while epoch < epochs:
        _, losscheck, value2 = sess.run([train_op, loss, checkvalue], feed_dict=fd)
        print("epoch: {} loss: {}, checkvalue: {}".format(epoch, losscheck, value2))
        epoch += 1

    print("final_embed: ", final_embed)
    embedding = sess.run([final_embed], feed_dict=fd)
    embedding = embedding[0]
    print("embedding: ", embedding)

    from utils import clustering, pairwise_precision_recall_f1
    clusters_pred = clustering(embedding, num_clusters=nb_class)
    prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, rawlabels)
    print('prec: ', prec, ', rec: ', rec, ', f1: ', f1,
          ', originNumberOfClusterlabels: ', nb_class)
    tSNEAnanlyse(embedding, rawlabels, join(settings.PIC_DIR, "PureMetricLoss", "%s_final.png" % (name)))
    tSNEAnanlyse(features, rawlabels, join(settings.PIC_DIR, "PureMetricLoss", "%s_features.png" % (name)))
    # my_KNN(xx, yy)
    # my_Kmeans(xx, yy)
    sess.close()
    for j in range(N_CLUSTERING):
        plt.plot(class_means[j], class_vars[j], color_map[j])
    plt.savefig('NON_DEEP_mean_std_cls' + str(N_CLUSTERING))
    return y


if __name__ == '__main__':
    models = [
        'resnet101', 'resnet50', 'resnet18', 'vgg11', 'sobel', 'laplacian'
    ]
    datasets = ['ICDAR15', 'MSRA-TD500', 'MSRA-TD500.blur']
    num_of_clustering = 2
    dataset = datasets[1]
    image_root_path = os.path.join(os.path.expanduser('~'), 'Documents', dataset, 'trainim')
    root = '/home/litianjiao/codes/curriculum/extracted_feats'
    # main(model=models[4], dataset=datasets[0])
    means, vars = main(model=models[4], dataset=dataset)
    y = nonDeepClustering(N_CLUSTERING=num_of_clustering, MEAN=means, VAR=vars)
    clustering(NUM_OF_CLUSTERING=num_of_clustering, IMG_ROOT=image_root_path, ROOT=root, ID=y)
from utils import clustering

if __name__ == "__main__":
    load_dotenv()
    conn = psycopg2.connect(database="teamc",
                            user=os.getenv("user"),
                            password=os.getenv("password"),
                            host=os.getenv("host"),
                            port="5432")
    cur = conn.cursor()

    # Add_History:
    # cur.execute("SELECT * FROM stock")
    # stocks = cur.fetchall()
    # today = datetime.date.today()
    # tomorrow = today + datetime.timedelta(days=1)
    # three_years_ago = today - relativedelta(years=3)
    # for stock in stocks:
    #     Add_History(conn, cur, stock[0], three_years_ago.strftime('%Y%m%d'), tomorrow.strftime('%Y%m%d'))

    # Add_Analysis:
    # cur.execute("SELECT * FROM stock")
    # stocks = cur.fetchall()
    # for stock in stocks:
    #     Add_Analysis(conn, cur, stock[0])

    # output: [[low], [medium], [high]]
    stocks = clustering(conn, cur)
    cur.close()
def clusterTest(embedding, labels, numberofLabels):
    # Cluster the embedding and score the predicted labels against the ground
    # truth; `labels` is taken as a parameter so the function is self-contained.
    clusters_pred = clustering(embedding, num_clusters=numberofLabels)
    prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, labels)
    return [prec, rec, f1]
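# Hypothetical usage (assumes `embedding` and ground-truth `labels` from the
# surrounding pipeline); clusterTest returns [prec, rec, f1]:
# prec, rec, f1 = clusterTest(embedding, labels, numberofLabels=len(set(labels)))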
    EndIndex = -2
    featurePath = getPATH(name, idf_threshold, 'feature_and_label', ispretrain)
    # idx_features_labels = np.genfromtxt(join(settings.DATA_DIR, 'local', 'graph-{}'.format(idf_threshold)), dtype=np.dtype(str))
    idx_features_labels = np.genfromtxt(featurePath, dtype=np.dtype(str))
    features = np.array(idx_features_labels[:, 1:EndIndex], dtype=np.float32)  # sparse?
    rawlabels = encode_labels(idx_features_labels[:, EndIndex])
    pids = idx_features_labels[:, 0]
    return features, pids, rawlabels


def load_test_names():
    return data_utils.load_json(settings.DATA_DIR, 'test_name_list2.json')


Res = {}
names = load_test_names()
for name in names:
    features, pids, rawlabels = loadFeature(name, ispretrain=False)
    tSNEAnanlyse(features, rawlabels,
                 join(settings.PIC_DIR, "MetricLearning", "rawReature_%s_train.png" % (name)))
    numberofLabels = len(set(rawlabels))
    clusters_pred = clustering(features, num_clusters=numberofLabels)
    prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, rawlabels)
    Res[name] = {"prec": prec, "rec": rec, "f1": f1}

print(Res)