def train(self, mtd = "margin", backupfile = "backupfile"): # TODO implement sample_weight + make method to shuffle and return sublist with data_limit backupfile += ".opt-"+str(self.optimization_limit)+"-"+self.optimization_method+".txt" for i in range(self.budget): if len(self.Ux) <= 1: break # self.viz_A = []; self.viz_B = []; self.viz_C = []; self.viz_D = []; self.viz_E = []; self.viz_F = [] ids, scores = self.sortForInformativeness(mtd) id = ids[0] qx = self.Ux[id] qy = self.Uy[id] self.Lx.append(qx) self.Ly.append(qy) self.Ux.pop(id) self.Uy.pop(id) self.clf.X = self.Lx; self.clf.Y = self.Ly self.clf.train() test_accuracy = self.clf.getTestAccuracy( self.Tx, self.Ty ) self.accuracys.append( test_accuracy ) print "i=", i+1, "; acc=%.4f"%(test_accuracy*100), "%.4f"%(np.mean(self.accuracys)*100), "%.4f"%(np.average(self.accuracys, weights = range(1,1+len(self.accuracys)))*100), scores[0] if (i+1)%10 == 0: Util.pickleSave(backupfile, self) viz = Visualize(); viz.plot( [range(len(self.accuracys)), self.accuracys], fig = backupfile+".png", color = 'r', marker = '-' )
def threadFn(self):
    while True:
        refImg, refBox, refClasses, pBoxes, pClasses, pScores, pProposals, pProposalScores, pAllBoxes, pAllScores, pAllClasses = self.queue.get()
        refImg = refImg[0]

        a = Visualize.drawBoxes(refImg, refBox, refClasses, self.dataset.getCaptions(refClasses), self.palette)
        b = Visualize.drawBoxes(refImg, pBoxes, pClasses, self.dataset.getCaptions(pClasses), self.palette, scores=pScores)
        c = Visualize.drawBoxes(refImg, pProposals, None, None, self.palette, scores=pProposalScores * 0.3)
        d = Visualize.drawBoxes(refImg, pAllBoxes, pAllClasses, self.dataset.getCaptions(pAllClasses), self.palette, scores=pAllScores)

        preview = Visualize.tile(2, 2, [a, b, c, d], ["input", "output", "proposals", "all detections"])
        cv2.imwrite(self.opt.name + "/preview/preview.jpg", preview)
        self.queue.task_done()
def plot_colored_signals(self, times, axes, labels, path, figname):
    viz = Visualize()
    signame_labels = [viz.colors[y % len(viz.colors)] for y in labels]

    if len(axes) < len(self.sigReaders):
        return

    for isr, sr in enumerate(self.sigReaders):
        figurename = path + sr.signal_name + "_" + str(time.time()) + figname
        viz.plot([times, axes[isr]], axs_labels=['Time', sr.signal_name], color=signame_labels, fig=figurename)

    figurename = path + "_clustering_projection_AllSignals_" + str(time.time()) + figname
    viz.plot(axes, color=signame_labels, fig=figurename)
def __init__(self, *params):
    Visualize.__init__(self, params)
    self.host = params[0]
    self.port = params[1]
    self.index = params[2]
    if len(params) > 3:
        self.connectionDict = params[-1]
    else:
        self.connectionDict = {}
    # Pass the connection options as keyword arguments
    # (was "*self.connectionDict", which would unpack only the dict keys).
    self.es = Elasticsearch([self.host + ":" + str(self.port)], **self.connectionDict)
def plot(self, fig=None):
    if not self.done():
        return

    viz = Visualize()
    if len(self.X[0]) > 3:
        X = viz.PCA_Transform(list(zip(*self.X)))
    else:
        X = self.X

    unique_labels = np.unique(self.Y)
    clusters = {ul: [] for ul in unique_labels}
    for i in range(len(X)):
        clusters[self.Y[i]].append(X[i])

    centers_for_plot = []  # Not the real centers because dimension was reduced using PCA
    for label in clusters:
        centers_for_plot.append([np.mean(col) for col in list(zip(*clusters[label]))])

    viz.do_plot(list(zip(*centers_for_plot)), marker='o', color='m')
    viz.plot_groups(clusters, fig)
class Explore:
    def __init__(self, data):
        self.data = data
        self.viz = Visualize()

    #---------------------------------------
    def fire(self):
        range_features = range(len(self.data.X_transpose))

        for i in range_features:
            axs = [self.data.X_transpose[i]]
            axs_labels = [self.data.features_name[i]]
            self.viz.plot(axs, axs_labels=axs_labels, color=self.data.Y, marker='.', fig="explore_1D_" + str(i) + ".png")

        pairs = [(i, j) for i in range_features for j in range_features]
        for pair in pairs:
            if pair[0] != pair[1]:
                axs = [self.data.X_transpose[id] for id in pair]
                axs_labels = [self.data.features_name[id] for id in pair]
                self.viz.plot(axs, axs_labels=axs_labels, color=self.data.Y, marker='.', fig="explore_2D_" + str(pair) + ".png")

        triplets = [(i, j, k) for i in range_features for j in range_features for k in range_features]
        for triplet in triplets:
            if triplet[0] != triplet[1] and triplet[1] != triplet[2] and triplet[0] != triplet[2]:
                axs = [self.data.X_transpose[id] for id in triplet]
                axs_labels = [self.data.features_name[id] for id in triplet]
                self.viz.plot(axs, axs_labels=axs_labels, color=self.data.Y, marker='.', fig="explore_3D_" + str(triplet) + ".png")
def __init__(self, opt, runManager, dataset, net, images, boxes, classes):
    self.opt = opt
    self.queue = queue.Queue()
    self.dataset = dataset
    self.palette = Visualize.Palette(dataset.categoryCount())

    predBoxes, predScores, predClasses = net.getBoxes()
    allPredBoxes, allPredScores, allPredClasses = net.getBoxes(scoreThreshold=0)
    proposals, proposalScores = net.getProposals()

    runManager.add("preview",
                   [images, boxes, classes, predBoxes, predClasses, predScores,
                    proposals, proposalScores, allPredBoxes, allPredScores, allPredClasses],
                   modRun=self.opt.displayInterval)

    self.startThread()
def main():
    stop_words = get_stop_words(STOP_WORDS_PATH)
    data = Initialize_Data()
    visualizer = Visualize()

    data.initialize_twitter_posts(TWITTER_POSTS_CSV, TWITTER_DATA_DIR)
    data.initialize_facebook_posts(FACEBOOK_POSTS_CSV, FACEBOOK_DATA_DIR)

    # Visualize data
    df = np.array(data.posts)
    lf = np.array(data.labels)
    pos_ind = lf == "positive"
    neg_ind = lf == "negative"
    pos = df[pos_ind]
    neg = df[neg_ind]
    visualizer.plot_data_distibution([pos.shape[0], neg.shape[0]], ["positive", "negative"], "Training set distribution")

    # Cleanup posts
    text_Cleanuper = Posts_Cleansing(data)
    text_Cleanuper.cleanup(Text_Cleanuper())

    # Train and test model
    clf = train_test_model(create_ngram_model(frozenset(stop_words)), np.array(data.posts), np.array(data.labels) == "positive")

    # Find best model params and train
    clf = grid_search_model(create_ngram_model, np.array(data.posts), np.array(data.labels) == "positive", frozenset(stop_words))

    print('Saving model')
    save_model(clf, NAIVE_BAYES_MODEL_PATH)

    print('Loading model')
    trained_model = load_model(NAIVE_BAYES_MODEL_PATH)
    train_test_model(trained_model, np.array(data.posts), np.array(data.labels) == "positive")

    importance = get_most_important_features(trained_model.named_steps['vect'].vocabulary_.items(), trained_model.named_steps['clf'], 10)
    top_scores = [a[0] for a in importance[0]['tops']]
    top_words = [a[1] for a in importance[0]['tops']]
    bottom_scores = [a[0] for a in importance[0]['bottom']]
    bottom_words = [a[1] for a in importance[0]['bottom']]
    visualizer.plot_important_words(top_scores, top_words, bottom_scores, bottom_words, "Most important words for relevance")

    # Armenian test sentence, roughly "She is a bad girl"
    Y_predicted_word2vec = trained_model.predict(["Նա վատ աղջիկ է"])
    print(Y_predicted_word2vec)
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
    'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
    'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass',
    'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
    'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
    'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
    'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator',
    'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]
# categories=['diningtable', 'person', 'bottle', 'boat', 'train', 'bird', 'dog', 'cat', 'tvmonitor', 'cow', 'car', 'sofa',
#             'horse', 'chair', 'pottedplant', 'bicycle', 'motorbike', 'aeroplane', 'sheep', 'bus']

palette = Visualize.Palette(len(categories))

image = tf.placeholder(tf.float32, [None, None, None, 3])
net = BoxInceptionResnet(image, len(categories), name="boxnet")
boxes, scores, classes = net.getBoxes(scoreThreshold=opt.threshold)

input = PreviewIO.PreviewInput(opt.i)
output = PreviewIO.PreviewOutput(opt.o, input.getFps())


def preprocessInput(img):
    def calcPad(size):
        m = size % 32
        p = int(m / 2)
        s = size - m
def main_fuction(readmidfile=False):
    log = ProcessLog('ClusterModel.log', 1, 'cluster model')
    ProcessLog.loggerName = 'cluster model'
    dataBasePaths = []
    datatemp = dataBasePaths[1:]
    dataBasePaths.append('D:/NJ-KING-CAO3.db')
    distance_matrix = []
    dm = Distance_Matrix()
    V = Visualize()
    moduleLabelDict = {}
    ctkm = None
    ft = lambda x: x.nodesCount <= 25

    # mt=MemoryTest()
    # print('test begin!')
    # mt.test()
    # print('sleep begin!')
    # time.sleep(500)
    # print('sleep end!')
    #
    # def eee():

    pp = Preprocess()
    begin = time.time()
    pp.extractFromFiles_Robert(True, True, treeNumberLimit=5000, nodeLimit=25)
    end = time.time()
    log.getlog().debug("Read file took %f seconds." % (end - begin))
    pp.generateTrees()

    if readmidfile:
        try:
            modulelabelfile = open("ModuleLabel.txt", 'r')
            for l in modulelabelfile:
                row = [x for x in l.split(',')]
                if len(row) == 2:
                    moduleLabelDict[row[0]] = int(row[1])
            modulelabelfile.close()

            # mc = ModuleCluster()
            # feature_names, feature_matrix = mc.moduleStatistic_feature_extraction(pp.moduleStatistic)
            # for epsNumber in range(100):
            #     eps = float((epsNumber + 1) / 100)
            #     labels = mc.cluster(eps)
            #     file_open = open("ModuleLabel_eps=" + str(eps) + ".txt", 'w')
            #     i = 0
            #     for moduleName in mc.moduleNameList:
            #         file_open.write("%s,%d\n" % (moduleName, labels[i]))
            #         i += 1
            #     file_open.close()
            #     print("eps=%f is done." % eps)
            # V.cluster_result_linechart(eps_list, cluster_num_list, noise_num_list)
            # labels = mc.cluster()

            distancefile = open("Distancenew.txt", 'r')
            colnum = 0
            rownum = 0
            row = []
            for l in distancefile:
                row = [float(x) for x in l.split()]
                if len(row) > 0:
                    distance_matrix.append(row)
                    rownum += 1
            print("Distance matrix read. %d trees" % rownum)
            distancefile.close()
            if rownum != len(row):
                raise Exception("Distance file format wrong!")
        except Exception as e:
            print(e)
            # readmidfile=False

    if readmidfile == False:
        mc = ModuleCluster()
        feature_names, feature_matrix = mc.moduleStatistic_feature_extraction(pp.moduleStatistic)
        for epsNumber in range(100):
            eps = float((epsNumber + 1) / 100)
            labels = mc.cluster(eps)
            file_open = open("ModuleLabel_eps=" + str(eps) + ".txt", 'w')
            i = 0
            for moduleName in mc.moduleNameList:
                file_open.write("%s,%d\n" % (moduleName, labels[i]))
                i += 1
            file_open.close()
            print("eps=%f is done." % eps)

        labels = mc.cluster()
        moduleLabelDict = mc.moduleLabelDict
        file_open = open("ModuleLabel5000.txt", 'w')
        i = 0
        for moduleName in mc.moduleNameList:
            file_open.write("%s,%d\n" % (moduleName, labels[i]))
            i += 1
        file_open.close()

        ctkm = Convolution_Tree_Kernel_Mutation(moduleLabelDict)
        begin = time.time()
        distance_matrix = dm.compute([x for j in pp.allTrees for x in filter(ft, j)], ctkm)
        end = time.time()
        log.getlog().debug("Distance computation took %f seconds." % (end - begin))

        file_open = open("Distancenew5000.txt", 'w')
        for r, row in enumerate(distance_matrix):
            for c, col in enumerate(row):
                file_open.write("%.6f " % (col))
            file_open.write("\n")
        file_open.close()

    ctkm = Convolution_Tree_Kernel_Mutation(moduleLabelDict)

    print("test begin!")
    eps_list = []
    cluster_num_list = []
    noise_num_list = []
    for epsNumber in range(100):
        eps = float((epsNumber + 1) / 100)
        tc = TreeCluster(eps=eps, min_samples=5, metric="precomputed", n_jobs=4)
        tc.Train(distance_matrix)
        eps_list.append(eps)
        cluster_num_list.append(tc.clusterNumber)
        noise_num_list.append(tc.noiseNumber)
    # V.cluster_result_linechart(eps_list, cluster_num_list, noise_num_list)

    tc = TreeCluster(eps=0.5, n_jobs=4)
    begin = time.time()
    tc.Train(distance_matrix)
    end = time.time()
    log.getlog().debug("Clustering took %f seconds." % (end - begin))
    V.cluster_result_linechart(range(2000), tc.DB.labels_, tc.DB.labels_)
def plot_graph(self, data=None, iter=None, directory="graph_plots\\"):
    # TODO: this should be generalized and added to Vizualize.py
    viz = Visualize()

    if data is not None:
        viz.do_plot(zip(*data[:iter]), color='y', marker='.')
        # viz.do_plot(zip(*data[:iter]), color=self.data.Y[:iter], marker='.')

    viz.do_plot(zip(*self.get_nodes_positions()), color='r', marker='o')

    for e in self.graph.edges:
        pos_head = e.head.data.pos
        pos_tail = e.tail.data.pos
        viz.do_plot(zip(*[pos_head, pos_tail]), color='r', marker='-')

    if not os.path.exists(directory):
        os.makedirs(directory)
    filename = str(time.time()) + '.png'
    if iter is None:
        viz.end_plot(fig=directory + '_' + filename)
    else:
        viz.end_plot(fig=directory + filename)
from ArtificialData import ArtificialData
from Visualize import Visualize  # Visualize() is instantiated below
import itertools
import os
import math
import random
import datetime
import time
import warnings
import numpy as np
import matplotlib.pyplot as plt2

# =================================================================
if __name__ == '__main__':
    warnings.simplefilter(action="ignore", category=FutureWarning)
    random.seed(1234)
    viz = Visualize()

    # -----------------------------
    # agg = 0     # Always calm
    # agg = 1     # Always aggressive
    agg = None    # Mix of calm periods and aggressive periods

    # Ks = [3, 6, 8, 10]                 # Clusters
    # Ds = [5, 10, 15, 30, 60, 90, 120]  # Duration window
    Ks = [3]   # Clusters
    Ds = [5]
    # Ds = [.5, 1, 5, 15, 60]   # Duration window
    Ps = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.]   # Patterns similarity (difficulty)
    Ns = [3.]   # Noise level
    MODE_NAMES = ["b", "g", "r"]
features_combinations = range(2, len(DATA[0]))
combos = []; qualitiesFSP = []; qualitiesSSP = []

for id_combin, n_features in enumerate(features_combinations):
    clust = Clustering(DATA, scale=True, features=None).gmm(k=gb.K)   # kmeans, dpgmm
    # clust = Clustering(DATA, scale=True, features=n_features).gmm(k=gb.K)
    app.init_clust_tracker(clust, AXES_INFO)

    PLOT_PATH = gb.PLOT_PATH + str(id_combin) + '/'
    if not os.path.exists(PLOT_PATH):
        os.makedirs(PLOT_PATH)
    path = PLOT_PATH + str(id_combin) + '_'

    app.logInformations(id_combin=id_combin, clust=clust, path=path)
    qualityFSP, qualitySSP = app.tracking(path=path)

    combos.append(id_combin)
    qualitiesFSP.append(qualityFSP)
    qualitiesSSP.append(qualitySSP)
    break

print("qualitiesFSP/qualitiesSSP", list(zip(qualitiesFSP, qualitiesSSP)))
Visualize().plot([combos, qualitiesFSP], axs_labels=['Combination (over features)', 'Quality'], marker="-", label="id_combin=" + str(id_combin), fig="plots/quality-combos-" + str(time.time()) + ".png")
Visualize().plot([combos, qualitiesSSP], axs_labels=['Combination (over features)', 'qualitySS'], marker="-", label="id_combin=" + str(id_combin), fig="plots/qualitySS-combos-" + str(time.time()) + ".png")

# -----------------------------
for sr in sigReaders:   # close each reader explicitly (a bare map() would be lazy on Python 3)
    sr.closeDB()
print("FINISH.")
input()
def plot_graph(self, data=None, iter=None, directory="graph_plots\\"):
    # TODO: this should be generalized and added to Vizualize.py
    viz = Visualize()

    if data is not None:
        viz.do_plot(zip(*data[:iter])[:3], color='y', marker='.')
        # viz.do_plot(zip(*data[:iter])[:3], color=self.data.Y[:iter], marker='.')

    viz.do_plot(zip(*self.get_nodes_positions()), color='r', marker='o')

    dis_avg = np.mean([distance.euclidean(edg.head.data.pos, edg.tail.data.pos) for edg in self.gng.graph.edges])
    dis_std = np.std([distance.euclidean(edg.head.data.pos, edg.tail.data.pos) for edg in self.gng.graph.edges])

    for e in self.gng.graph.edges:
        if distance.euclidean(e.head.data.pos, e.tail.data.pos) - dis_avg > 1. * dis_std:
            viz.do_plot(zip(*[e.head.data.pos, e.tail.data.pos]), color='y', marker='-')
        else:
            viz.do_plot(zip(*[e.head.data.pos, e.tail.data.pos]), color='r', marker='-')

    if not os.path.exists(directory):
        os.makedirs(directory)
    filename = str(time.time()) + '.png'
    if iter is None:
        viz.end_plot(fig=directory + '_' + filename)
    else:
        viz.end_plot(fig=directory + filename)
def main():
    stop_words = get_stop_words(STOP_WORDS_PATH)
    data = Initialize_Data()
    visualizer = Visualize()

    data.initialize_twitter_posts(TWITTER_POSTS_CSV, TWITTER_DATA_DIR)
    data.initialize_facebook_posts(FACEBOOK_POSTS_CSV, FACEBOOK_DATA_DIR)

    # Cleanup posts
    text_Cleanuper = Posts_Cleansing(data)
    text_Cleanuper.cleanup(Text_Cleanuper())

    # Divide data into test and train set
    X_train, X_test, Y_train, Y_test = train_test_split(data.posts, data.labels, test_size=0.2, random_state=40)

    # Bag of Words model vectorization
    bag_of_words_model = Bag_Of_Words(X_train)
    bag_of_words_model.build_vectorizer(stop_words)
    X_train_counts = bag_of_words_model.data_counts
    X_test_counts = bag_of_words_model.vectorizer.transform(X_test)

    forest = RandomForestClassifier(n_estimators=100)
    forest = forest.fit(X_train_counts, Y_train)

    y_predicted_counts_train = forest.predict(X_train_counts)
    accuracy, precision, recall, f1 = get_metrics(Y_train, y_predicted_counts_train)
    print("Train accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

    y_predicted_counts = forest.predict(X_test_counts)
    accuracy, precision, recall, f1 = get_metrics(Y_test, y_predicted_counts)
    print("Test accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

    # Find best hyperparams
    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]

    # Create the random grid
    random_grid = {
        'n_estimators': n_estimators,
        'max_features': max_features,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'bootstrap': bootstrap
    }

    # First create the model to tune
    rf = RandomForestClassifier()
    rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
    # Fit the random search model
    rf_random.fit(X_train_counts, Y_train)

    print('Get Best Params')
    print(rf_random.best_params_)

    print('Saving model')
    save_model(rf_random, RANDOM_FOREST_MODEL_PATH)

    print('Load model')
    trained_model = load_model(RANDOM_FOREST_MODEL_PATH)

    y_predicted_counts_train = trained_model.predict(X_train_counts)
    accuracy, precision, recall, f1 = get_metrics(Y_train, y_predicted_counts_train)
    print("Train accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

    y_predicted_counts = trained_model.predict(X_test_counts)
    accuracy, precision, recall, f1 = get_metrics(Y_test, y_predicted_counts)
    print("Test accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
def plot_graph(self, data=None, iter=None, dir="graph_plots\\"):
    if len(self.graph.edges) < 1:
        return
    if not os.path.exists(dir):
        os.makedirs(dir)

    viz = Visualize(); viz2 = Visualize()

    self.separate()
    H, C = self.get_templates()
    hs = [self.edgeHist(e) for e in self.graph.edges]
    COLOR = self.group_by_ref(hs, H, C)
    carac_edges = [self.edge_features(e) for i, e in enumerate(self.graph.edges)]
    # carac_edges = [ self.edge_features(e) + [1. if COLOR[i]=='r' else 0.] for i, e in enumerate(self.graph.edges) ]

    # viz2.MDS_Plot( zip(*hs), dim=2, fig=dir+str(time.time())+'--.png', marker='o', color=COLOR )
    # viz2.MDS_Plot( zip(*carac_edges), dim=2, fig=dir+str(time.time())+'__.png', marker='o', color=COLOR )
    # viz2.plot( zip(*carac_edges), fig=dir+str(time.time())+'__.png', marker='o', color=COLOR )

    '''
    for ih, h in enumerate(H):
        plt.bar(range(len(h)), h, color=C[ih]); plt.savefig(dir+str(time.time())+'.png'); plt.close()

    for ie, e in enumerate(self.graph.edges):
        c = COLOR[ie]
        plt.title('-'.join(str(e) for e in e.head.data.pos)); plt.bar(range(len(hs[ie])), hs[ie], color=c); plt.savefig(dir+str(time.time())+'.png'); plt.close()
    '''

    # '''
    if data is not None:
        viz.do_plot(zip(*data[:iter]), color='y', marker='.')

    matures = [node.data.pos for node in self.graph.nodes if node.data.age > self.mature_age]
    if len(matures) > 0:
        viz.do_plot(zip(*matures), color='r', marker='o')

    embryon = [node.data.pos for node in self.graph.nodes if node.data.age <= self.mature_age]
    if len(embryon) > 0:
        viz.do_plot(zip(*embryon), color='g', marker='o')

    low_dens = [node.data.pos for node in self.graph.nodes if not self.isDenseNode(node)]
    if len(low_dens) > 0:
        viz.do_plot(zip(*low_dens), color='b', marker='o')

    for ie, e in enumerate(self.graph.edges):
        # if e.head.data.age > self.mature_age and e.tail.data.age > self.mature_age:
        # if self.isDenseNode(e.head) and self.isDenseNode(e.tail):
        c = COLOR[ie]
        # c = 'r' if self.isDenseEdgeMean(e) else 'y'
        # c = 'r' if self.isDenseEdgeMean(e) else COLOR[ie]
        # c = 'r' if self.isDenseEdgeMean(e) or e.head.data.label == e.tail.data.label else COLOR[ie]
        viz.do_plot(zip(*[e.head.data.pos, e.tail.data.pos]), color=c, marker='-', lw=1)

    filename = str(time.time()) + '.png'
    if iter is None:
        viz.end_plot(fig=dir + '_' + filename)
    else:
        viz.end_plot(fig=dir + filename)
from Maze import Maze
from Visualize import Visualize
from MazeSolverUCS import MazeSolverUCS
from MazeSolverAStar import MazeSolverAStar
from MazeSolverIDDFS import MazeSolverIDDFS

SIZE = 100

maze = Maze(SIZE)
maze.createMaze()

pathma, op = MazeSolverAStar(maze, 0).aStar()   # Manhattan
pathea, _ = MazeSolverAStar(maze, 1).aStar()    # Euclidean
pathucs = MazeSolverUCS(maze).UCS()             # Uniform cost search
pathi = MazeSolverIDDFS(maze).IDDFS()           # Iterative deepening search

Visualize(maze, None, "Empty Maze").visualizeMaze()                              # Empty maze
Visualize(maze, op, "Optimal Path Maze").visualizeMazeAStar()                    # Optimal path
Visualize(maze, pathma, "AStar with Manhattan Heuristic").visualizeMazeAStar()   # Path found by A* (Manhattan)
Visualize(maze, pathea, "AStar with Euclidian Heuristic").visualizeMazeAStar()   # Path found by A* (Euclidean)
Visualize(maze, pathucs, "UCS").visualizeMaze()                                  # Path found by uniform cost search
Visualize(maze, pathi, "IDDFS").visualizeIDDFS()                                 # Path found by iterative deepening search
def plot_graph(self, data=None, iter=None, directory="graph_plots\\"):
    # TODO: this should be generalized and added to Vizualize.py
    if len(self.graph.edges) < 1:
        return

    viz = Visualize()

    if data is not None:
        viz.do_plot(zip(*data[:iter]), color='y', marker='.')
        viz.do_plot(zip(*data[max(0, iter - 400):iter]), color='y', marker='.')

    matures = [node.data.pos for node in self.graph.nodes if node.data.age > self.mature_age]
    embryon = [node.data.pos for node in self.graph.nodes if node.data.age <= self.mature_age]
    if len(matures) > 0:
        viz.do_plot(zip(*matures), color='r', marker='o')
    if len(embryon) > 0:
        viz.do_plot(zip(*embryon), color='g', marker='o')

    # dis_avg = np.mean([ distance.euclidean(edg.head.data.pos, edg.tail.data.pos) for edg in self.graph.edges ])
    # dis_std = np.std([ distance.euclidean(edg.head.data.pos, edg.tail.data.pos) for edg in self.graph.edges ])
    for e in self.graph.edges:
        # if e.head.data.age > self.mature_age and e.tail.data.age > self.mature_age:
        viz.do_plot(zip(*[e.head.data.pos, e.tail.data.pos]), color='r', marker='-')

    if not os.path.exists(directory):
        os.makedirs(directory)
    filename = str(time.time()) + '.png'
    if iter is None:
        viz.end_plot(fig=directory + '_' + filename)
    else:
        viz.end_plot(fig=directory + filename)
import warnings
import random
from Mode import Mode
from Visualize import Visualize  # Visualize() is instantiated below
import matplotlib.pyplot as plt
# import matplotlib.pylab as plt
import statistics as st
import numpy as np
import SignalReaderArtificial
import app
import Clustering
import globals as gb

if __name__ == '__main__':
    warnings.simplefilter(action="ignore", category=FutureWarning)
    random.seed(1234)
    viz = Visualize()

    modesNum = [2]    # Clusters
    duration = [60]   # Duration window
    patternOverlap = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9, 1.]   # Patterns similarity (difficulty)
    noise = [3.]      # Noise level

    countryside_lowerLimit = 40    # 50
    countryside_higherLimit = 40   # 75
    countryside_waveLength = 100
    countryside_noise = 2

    highway_lowerLimit = 70
    highway_higherLimit = 70
    highway_waveLength = 100
def plot_graph(self, data=None, iter=None, directory="graph_plots\\"):
    # TODO: this should be generalized and added to Vizualize.py
    viz = Visualize()
    colors = ['r', 'b', 'k', 'g', 'm', 'c'] * 1000   # FIXME

    if data is not None:
        viz.do_plot(zip(*data[:iter]), color='y', marker='.')
        # viz.do_plot(zip(*data[:iter]), color=self.data.Y[:iter], marker='.')

    labels = set([n.data.label for n in self.graph.nodes])
    d = {l: [n for n in self.graph.nodes if n.data.label == l] for l in labels}

    for ico, label in enumerate(d):
        viz.do_plot(zip(*[n.data.pos for n in d[label]]), color=colors[ico], marker='o')
        for node in d[label]:
            node_links = [[node.data.pos, n.data.pos] for n in node.neighbors()]
            for nl in node_links:
                viz.do_plot(zip(*nl), color=colors[ico], marker='-')

    dis_avg = np.mean([distance.euclidean(edg.head.data.pos, edg.tail.data.pos) for edg in self.graph.edges])
    dis_std = np.std([distance.euclidean(edg.head.data.pos, edg.tail.data.pos) for edg in self.graph.edges])

    for e in self.graph.edges:
        if e.head.data.label != e.tail.data.label:
            if distance.euclidean(e.head.data.pos, e.tail.data.pos) - dis_avg > 1. * dis_std:
                viz.do_plot(zip(*[e.head.data.pos, e.tail.data.pos]), color='w', marker='-')

    if not os.path.exists(directory):
        os.makedirs(directory)
    filename = str(time.time()) + '.png'
    if iter is None:
        viz.end_plot(fig=directory + '_' + filename)
    else:
        viz.end_plot(fig=directory + filename)
def main():
    stop_words = get_stop_words(STOP_WORDS_PATH)
    data = Initialize_Data()
    visualizer = Visualize()

    data.initialize_twitter_posts(TWITTER_POSTS_CSV, TWITTER_DATA_DIR)
    data.initialize_facebook_posts(FACEBOOK_POSTS_CSV, FACEBOOK_DATA_DIR)

    # Cleanup posts
    text_Cleanuper = Posts_Cleansing(data)
    text_Cleanuper.cleanup(Text_Cleanuper())

    tokenidez_list = get_labeled_list(data.posts, data.labels, stop_words)

    # Divide data into test and train set
    X_train, X_test, Y_train, Y_test = train_test_split(data.posts, data.labels, test_size=0.2, random_state=40)

    # Bag of Words model vectorization
    bag_of_words_model = Bag_Of_Words(X_train)
    bag_of_words_model.build_vectorizer(stop_words)
    X_train_counts = bag_of_words_model.data_counts
    X_test_counts = bag_of_words_model.vectorizer.transform(X_test)

    # Visualize vectorized data
    visualizer.plot_vectorized_data(X_train_counts, np.array(Y_train) == 'positive')

    # Logistic Regression model
    clf = LogisticRegression(C=1.0, class_weight='balanced', solver='liblinear', multi_class='ovr', n_jobs=-1, random_state=40)
    clf.fit(X_train_counts, Y_train)

    print('Saving model')
    save_model(clf, LOG_REG_MODEL_PATH)

    print('Load model')
    trained_model = load_model(LOG_REG_MODEL_PATH)

    # Predict on test data
    y_predicted_counts = trained_model.predict(X_test_counts)

    # Get model score
    accuracy, precision, recall, f1 = get_metrics(Y_test, y_predicted_counts)
    print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

    # Compute word importance
    importance = get_most_important_features(bag_of_words_model.vectorizer.vocabulary_.items(), trained_model, 10)

    # Visualize important features
    top_scores = [a[0] for a in importance[0]['tops']]
    top_words = [a[1] for a in importance[0]['tops']]
    bottom_scores = [a[0] for a in importance[0]['bottom']]
    bottom_words = [a[1] for a in importance[0]['bottom']]
    visualizer.plot_important_words(top_scores, top_words, bottom_scores, bottom_words, "Most important words for relevance")

    # Word2Vec vectorization
    word2vecVectorization = Word_2_Vec(WORD_VEC_FILE)
    word2vecVectorization.build_vectorizer(tokenidez_list)
    X_train_word2vec, X_test_word2vec, Y_train_word2vec, Y_test_word2vec = train_test_split(word2vecVectorization.embeddings, data.labels, test_size=0.2, random_state=40)

    # Visualize data
    visualizer.plot_vectorized_data(word2vecVectorization.embeddings, np.array(data.labels) == 'positive')

    clf_w2v = LogisticRegression(C=30.0, class_weight='balanced', solver='newton-cg', multi_class='multinomial', random_state=40)
    clf_w2v.fit(X_train_word2vec, Y_train_word2vec)

    print('Saving model')
    save_model(clf_w2v, LOG_REG_MODEL_PATH)

    print('Load model')
    clf_w2v = load_model(LOG_REG_MODEL_PATH)

    Y_predicted_word2vec = clf_w2v.predict(X_test_word2vec)
    accuracy_word2vec, precision_word2vec, recall_word2vec, f1_word2vec = get_metrics(Y_test_word2vec, Y_predicted_word2vec)
    print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy_word2vec, precision_word2vec, recall_word2vec, f1_word2vec))

    testLabel = "Խնդիրը վատ է գրված"   # Armenian, roughly "The problem is badly written"
    word2vecVectorization.build_vectorizer([{"tokens": get_tokens_without_stop_words(testLabel.split(" "), stop_words)}])
    Y_predicted_word2vec = clf_w2v.predict(word2vecVectorization.embeddings)
    print(Y_predicted_word2vec)
def train(self, mtd = "margin", backupfile = "backupfile.txt"): for i, x in enumerate(self.Ux): y1 = self.clf.predict_label(x) if mtd == "supervised": informativeness = sys.float_info.max if mtd == "margin": informativeness = self.clf.uncertainty_margin(x) # =============================== id_th = self.mab.choose() self.th = self.mab.algos[ id_th ] print "Choosen =", self.th, "nb_choices =", self.mab.nb_choices, ("avg rwd=", [ np.mean(L) for L in self.mab.rewards ] if self.mab.rewards[0]!=[] else " "), "expected=", sum([ a*l for a,l in zip(self.mab.algos,self.mab.nb_choices) ]) / sum(self.mab.nb_choices) prev_clf = Classification(self.Lx, self.Ly, method = self.clf.method) prev_clf.GAMMA, prev_clf.C = self.clf.GAMMA, self.clf.C; prev_clf.train() # =============================== # avg_rewards = [ np.mean(L[:-20]) if len(L)>0 else 1. for L in self.mab.rewards ] # self.th = sum([ a*l for a,l in zip(self.mab.algos,avg_rewards) ]) / sum(avg_rewards) # print "Choosen =", self.th, "avg rwd=", avg_rewards # =============================== if informativeness > self.th: qx = x qy = self.Uy[i] self.Lx.append(qx) self.Ly.append(qy) self.queried += 1 self.clf.X = self.Lx; self.clf.Y = self.Ly; self.clf.train() # =============================== reward = 1. - abs( 0.1 - self.queried / (i+1.) ) self.mab.update(id_th, reward) # =============================== # for idt in range(len(self.mab.algos)): # reward = 1. - abs( 0.3 - (self.queried-1+1) / (i+1.) ) if informativeness > self.mab.algos[idt] else 1. - abs( 0.4 - (self.queried-1) / (i+1.) ) # self.mab.update(idt, reward) # =============================== self.ths.append( self.th ) self.infos.append( informativeness ) self.accuracys.append( self.clf.getTestAccuracy( self.Tx, self.Ty ) ) self.queries.append( self.queried ) self.sup_infos.append( self.sup_clf.uncertainty_margin(x) ) # TODO should not be here self.sup_clf.X = self.Lx0+self.Ux[:i+1]; self.sup_clf.Y = self.Ly0+self.Uy[:i+1]; self.sup_clf.train() # TODO should not be here self.sup_accuracys.append( self.sup_clf.getTestAccuracy( self.Tx, self.Ty ) ) # TODO should not be here ''' if i>10: # last_infos = self.infos[-100:] if len(self.infos) > 100 else self.infos[:] # self.th = np.mean( last_infos ) if informativeness > self.th: # queried if y1 == qy: # but was correctly predicted self.th = self.th + 0.1 * (informativeness - self.th) else: if y1 != qy: self.th = self.th - 0.1 * (self.th - informativeness ) ''' print "i=", i+1, self.queried, self.queried / (i+1.), "-- acc=%.4f"%(self.accuracys[-1]*100), "%.4f"%(self.sup_accuracys[-1]*100), "-- %.4f"%(np.mean(self.accuracys)*100), "%.4f"%(np.average(self.accuracys, weights = range(1,1+len(self.accuracys)))*100), "--", informativeness if (i+1)%10 == 0: Util.pickleSave(backupfile, self); viz = Visualize() viz.do_plot( [range(len(self.infos)), self.ths], color = 'b', marker = '-' ) viz.do_plot( [range(len(self.infos)), self.infos], color = 'r', marker = '-' ) viz.do_plot( [range(len(self.sup_infos)), self.sup_infos], color = 'y', marker = '-' ) viz.end_plot( fig = backupfile+"_stream_inf.png" ) viz.do_plot( [range(len(self.accuracys)), self.accuracys], color = 'r', marker = '-' ) viz.do_plot( [range(len(self.sup_accuracys)), self.sup_accuracys], color = 'y', marker = '-' ) viz.end_plot( fig = backupfile+"_stream_acc.png" ) viz.do_plot( [range(len(self.queries)), self.queries], color = 'r', marker = '-' ) viz.do_plot( [range(len(self.queries)), range(len(self.queries))], color = 'y', marker = '-' ) viz.end_plot( fig = backupfile+"_stream_lab.png" ) '''
from part2 import part2
from part3 import part3
from part4 import part4
from part5 import part5

# -----------------------------------------------------------------------------
# Load datasets from assignment 1 (unscaled)
# -----------------------------------------------------------------------------
datasets = LoadPreprocessDataset()

# -----------------------------------------------------------------------------
# Visualize DataSet - Scatter Matrix
# -----------------------------------------------------------------------------
if 1:  # set to 1 to enable
    print(82 * '_')
    Visualize(datasets['wifi'])
    print(82 * '_')
    Visualize(datasets['letter'])

# -----------------------------------------------------------------------------
# PART 1 - Run EM and K-means on two datasets
# -----------------------------------------------------------------------------
if 1:  # set to 1 to enable
    print(82 * '_')
    print("PART 1a - Run K-means on two datasets")
    print(82 * '_')
    print()
    part1_km(datasets['wifi'])
    print(82 * '_')
    print()
    part1_km(datasets['letter'])
import random
import numpy as np
from Data import Data
from ActiveLearning import ActiveLearning
import Util
from Visualize import Visualize

#-----------------------------------
if __name__ == "__main__":
    random.seed(12345)

    #-----------------------------------
    viz = Visualize()
    colors = ['y', 'c', 'm', 'b', 'g', 'k', 'r']

    # datasetname = "optdigits"
    datasetname = "pendigits"
    # datasetname = "CNAE9"

    data = Data(source_file=datasetname)
    print "nb data points:", len(data.X), "nb features in data:", data.nb_features

    #-----------------------------------
    '''
    opt = {}
    opt["random"] = Util.pickleLoad('___AL Results\\optdigits\\random\\_optdigits.random.50.opt-10-margin.txt')
    # opt["entropy"] = Util.pickleLoad('___AL Results\\optdigits\\entropy\\_optdigits.entropy.50.opt-10-margin.txt')
    # opt["margin"] = Util.pickleLoad('___AL Results\\optdigits\\margin\\_optdigits.margin.50.opt-10-margin.txt')
    opt["proba"] = Util.pickleLoad('___AL Results\\optdigits\\proba\\_optdigits.proba.50.opt-10-margin.txt')
    # opt["weight"] = Util.pickleLoad('___AL Results\\optdigits\\weight\\_optdigits.weight.50.opt-20-entropy.txt')
    opt["etc_"] = Util.pickleLoad('___AL Results\\optdigits\\etc\\_optdigits.etc_.50.opt-20-margin.txt')
    opt["etc"] = Util.pickleLoad('___AL Results\\optdigits\\etc\\_optdigits.etc.50.opt-20-margin.txt')
def __init__(self, data):
    self.data = data
    self.viz = Visualize()
def sortForInformativeness(self, mtd):
    if mtd in ["etc", "etc_", "expectedErrorReduction", "weight", "optimal", "test", "intuition"]:
        ids, scores = self.sortForInformativeness(self.optimization_method)

    scores = []
    for ix, x in enumerate(self.Ux):
        y1, y2, p1, p2 = self.clf.getMarginInfo(x)

        if mtd == "intuitionM":
            if ix in ids[:self.optimization_limit]:
                informativeness = self.clf.uncertainty_margin(x)
            else:
                informativeness = 0.
        #---------------------------------------------------------
        if mtd == "margin":
            informativeness = self.clf.uncertainty_margin(x)
        #---------------------------------------------------------
        elif mtd == "proba":
            informativeness = self.clf.uncertainty_prediction(x)
        #---------------------------------------------------------
        elif mtd == "entropy":
            informativeness = self.clf.uncertainty_entropy(x)
        #---------------------------------------------------------
        elif mtd == "random":
            informativeness = random.uniform(0., 1.)
        #---------------------------------------------------------
        elif mtd == "weight":
            if ix in ids[:self.optimization_limit]:
                informativeness = self.clf.uncertainty_weight(x, self.Lx, self.Ly)
            else:
                informativeness = 0.
        #---------------------------------------------------------
        elif mtd == "expectedErrorReduction":
            if ix in ids[:self.optimization_limit]:
                sums = 0.
                YP = self.clf.predict(x, all=True)
                YP.sort(key=operator.itemgetter(1), reverse=True)
                for ir, (yy, proba) in enumerate(YP):
                    if ir == 5:
                        break
                    temp_clf = Classification(self.Lx + [x], self.Ly + [yy], method=self.clf.method)
                    temp_clf.GAMMA, temp_clf.C = self.clf.GAMMA, self.clf.C   # TODO FIXME: do it in general not specifically for svm
                    temp_clf.train()
                    e_h1 = sum([temp_clf.uncertainty_entropy(dp) for dp in self.Ux if dp != x])
                    sums += (proba) * e_h1
                informativeness = 1. / sums
            else:
                informativeness = 0.
        #---------------------------------------------------------
        elif mtd == "etc":
            if ix in ids[:self.optimization_limit]:
                temp_clf1 = Classification(self.Lx + [x], self.Ly + [y1], method=self.clf.method)
                temp_clf1.GAMMA, temp_clf1.C = self.clf.GAMMA, self.clf.C
                temp_clf1.train()
                diff1 = sum([abs(temp_clf1.getPredictProba(1, dp) - self.clf.getPredictProba(1, dp)) if temp_clf1.predict_label(dp) != self.clf.predict_label(dp) else 0. for dp in self.Ux if dp != x]) / (len(self.Ux) - 1.)

                temp_clf2 = Classification(self.Lx + [x], self.Ly + [y2], method=self.clf.method)
                temp_clf2.GAMMA, temp_clf2.C = self.clf.GAMMA, self.clf.C
                temp_clf2.train()
                diff2 = sum([abs(temp_clf2.getPredictProba(1, dp) - self.clf.getPredictProba(1, dp)) if temp_clf2.predict_label(dp) != self.clf.predict_label(dp) else 0. for dp in self.Ux if dp != x]) / (len(self.Ux) - 1.)

                informativeness = diff1   # this one is particularly good for rejection (to be confirmed)
                informativeness = diff1 if p1 / (1 + diff1) >= p2 / (1 + diff2) else diff2
                informativeness = p1 * diff1 + p2 * diff2 + 1.
            else:
                informativeness = 0.
        #---------------------------------------------------------
        elif mtd == "etc_":
            if ix in ids[:self.optimization_limit]:
                temp_clf1 = Classification(self.Lx + [x], self.Ly + [y1], method=self.clf.method)
                temp_clf1.GAMMA, temp_clf1.C = self.clf.GAMMA, self.clf.C
                temp_clf1.train()
                diff1 = sum([1. if temp_clf1.predict_label(dp) != self.clf.predict_label(dp) else 0. for dp in self.Ux if dp != x]) / (len(self.Ux) - 1.)

                temp_clf2 = Classification(self.Lx + [x], self.Ly + [y2], method=self.clf.method)
                temp_clf2.GAMMA, temp_clf2.C = self.clf.GAMMA, self.clf.C
                temp_clf2.train()
                diff2 = sum([1. if temp_clf2.predict_label(dp) != self.clf.predict_label(dp) else 0. for dp in self.Ux if dp != x]) / (len(self.Ux) - 1.)

                informativeness = diff1   # this one is particularly good for rejection (to be confirmed)
                informativeness = diff1 if p1 / (1 + diff1) >= p2 / (1 + diff2) else diff2
                informativeness = p1 * diff1 + p2 * diff2 + 1.
            else:
                informativeness = 0.
        #---------------------------------------------------------
        elif mtd == "test":
            if ix in ids[:self.optimization_limit]:
                temp_clf1 = Classification(self.Lx + [x], self.Ly + [y1], method=self.clf.method)
                temp_clf1.GAMMA, temp_clf1.C = self.clf.GAMMA, self.clf.C
                temp_clf1.train()
                diff1 = np.mean([0.] + [abs(temp_clf1.getPredictProba(1, dp) - self.clf.getPredictProba(1, dp)) for dp in self.Ux if temp_clf1.predict_label(dp) != self.clf.predict_label(dp) and dp != x])

                temp_clf2 = Classification(self.Lx + [x], self.Ly + [y2], method=self.clf.method)
                temp_clf2.GAMMA, temp_clf2.C = self.clf.GAMMA, self.clf.C
                temp_clf2.train()
                diff2 = np.mean([0.] + [abs(temp_clf2.getPredictProba(1, dp) - self.clf.getPredictProba(1, dp)) for dp in self.Ux if temp_clf2.predict_label(dp) != self.clf.predict_label(dp) and dp != x])

                informativeness = diff1   # this one is particularly good for rejection (to be confirmed)
                informativeness = diff1 if p1 / (1 + diff1) >= p2 / (1 + diff2) else diff2
                informativeness = p1 * diff1 + p2 * diff2 + 1.
            else:
                informativeness = 0.
        #---------------------------------------------------------
        elif mtd == "intuition":
            if ix in ids[:self.optimization_limit]:
                true_y = self.Uy[self.Ux.index(x)]
                temp_clf = Classification(self.Lx + [x], self.Ly + [true_y], method=self.clf.method)
                temp_clf.GAMMA, temp_clf.C = self.clf.GAMMA, self.clf.C
                temp_clf.train()

                ucts = [temp_clf.getPredictProba(1, dp) - self.clf.getPredictProba(1, dp) for dp in self.Tx]
                ids_ucts = (-np.array(ucts)).argsort()[:50]
                # diff = np.mean( [ 1. if temp_clf.predict_label(dp) != self.clf.predict_label(dp) else 0. for dp in self.Tx ] )
                diff = np.mean([1. if temp_clf.predict_label(dp) != self.clf.predict_label(dp) and idp in ids_ucts else 0. for idp, dp in enumerate(self.Tx)])

                informativeness = diff
            else:
                informativeness = 0.
        #---------------------------------------------------------
        elif mtd == "intuition":   # NOTE: duplicate condition; this branch is unreachable because the previous elif also tests "intuition"
            if ix in ids[:self.optimization_limit]:
                true_y = self.Uy[self.Ux.index(x)]
                temp_clf = Classification(self.Lx + [x], self.Ly + [true_y], method=self.clf.method)
                temp_clf.GAMMA, temp_clf.C = self.clf.GAMMA, self.clf.C
                temp_clf.train()
                # ---------------------
                imp_x = [xdp for xdp in self.Tx if temp_clf.predict_label(xdp) != self.clf.predict_label(xdp)]
                imp_y_hh = [temp_clf.predict_label(xdp) for xdp in self.Tx if temp_clf.predict_label(xdp) != self.clf.predict_label(xdp)]
                if len(set(imp_y_hh)) > 1:
                    # hh = Classification(imp_x, imp_y_hh, method = self.clf.method)
                    hh = Classification(imp_x + [x], imp_y_hh + [true_y], method=self.clf.method, tuning=False)
                    hh.GAMMA, hh.C = self.clf.GAMMA, self.clf.C
                    hh.train()
                else:
                    hh = self.clf
                # ---------------------
                h_inconsistant_truth = 0; hh_inconsistant_truth = 0; hh_inconsistant_h = 0
                h_consistency = []; hh_consistency = []
                for ilx, lx in enumerate(self.Lx):
                    h_consistency.append(self.clf.getProbaOf(self.Ly[ilx], lx))
                    # hh_consistency.append( hh.getProbaOf( self.Ly[ilx], lx ) )
                    hh_consistency.append(hh.getProbaOf(self.Ly[ilx], lx) if hh.predict_label(lx) == self.Ly[ilx] else 0.)
                    if self.clf.predict_label(lx) != self.Ly[ilx]:
                        h_inconsistant_truth += 1.
                    if hh.predict_label(lx) != self.Ly[ilx]:
                        hh_inconsistant_truth += 1.
                    if hh.predict_label(lx) != self.clf.predict_label(lx):
                        hh_inconsistant_h += 1.

                h_consistency = np.mean(h_consistency)
                hh_consistency = np.mean(hh_consistency) if len(set(imp_y_hh)) > 1 else 0.
                consistency_dif = hh_consistency - h_consistency
                # ---------------------
                diff = []; errors = 0.; trues = 0.; impacted = 0; impacted_probs = []
                for idp, dp in enumerate(self.Tx):
                    if temp_clf.predict_label(dp) != self.clf.predict_label(dp):
                        ##################
                        impacted += 1.
                        impacted_probs.append(abs(temp_clf.getPredictProba(1, dp) - self.clf.getPredictProba(1, dp)))
                        if self.Ty[idp] != temp_clf.predict_label(dp):
                            errors += 1.
                        else:
                            trues += 1.

                    # if temp_clf.predict_label(dp) != self.clf.predict_label(dp) and self.Ty[idp]==temp_clf.predict_label(dp): diff.append( 1. )
                    # if temp_clf.predict_label(dp) != self.clf.predict_label(dp) and trues - errors > 0: diff.append( 1. )
                    # if temp_clf.predict_label(dp) != self.clf.predict_label(dp): diff.append( 1. )
                    if temp_clf.predict_label(dp) != self.clf.predict_label(dp):
                        diff.append(1.)
                    else:
                        diff.append(0.)

                diff = np.mean(diff)
                # diff = diff * np.mean(impacted_probs)   # seems to be working ...
                # ---------------------
                # self.viz_A.append( consistency_dif )
                self.viz_A.append(hh_consistency)
                self.viz_B.append(errors)
                self.viz_C.append(trues)
                self.viz_D.append(trues - errors)
                posI = [inb for inb, nbD in enumerate(self.viz_D) if nbD >= 0.]
                self.viz_E.append(impacted)
                self.viz_F.append(np.mean(impacted_probs))

                viz = Visualize(); viz.plot([self.viz_A, self.viz_B], fig="test_errors.png", color='r', marker='o')
                vizu = Visualize(); vizu.plot([self.viz_A, self.viz_C], fig="test_trues.png", color='r', marker='o')
                vizuu = Visualize(); vizuu.plot([self.viz_A, self.viz_D], fig="test_trues_errors.png", color='r', marker='o')
                vizuuu = Visualize(); vizuuu.do_plot([self.viz_A, self.viz_E], color='r', marker='o')
                vizuuu.do_plot([[self.viz_A[inb] for inb in posI], [self.viz_E[inb] for inb in posI]], color='b', marker='o')
                vizuuu.end_plot(fig="impacted.png")

                print hh_consistency, hh_inconsistant_truth, "---", len(imp_x), len(set(imp_y_hh)), "============>", impacted, trues - errors

                informativeness = diff
            else:
                informativeness = 0.
        #---------------------------------------------------------

        scores.append(informativeness)

    ids = (-np.array(scores)).argsort()
    sorted_scores = [scores[id] for id in ids]
    # sorted_scores = [ 1.*scores[id] / sum(scores) for id in ids ]
    return ids, sorted_scores
def plot_graph(self, data=None, iter=None, directory="graph_plots\\"):
    # TODO: this should be generalized and added to Vizualize.py
    viz = Visualize()

    if data is not None:
        viz.do_plot(zip(*data[:iter]), color='y', marker='.')
        # viz.do_plot(zip(*data[:iter]), color=self.data.Y[:iter], marker='.')

    matures = [node.data.pos for node in self.graph.nodes if node.data.age > self.mature_age]
    embryon = [node.data.pos for node in self.graph.nodes if node.data.age <= self.mature_age]
    if len(matures) > 0:
        viz.do_plot(zip(*matures), color='r', marker='o')
    if len(embryon) > 0:
        viz.do_plot(zip(*embryon), color='y', marker='o')

    for e in self.graph.edges:
        pos_head = e.head.data.pos
        pos_tail = e.tail.data.pos
        viz.do_plot(zip(*[pos_head, pos_tail]), color='r', marker='-')

    if not os.path.exists(directory):
        os.makedirs(directory)
    filename = str(time.time()) + '.png'
    if iter is None:
        viz.end_plot(fig=directory + '_' + filename)
    else:
        viz.end_plot(fig=directory + filename)
def vizualize_buses(all_buses, dates_all_buses, dim=2, path="buses_viz/", m='-'):
    Util.mkdir(path)

    # '''
    viz0 = Visualize(); viz1 = Visualize(); viz2 = Visualize(); viz3 = Visualize()
    c = Visualize.colors(len(all_buses))
    D = Util.flatList(all_buses)

    viz1.PCA_Plot(zip(*D), dim=dim, fig=path + "_Buses_All.png", color='b', marker=m)
    X = viz1.PCA_Transform(zip(*D), dim=dim)

    all_buses_transformed = []
    for ib in range(len(all_buses)):
        print ib + 1,
        Xb = [x for i, x in enumerate(X) if D[i] in all_buses[ib]]
        all_buses_transformed.append(Xb)
        viz0.do_plot(zip(*Xb), color=c[ib], marker=m)
        viz1.plot(zip(*Xb), fig=path + "Bus" + str(ib) + ".png", color=c[ib], marker=m)

    viz0.end_plot(fig=path + "_Buses_All_c.png")
    # '''

    window = 30; step = 10
    window_t = datetime.timedelta(days=window); step_t = datetime.timedelta(days=step)

    t = datetime.datetime(year=2011, month=6, day=1)
    while t <= datetime.datetime(year=2015, month=9, day=1):
        viz2.do_plot([[-0.39, 0.39], [-0.39, 0.39]], color='w')

        for ib, bus in enumerate(all_buses_transformed):
            # bus_tt = [x for ix,x in enumerate(bus) if ix < len(dates_all_buses[ib]) and dates_all_buses[ib][ix] > t and dates_all_buses[ib][ix] <= t+window_t]
            bus_tt = [x for ix, x in enumerate(bus) if ix < len(dates_all_buses[ib]) and dates_all_buses[ib][ix] <= t + window_t]
            if len(bus_tt) > 0:
                viz2.do_plot(zip(*bus_tt), color=c[ib], marker=m)

                viz3.do_plot([[-0.39, 0.39], [-0.39, 0.39]], color='w')
                viz3.do_plot(zip(*bus_tt), color=c[ib], marker=m)
                viz3.end_plot(fig=path + "Bus" + str(ib) + "_" + Util.date2str(t + window_t) + ".png")

        viz2.end_plot(fig=path + "_Buses_" + Util.date2str(t + window_t) + ".png")
        t += step_t
    '''