def ada_boost_experiment():
    examples = ID3.data_parsing(FILE_PATH_TRAIN, numeric_cols)

    # Train the boosted ensemble, then evaluate it on the training and test sets.
    iterations = 100
    hypothesis = AdaBoost.ada_boost(examples, iterations, numeric_cols,
                                    missing_identifier)
    ada_results_train = AdaBoost.test_ada_boost_hypothesis(
        hypothesis, FILE_PATH_TRAIN, numeric_cols, missing_identifier)
    ada_results_test = AdaBoost.test_ada_boost_hypothesis(
        hypothesis, FILE_PATH_TEST, numeric_cols, missing_identifier)

    # Convert the per-iteration (correct, total) counts into error rates, both
    # for the boosted ensemble and for the individual trees it is built from.
    ada_train = []
    ada_test = []
    dec_train = []
    dec_test = []
    for t in range(iterations):
        ada_train.append(1 - ada_results_train[t][0] / ada_results_train[t][1])
        ada_test.append(1 - ada_results_test[t][0] / ada_results_test[t][1])
        tree_results = ID3.test_tree(hypothesis[t][0], FILE_PATH_TRAIN,
                                     numeric_cols, missing_identifier)
        dec_train.append(1 - tree_results[0] / tree_results[1])
        tree_results = ID3.test_tree(hypothesis[t][0], FILE_PATH_TEST,
                                     numeric_cols, missing_identifier)
        dec_test.append(1 - tree_results[0] / tree_results[1])

    ada_graph = [(ada_train, "AdaBoost Train"), (ada_test, "AdaBoost Test")]
    GraphUtility.graph(ada_graph, "AdaBoost Data", "Iterations", "Error")

    tree_graph = [(dec_train, "Tree Train"), (dec_test, "Tree Test")]
    GraphUtility.graph(tree_graph, "Decision Tree Data", "Iterations", "Error")
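# Illustrative sketch only (not part of the experiment above): the re-weighting
# step that AdaBoost.ada_boost is assumed to perform each round.  The names
# below (example_weights, predictions, labels) are hypothetical.
import math


def adaboost_weight_update(example_weights, predictions, labels):
    """One round of the standard AdaBoost update.

    Assumes labels and predictions are in {-1, +1}, the weights sum to 1,
    and the weighted error is strictly between 0 and 1.
    """
    err = sum(w for w, p, y in zip(example_weights, predictions, labels) if p != y)
    alpha = 0.5 * math.log((1 - err) / err)
    # Increase the weight of misclassified examples, decrease it for correct ones.
    new_weights = [w * math.exp(-alpha * p * y)
                   for w, p, y in zip(example_weights, predictions, labels)]
    total = sum(new_weights)
    return alpha, [w / total for w in new_weights]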
def lms_experiment():
    file_path_train = "/home/john/PycharmProjects/u1201441_Private_Repository/CS6350_Files/HW2/concrete/train.csv"
    file_path_test = "/home/john/PycharmProjects/u1201441_Private_Repository/CS6350_Files/HW2/concrete/test.csv"

    batch_size = -1  # -1 for full batch descent, or a batch size for stochastic descent.
    iterations = 1000
    learning_constant = 1
    plot_iter = 1000

    # Halve the learning rate until least_mean_squares reports convergence
    # (hypothesis[2]), then evaluate every intermediate weight vector.
    results = []
    for t in range(plot_iter):
        hypothesis = LeastMeanSquares.least_mean_squares(
            file_path_train, batch_size, iterations, learning_constant)
        if hypothesis[2]:
            break
        learning_constant = learning_constant / 2
    r = hypothesis[0]
    hypotheses = hypothesis[1]
    for hyp in hypotheses:
        results.append(LeastMeanSquares.test_lms(hyp, file_path_test))
    print("Gradient Descent - r:", r, "Weight:", hypotheses[-1], "Losses:",
          results)

    # Repeat with batch size 1 for stochastic gradient descent.
    batch_size = 1  # -1 for full batch descent, or a small batch size for stochastic descent.
    results_stoch = []
    for t in range(plot_iter):
        hypothesis = LeastMeanSquares.least_mean_squares(
            file_path_train, batch_size, iterations, learning_constant)
        if hypothesis[2]:
            break
        learning_constant = learning_constant / 2
    r = hypothesis[0]
    hypotheses = hypothesis[1]
    for hyp in hypotheses:
        results_stoch.append(LeastMeanSquares.test_lms(hyp, file_path_test))
    print("Stochastic Gradient Descent - r:", r, "Weight:", hypotheses[-1],
          "Losses:", results_stoch)

    lms_graph = [(results, "Descent"), (results_stoch, "Stochastic Descent")]
    GraphUtility.graph(lms_graph, "LMS_Test", "Loss",
                       "Gradient Descent Iterations")
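# Illustrative sketch only (not the LeastMeanSquares module itself): the batch
# LMS update the experiment relies on.  For weight vector w, learning rate r,
# and squared loss J(w) = 1/2 * sum_i (y_i - w.x_i)^2, one step is
# w <- w - r * grad J(w).  All names here are hypothetical.
import numpy as np


def lms_gradient_step(w, X, y, r):
    """One batch gradient-descent step for least mean squares.

    X is an (n, d) matrix of examples, y an (n,) vector of targets,
    w the current (d,) weight vector.
    """
    residuals = y - X.dot(w)          # y_i - w.x_i for every example
    gradient = -X.T.dot(residuals)    # gradient of 1/2 * sum residuals^2
    return w - r * gradient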
def DegreeClassify(dataPath='../LinkAnalyticsData/UTK_problem/'):
    """ Finds the distribution of the cumulative node and edge weights """
    filenames = ('Moria_2', 'Standelf_2')
    attrs = ('calls', 'texts', 'degree', 'secs')
    for f in filenames:
        # Reading in the Graph
        MG = GU.readData(os.path.join(dataPath, f) + '.graph')
        deg = nx.degree(MG)
def GetEdgeDistributions(dataPath='../LinkAnalyticsData/UTK_problem/'):
    """ Plots the distribution of each edge attribute for every graph. """
    filenames = ('Moria_1.graph', 'Standelf_1.graph')
    attrs = ('calls', 'texts', 'days', 'secs')
    for f in filenames:
        # Reading in the Graph
        MG = GU.readData(os.path.join(dataPath, f))

        # Distribution of each Attribute
        for attr in attrs:
            data = GU.GetAttr(MG, attr)

            # Plotting
            pyplot.hist(data, 100)
            pyplot.yscale('log')
            pyplot.grid(True)
            pyplot.ylabel("Frequency")
            pyplot.xlabel(attr)
            name = f.split('.')[0].split('_')[0]
            title = name + " " + attr + " distribution"
            pyplot.title(title)
            pyplot.savefig(name + "_" + attr + "_distribution.png")
            pyplot.clf()
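# Illustrative sketch of what GU.GetAttr is assumed to do: collect one named
# attribute from every edge of the (multi)graph so it can be histogrammed.
# GetAttr itself lives in GraphUtility; this helper is only an assumption.
def get_edge_attribute_values(graph, attr):
    """Return a list with the value of `attr` on every edge of `graph`."""
    return [edata[attr] for _, _, edata in graph.edges(data=True)]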
def readData(filename='../LinkAnalyticsData/UTK_problem/Moria_1.graph'):
    """ Creates a dataset for ANN training from the formatted data supplied by filename.
    Currently based on 4x2 inputs of days, calls, call duration, and texts.
    Two 'classes' are implemented, either there or not. """
    numInputs = 2 + 4 + 2
    alldata = ClassificationDataSet(numInputs, 1, nb_classes=2)

    MG = GU.readData(filename)
    closeness = nx.closeness_centrality(MG)
    degree = nx.degree(MG)

    startTime = datetime.now()
    # Computing the data: one sample per edge, combining the two endpoints'
    # centralities with the edge's call/text statistics.
    data = [[closeness[u], degree[u],
             edata['calls'], edata['secs'], edata['texts'], edata['days'],
             degree[v], closeness[v]]
            for u, v, edata in MG.edges(data=True)]
    for d in data:
        alldata.addSample(d, [1])
    print "Converted to data in ", (datetime.now() - startTime)
    return alldata
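# Illustrative sketch (an assumption, not code from this repository): how the
# ClassificationDataSet returned by readData might be fed to a small PyBrain
# network.  Note that readData only adds positive samples (label [1]); a real
# classifier would also need negative examples.
from pybrain.tools.shortcuts import buildNetwork
from pybrain.supervised.trainers import BackpropTrainer


def trainOnGraphData(filename):
    alldata = readData(filename)
    net = buildNetwork(alldata.indim, 5, alldata.outdim)  # one hidden layer of 5 units
    trainer = BackpropTrainer(net, dataset=alldata)
    trainer.trainEpochs(10)                               # a handful of passes over the data
    return net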
def GetDataDistributions(dataPath='../LinkAnalyticsData/UTK_problem/'):
    """ Finds the distribution of the cumulative node and edge weights """
    filenames = ('Moria_1.graph', 'Standelf_1.graph')
    attrs = ('calls', 'texts', 'degree', 'secs')
    for f in filenames:
        # Reading in the Graph
        MG = GU.readData(os.path.join(dataPath, f))
        g = GU.ConvertToSingle(MG)

        for attr in attrs:
            x = list()
            for n in g.nodes():
                x.append(g.node[n][attr])

            # Plotting the Data with log-spaced bins
            largest = heapq.nlargest(3, x)
            pyplot.figure()
            pyplot.hist(x, bins=np.logspace(1, np.log2(largest[2]), 25, base=2))
            pyplot.ylabel("Frequency")
            pyplot.xlabel(attr)
            name = f.split('.')[0].split('_')[0]
            title = name + " " + attr + " distribution"
            pyplot.title(title)
            pyplot.savefig(name + "_" + attr + "_cum_distribution.png")
def analyze(dirname, directory):
    start = TimeUtility.start()
    anu = an.Abbreviations()
    rootNode = Node(dirname)
    file_paths = []  # List which will store all of the full filepaths.
    fileFxnDictionary = {}
    fileImportDictionary = {}
    fileClassDictionary = {}
    callerFxnArgumentsDictionary = {}
    callerCalleeFxn = {}
    calleFxnArguments = {}
    fileFxnCount = {}
    fileImportCount = {}
    fileClassCount = {}
    uniqueImports = []
    callerCalleePath = []
    fxnGraph = nx.DiGraph()
    fxnGraphFull = nx.DiGraph()
    fxnList = []
    classList = []
    importList = []

    for root, directories, files in os.walk(directory):  # Walk the tree.
        for filename in files:
            # Join the two strings in order to form the full filepath.
            filepath = root.replace("\\", "/") + "/" + filename
            file_paths.append(filepath)  # Add it to the list.
            if not filepath.endswith(".py"):
                continue
            localFileNode = Node(filepath.replace(directory, ""), parent=rootNode)
            with open(filepath, "r", encoding="utf8") as source:
                words = source.read().split()

            # Per-file state for the token scan below.
            variable = []
            functionName = []
            className = []
            importModules = []
            isNextWordfxn = False
            isNextWordclass = False
            isNextWordImport = False
            isNewWord = False
            classChildren = False
            fxnChildren = False
            mainFxnNode = None
            clsNode = None

            # Scan the file token by token, picking up the word that follows
            # each "def", "class" and "import" keyword.
            for word in words:
                if stopWordsRemoval.isStopWord(word) or len(word) <= 2:
                    continue
                if isNextWordclass:
                    if "(" in word:
                        className.append(word)
                        classChildren = True
                        clsNode = Node("class:" + word, parent=localFileNode)
                    isNextWordclass = False
                elif isNextWordfxn:
                    print("fxnname", word)
                    if "(" in word:
                        arg = word.split("(")[1]
                        word = word.split("(")[0]
                    functionName.append(word.lower())
                    isNextWordfxn = False
                    if classChildren and (not fxnChildren):
                        mainFxnNode = Node("Fxn:" + word, parent=clsNode)
                    elif fxnChildren:
                        fxnNode = Node("Fxn:" + word, parent=mainFxnNode)
                        # Record the nested function under its parent function.
                        callerCalleeFxn[mainFxnNode.name] = [word]
                    else:
                        fxnNode = Node("Fxn:" + word, parent=localFileNode)
                    fxnChildren = True
                elif isNextWordImport:
                    importModules.append(word)
                    isNextWordImport = False
                    importNode = Node("Import:" + word, parent=localFileNode)

                if word == "def":
                    isNextWordfxn = True
                    isNewWord = True
                    fxnChildren = False
                elif word == "class":
                    isNextWordclass = True
                    isNewWord = True
                    fxnChildren = False
                elif word == "import":
                    isNextWordImport = True
                    isNewWord = True
                elif checkWordForValidFunction(word) and fxnChildren:
                    print("got new function lets see::::", word)
                    if fxnChildren and mainFxnNode is not None:
                        fxnNode = Node("Fxn:" + word, parent=mainFxnNode)
                        callerCalleeFxn[mainFxnNode.name] = [word]
                        fxnGraph.add_edge(anu.get(su.sanitize(mainFxnNode.name)),
                                          anu.get(su.sanitize(word)))
                        fxnGraphFull.add_edge(su.sanitize(mainFxnNode.name),
                                              su.sanitize(word))

            # Per-file summaries: functions, classes and imports found.
            if len(functionName) != 0:
                fileFxnDictionary.update(
                    {filepath.replace(directory, ""): set(functionName)})
                fileFxnCount[filepath.replace(directory, "")] = len(set(functionName))
            if len(className) != 0:
                fileClassDictionary.update(
                    {filepath.replace(directory, ""): set(className)})
                fileClassCount[filepath.replace(directory, "")] = len(set(className))
            if len(importModules) != 0:
                fileImportDictionary.update(
                    {filepath.replace(directory, ""): set(importModules)})
                fileImportCount[filepath.replace(directory, "")] = len(set(importModules))
                uniqueImports.append(importModules)

    # Write the collected information to Excel workbooks.
    workbook = xlsxwriter.Workbook(dirname + "data" + ".xlsx")
    workbook1 = xlsxwriter.Workbook(dirname + "function" + ".xlsx")
    workbook2 = xlsxwriter.Workbook(dirname + "count" + ".xlsx")
    ExcelUtility.writeToExcel(callerCalleeFxn, "CallerCalleFxn", workbook1)
    ExcelUtility.writeToExcel(anu.shortNames, "FxnAbbre.", workbook1)
    ExcelUtility.writeToExcel(fileFxnDictionary, "functionInfo", workbook)
    ExcelUtility.writeToExcel(fileClassDictionary, "classInfo", workbook)
    ExcelUtility.writeToExcel(fileImportDictionary, "importInfo", workbook)
    ExcelUtility.writeToExcelCount(fileFxnCount, "fxncount", workbook2)
    ExcelUtility.writeToExcelCount(fileImportCount, "importcount", workbook2)
    ExcelUtility.writeToExcelCount(fileClassCount, "classcount", workbook2)

    dumpclean(callerCalleeFxn)
    print("tree:")
    PrintUtility.printTree(rootNode)
    print("Unique Imports are:", len(uniqueImports))

    gu.getAllPaths(fxnGraph, True, dirname)
    gu.getAllPathsWithoutAbbreviations(fxnGraphFull, True, dirname)

    # Visualize the paths and get all the optimized call paths, i.e. which
    # function calls which function and so on (path1: f1 f2 f3 f4 f5 f6 f7).
    # The results are saved to an Excel file.
    filename = dirname + 'pathsoptimized.xlsx'
    all_paths = gu.getAllOptimizedPaths(fxnGraph, True, filename, dirname)
    gu.getAllOptimizedPathsWithoutAbbreviations(
        fxnGraphFull, True, dirname + "pathsoptimizedNoAbbre.xlsx", dirname)
    print("anu.counter:", anu.counter)

    # Encode the paths, build the similarity matrix and cluster it.
    callerMatrix = ceu.encodeValues(all_paths, anu.counter, dirname)
    pathMatrix = cs.getResultSimilarityMatrix(callerMatrix, dirname + "similarity.csv")
    cluster.test(dirname, pathMatrix)
    TimeUtility.end(start)

    PrintUtility.printTree(rootNode)
    return rootNode, pathMatrix
def credit_experiment():
    file_path = "/home/john/PycharmProjects/u1201441_Private_Repository/CS6350_Files/HW2/credit/default of credit card clients.csv"
    numeric_cols = [0, 4, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
    missing_identifier = None
    training_data = []
    test_data = []

    # Relabel the numeric class column as "yes"/"no" so the tree code can use it.
    data = ID3.data_parsing(file_path, numeric_cols)
    LABEL_INDEX = len(data[0]) - 2
    for instance in data:
        if instance[LABEL_INDEX] == '1':
            instance[LABEL_INDEX] = "yes"
        else:
            instance[LABEL_INDEX] = "no"

    # Split the data: examples with index below 6000 are held out for testing.
    test_indices = random.sample(range(len(data)), len(data))
    for i in test_indices:
        if i < 6000:
            test_data.append(data[i])
        else:
            training_data.append(data[i])

    iterations = 100
    feature_size = 4

    # Train each model once on the training split.
    decision_tree = ID3.build_decision_tree(
        training_data,
        max_depth=-1,
        info_gain_type=1,
        numeric_cols=numeric_cols,
        missing_identifier=missing_identifier)
    adaboost = AdaBoost.ada_boost(training_data,
                                  iterations=iterations,
                                  numeric_cols=numeric_cols,
                                  missing_identifier=missing_identifier)
    bagged_tree = BaggedTrees.bagged_trees(
        training_data,
        iterations=iterations,
        sample_size=100,
        numeric_cols=numeric_cols,
        missing_identifier=missing_identifier)
    forest = RandomForest.random_forest(training_data,
                                        iterations=iterations,
                                        sample_size=100,
                                        numeric_cols=numeric_cols,
                                        missing_identifier=missing_identifier,
                                        feature_size=feature_size)

    # Decision Tree results
    tree_results = ID3.test_tree(decision_tree, training_data, numeric_cols,
                                 missing_identifier)
    tree_train = 1 - tree_results[0] / tree_results[1]
    tree_results = ID3.test_tree(decision_tree, test_data, numeric_cols,
                                 missing_identifier)
    tree_test = 1 - tree_results[0] / tree_results[1]
    # Constant lines so the single tree's error can be drawn on every chart.
    tree_train_ln = []
    tree_test_ln = []
    for t in range(iterations):
        tree_train_ln.append(tree_train)
        tree_test_ln.append(tree_test)

    # AdaBoost results
    ada_results_train = AdaBoost.test_ada_boost_hypothesis(
        adaboost, training_data, numeric_cols, missing_identifier)
    ada_results_test = AdaBoost.test_ada_boost_hypothesis(
        adaboost, test_data, numeric_cols, missing_identifier)
    ada_train = []
    ada_test = []
    for t in range(iterations):
        ada_train.append(1 - ada_results_train[t][0] / ada_results_train[t][1])
        ada_test.append(1 - ada_results_test[t][0] / ada_results_test[t][1])
    ada_graph = [(ada_train, "AdaBoost Train"), (ada_test, "AdaBoost Test"),
                 (tree_train_ln, "Tree Train"), (tree_test_ln, "Tree Test")]
    GraphUtility.graph(ada_graph, "AdaBoost Data", "Iterations", "Error")

    # Bagging results
    results_train = BaggedTrees.test_bagged_tree_hypothesis(
        bagged_tree, training_data, numeric_cols, missing_identifier)
    results_test = BaggedTrees.test_bagged_tree_hypothesis(
        bagged_tree, test_data, numeric_cols, missing_identifier)
    # Charts
    bag_train = []
    bag_test = []
    for t in range(iterations):
        bag_train.append(1 - results_train[t][0] / results_train[t][1])
        bag_test.append(1 - results_test[t][0] / results_test[t][1])
    bag_graph = [(bag_train, "Bagging Train"), (bag_test, "Bagging Test"),
                 (tree_train_ln, "Tree Train"), (tree_test_ln, "Tree Test")]
    GraphUtility.graph(bag_graph, "Bagged Tree Data", "Num Trees", "Error")

    # Forest Results
    results_train = RandomForest.test_random_forest_hypothesis(
        forest, training_data, numeric_cols, missing_identifier)
    results_test = RandomForest.test_random_forest_hypothesis(
        forest, test_data, numeric_cols, missing_identifier)
    # Charts
    forest_train = []
    forest_test = []
    for t in range(iterations):
        forest_train.append(1 - results_train[t][0] / results_train[t][1])
        forest_test.append(1 - results_test[t][0] / results_test[t][1])
    forest_graph = [
        (forest_train, "Forest Train - " + str(feature_size) + " features"),
        (forest_test, "Forest Test - " + str(feature_size) + " features"),
        (tree_train_ln, "Tree Train"),
        (tree_test_ln, "Tree Test")
    ]
    GraphUtility.graph(forest_graph, "Random Forest Data", "Num Trees", "Error")
def random_forest_experiment():
    examples = ID3.data_parsing(FILE_PATH_TRAIN, numeric_cols)
    LABEL_INDEX = len(examples[0]) - 2
    iterations = 100
    sample_size = int(len(examples) / 4)

    # Train and chart a forest for several feature-subset sizes.
    for feature_size in [2, 4, 6]:
        hypothesis = RandomForest.random_forest(examples, iterations,
                                                sample_size, numeric_cols,
                                                missing_identifier,
                                                feature_size)
        results_train = RandomForest.test_random_forest_hypothesis(
            hypothesis, FILE_PATH_TRAIN, numeric_cols, missing_identifier)
        results_test = RandomForest.test_random_forest_hypothesis(
            hypothesis, FILE_PATH_TEST, numeric_cols, missing_identifier)

        # Charts
        forest_train = []
        forest_test = []
        for t in range(iterations):
            forest_train.append(1 - results_train[t][0] / results_train[t][1])
            forest_test.append(1 - results_test[t][0] / results_test[t][1])
        forest_graph = [
            (forest_train, "Forest Train - " + str(feature_size) + " features"),
            (forest_test, "Forest Test - " + str(feature_size) + " features")
        ]
        GraphUtility.graph(forest_graph, "Random Forest Data", "Num Trees",
                           "Error")

    # Bias/Variance: train many forests on random subsamples of the data.
    iterations = 100
    forest = []
    trees = []
    sample_size = 100
    feature_size = 2
    for t in range(iterations):
        data = random.sample(examples, 1000)
        forest.append(
            RandomForest.random_forest(data, iterations, sample_size,
                                       numeric_cols, missing_identifier,
                                       feature_size))
        trees.append(forest[t][0][0])

    # Bias/Variance of individual trees.
    biases = []
    variances = []
    for instance in examples:
        labels = []  # reset per instance so each term uses only this instance's predictions
        for tree in trees:
            label = ID3.get_label(tree, instance, numeric_cols,
                                  missing_identifier)
            labels.append(1 if label == "yes" else -1)
        if instance[LABEL_INDEX] == "yes":
            true_label = 1
        else:
            true_label = -1
        avg = numpy.average(labels)
        biases.append((avg - true_label)**2)
        variances.append(numpy.var(labels))
    tree_bias = numpy.average(biases)
    tree_variance = numpy.average(variances)

    # Bias/Variance of the full forests.
    biases = []
    variances = []
    for instance in examples:
        labels = []
        for tree in forest:
            label = RandomForest.get_label(tree, instance, numeric_cols,
                                           missing_identifier)
            labels.append(1 if label == "yes" else -1)
        true_label = 1 if instance[LABEL_INDEX] == "yes" else -1
        avg = numpy.average(labels)
        biases.append((avg - true_label)**2)
        variances.append(numpy.var(labels))
    forest_bias = numpy.average(biases)
    forest_variance = numpy.average(variances)

    print("Tree Bias:", "{0:.3}".format(tree_bias), "Tree Variance:",
          "{0:.3}".format(tree_variance))
    print("Forest Bias:", "{0:.3}".format(forest_bias), "Forest Variance:",
          "{0:.3}".format(forest_variance))
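# Illustrative sketch (a hypothetical helper, not RandomForest itself): the
# step that distinguishes a random forest from plain bagging.  At every split
# only a random subset of `feature_size` attributes is considered.
import random


def sample_candidate_features(available_features, feature_size):
    """Pick the attributes a single split is allowed to test."""
    if len(available_features) <= feature_size:
        return list(available_features)
    return random.sample(list(available_features), feature_size)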
def bagged_trees_experiment():
    examples = ID3.data_parsing(FILE_PATH_TRAIN, numeric_cols)
    LABEL_INDEX = len(examples[0]) - 2
    iterations = 100
    sample_size = int(len(examples) / 2)

    hypothesis = BaggedTrees.bagged_trees(examples, iterations, sample_size,
                                          numeric_cols, missing_identifier)
    results_train = BaggedTrees.test_bagged_tree_hypothesis(
        hypothesis, FILE_PATH_TRAIN, numeric_cols, missing_identifier)
    results_test = BaggedTrees.test_bagged_tree_hypothesis(
        hypothesis, FILE_PATH_TEST, numeric_cols, missing_identifier)

    # Charts
    bag_train = []
    bag_test = []
    for t in range(iterations):
        bag_train.append(1 - results_train[t][0] / results_train[t][1])
        bag_test.append(1 - results_test[t][0] / results_test[t][1])
    bag_graph = [(bag_train, "Bagging Train"), (bag_test, "Bagging Test")]
    GraphUtility.graph(bag_graph, "Bagged Tree Data", "Num Trees", "Error")

    # Bias/Variance calculations: train many bagged ensembles on subsamples.
    iterations = 100
    bagged_trees = []
    trees = []
    sample_size = 100
    for t in range(iterations):
        data = random.sample(examples, 1000)
        bagged_trees.append(
            BaggedTrees.bagged_trees(data, iterations, sample_size,
                                     numeric_cols, missing_identifier))
        trees.append(bagged_trees[t][0][0])

    # Bias/Variance of individual trees.
    biases = []
    variances = []
    for instance in examples:
        labels = []  # reset per instance so each term uses only this instance's predictions
        for tree in trees:
            label = ID3.get_label(tree, instance, numeric_cols,
                                  missing_identifier)
            labels.append(1 if label == "yes" else -1)
        if instance[LABEL_INDEX] == "yes":
            true_label = 1
        else:
            true_label = -1
        avg = numpy.average(labels)
        biases.append((avg - true_label)**2)
        variances.append(numpy.var(labels))
    tree_bias = numpy.average(biases)
    tree_variance = numpy.average(variances)

    # Bias/Variance of the bagged ensembles.
    biases = []
    variances = []
    for instance in examples:
        labels = []
        for tree in bagged_trees:
            label = BaggedTrees.get_label(tree, instance, numeric_cols,
                                          missing_identifier)
            labels.append(1 if label == "yes" else -1)
        true_label = 1 if instance[LABEL_INDEX] == "yes" else -1
        avg = numpy.average(labels)
        biases.append((avg - true_label)**2)
        variances.append(numpy.var(labels))
    bag_bias = numpy.average(biases)
    bag_variance = numpy.average(variances)

    print("Tree Bias:", "{0:.3}".format(tree_bias), "Tree Variance:",
          "{0:.3}".format(tree_variance))
    print("Bagged Bias:", "{0:.3}".format(bag_bias), "Bagged Variance:",
          "{0:.3}".format(bag_variance))
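# Illustrative sketch (a hypothetical helper, not BaggedTrees itself): bagging
# trains each tree on a bootstrap sample, i.e. `sample_size` examples drawn
# from the training set with replacement.
import random


def bootstrap_sample(examples, sample_size):
    """Draw a bootstrap sample of the training data."""
    return [random.choice(examples) for _ in range(sample_size)]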
import networkx as nx
import StringUtility as su
import AbbreviatedNamesUtility as AN
import GraphUtility as gu
import OneHotEncodingUtility as heu

fxnGraph = nx.DiGraph()
anu = AN.Abbreviations()
fxnGraph.add_edge(anu.get(su.sanitize("abc")), anu.get(su.sanitize("def")))
fxnGraph.add_edge(anu.get(su.sanitize("def")), anu.get(su.sanitize("ghj")))
fxnGraph.add_edge(anu.get(su.sanitize("asd")), anu.get(su.sanitize("dds")))
fxnGraph.add_edge(anu.get(su.sanitize("sas")), anu.get(su.sanitize("sada")))
fxnGraph.add_edge(anu.get(su.sanitize("sas")), anu.get(su.sanitize("asdx")))

gu.visualize_to_dot(fxnGraph, "test2.dot")
gu.visualize_to_png("test2.dot", "test2.png")

all_paths = gu.getAllPaths(fxnGraph, True)
#all_paths = gu.getAllOptimizedPaths(fxnGraph, True)
print("List size:", len(all_paths))
for sublist in all_paths:
    print(sublist)
    heu.getLabelEncoder(sublist)
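# Illustrative sketch (an assumption about OneHotEncodingUtility, which is not
# shown here): each call path is a sequence of function names, and a label
# encoder maps those names to integer ids before any one-hot encoding.
# scikit-learn's LabelEncoder is used purely as an example.
from sklearn.preprocessing import LabelEncoder


def encode_path(path):
    """Map the function names in one call path to integer labels."""
    encoder = LabelEncoder()
    return encoder.fit_transform(path)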