Example #1
def ada_boost_experiment():
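    """Train AdaBoost for 100 rounds and plot, per round, the ensemble's
    train/test error alongside the error of each individual weak tree."""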
    examples = ID3.data_parsing(FILE_PATH_TRAIN, numeric_cols)

    iterations = 100

    hypothesis = AdaBoost.ada_boost(examples, iterations, numeric_cols,
                                    missing_identifier)

    ada_results_train = AdaBoost.test_ada_boost_hypothesis(
        hypothesis, FILE_PATH_TRAIN, numeric_cols, missing_identifier)
    ada_results_test = AdaBoost.test_ada_boost_hypothesis(
        hypothesis, FILE_PATH_TEST, numeric_cols, missing_identifier)

    ada_train = []
    ada_test = []
    dec_train = []
    dec_test = []

    for t in range(iterations):
        ada_train.append(1 - ada_results_train[t][0] / ada_results_train[t][1])
        ada_test.append(1 - ada_results_test[t][0] / ada_results_test[t][1])
        tree_results = ID3.test_tree(hypothesis[t][0], FILE_PATH_TRAIN,
                                     numeric_cols, missing_identifier)
        dec_train.append(1 - tree_results[0] / tree_results[1])
        tree_results = ID3.test_tree(hypothesis[t][0], FILE_PATH_TEST,
                                     numeric_cols, missing_identifier)
        dec_test.append(1 - tree_results[0] / tree_results[1])

    ada_graph = [
        (ada_train, "AdaBoost Train"),
        (ada_test, "AdaBoost Test")
    ]
    GraphUtility.graph(ada_graph, "AdaBoost Data", "Iterations", "Error")

    tree_graph = [
        (dec_train, "Tree Train"),
        (dec_test, "Tree Test")
    ]
    GraphUtility.graph(tree_graph, "Decision Tree Data", "Iterations", "Error")
Example #2
def lms_experiment():
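    """Fit LMS regression with full-batch and with stochastic gradient
    descent, halving the learning rate until the solver converges, then
    plot the test losses of both runs."""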

    file_path_train = "/home/john/PycharmProjects/u1201441_Private_Repository/CS6350_Files/HW2/concrete/train.csv"
    file_path_test = "/home/john/PycharmProjects/u1201441_Private_Repository/CS6350_Files/HW2/concrete/test.csv"

    batch_size = -1  # -1 for full batch descent, or a batch size for stochastic descent.
    iterations = 1000
    learning_constant = 1

    plot_iter = 1000  # Maximum number of learning-rate halvings to attempt.

    results = []

    for t in range(plot_iter):
        hypothesis = LeastMeanSquares.least_mean_squares(
            file_path_train, batch_size, iterations, learning_constant)
        if hypothesis[2]:  # convergence flag returned by the solver
            break
        learning_constant = learning_constant / 2

    r = hypothesis[0]  # learning rate that achieved convergence
    hypotheses = hypothesis[1]  # sequence of weight vectors

    for hyp in hypotheses:
        results.append(LeastMeanSquares.test_lms(hyp, file_path_test))

    print("Gradient Descent - r:", r, "Weight:", hypotheses[-1], "Losses:",
          results)

    batch_size = 1  # Batch size of 1 gives stochastic gradient descent.
    # Note: learning_constant carries over from the batch run above rather
    # than being reset to 1.
    results_stoch = []

    for t in range(plot_iter):
        hypothesis = LeastMeanSquares.least_mean_squares(
            file_path_train, batch_size, iterations, learning_constant)
        if hypothesis[2]:
            break
        learning_constant = learning_constant / 2

    r = hypothesis[0]
    hypotheses = hypothesis[1]

    for hyp in hypotheses:
        results_stoch.append(LeastMeanSquares.test_lms(hyp, file_path_test))

    print("Stochastic Gradient Descent - r:", r, "Weight:", hypotheses[-1],
          "Losses:", results_stoch)

    lms_graph = [
        (results, "Descent"),
        (results_stoch, "Stochastic Descent")
    ]

    GraphUtility.graph(lms_graph, "LMS_Test", "Gradient Descent Iterations",
                       "Loss")
Example #3
def DegreeClassify(dataPath='../LinkAnalyticsData/UTK_problem/'):
    """ Finds the distribution of the cumulative node and edge weights """
    filenames = ('Moria_2', 'Standelf_2')
    attrs = ('calls', 'texts', 'degree', 'secs')
    for f in filenames:
        # Reading in the Graph
        MG = GU.readData(os.path.join(dataPath,f)+'.graph')
        deg = nx.degree(MG)
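        # The snippet ends here without using `deg`; a minimal, assumed
        # completion (mirroring GetEdgeDistributions below) would be:
        pyplot.hist(list(dict(deg).values()), 100)
        pyplot.yscale('log')
        pyplot.xlabel("degree")
        pyplot.ylabel("Frequency")
        pyplot.title(f + " degree distribution")
        pyplot.savefig(f + "_degree_distribution.png")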
Example #4
def GetEdgeDistributions(dataPath='../LinkAnalyticsData/UTK_problem/'):
    """ Plots a log-scale histogram of each edge attribute of every graph """
    filenames = ('Moria_1.graph', 'Standelf_1.graph')
    attrs = ('calls', 'texts', 'days', 'secs')
    for f in filenames:
        # Reading in the Graph
        MG = GU.readData(os.path.join(dataPath,f))
        # Distribution of each Attribute
        for attr in attrs:
            data = GU.GetAttr(MG, attr)
            # Plotting
            pyplot.hist(data, 100)
            pyplot.yscale('log')
            pyplot.grid(True)
            pyplot.ylabel("Frequency")
            pyplot.xlabel(attr)
            name = f.split('.')[0].split('_')[0]
            title = name + " " + attr + " distribution"
            pyplot.title(title)
            pyplot.savefig(name + "_" + attr + "_distribution.png")
            pyplot.clf()
Example #5
def readData(filename='../LinkAnalyticsData/UTK_problem/Moria_1.graph'):
    """ Creates a dataset for ANN training from the formatted data supplied by filename.
    Currently based on 4x2 inputs of days, calls, call duration, and texts.
    Two 'classes' are implemented: either there or not. """
    numInputs = 2 + 4 + 2  # (closeness, degree) per endpoint plus four edge attributes
    alldata = ClassificationDataSet(numInputs, 1, nb_classes=2)
    MG = GU.readData(filename)
    closeness = nx.closeness_centrality(MG)
    degree = nx.degree(MG)

    startTime = datetime.now()
    # Computing the data
    data = [[closeness[u], degree[u],
             edata['calls'], edata['secs'], edata['texts'], edata['days'],
             degree[v], closeness[v]]
            for u, v, edata in MG.edges(data=True)]
    for d in data:
        alldata.addSample(d, [1])  # every observed edge is a positive example

    print("Converted to data in", (datetime.now() - startTime))
    return alldata
Example #6
def GetDataDistributions(dataPath='../LinkAnalyticsData/UTK_problem/'):
    """ Finds the distribution of the cumulative node and edge weights """
    filenames = ('Moria_1.graph', 'Standelf_1.graph')
    attrs = ('calls', 'texts', 'degree', 'secs')
    for f in filenames:
        # Reading in the Graph
        MG = GU.readData(os.path.join(dataPath,f))
        g = GU.ConvertToSingle(MG)
        for attr in attrs:
            x = [g.node[n][attr] for n in g.nodes()]

            # Plotting the Data
            largest = heapq.nlargest(3, x)
            pyplot.figure()
            pyplot.hist(x, bins=np.logspace(1, np.log2(largest[2]), 25, base=2))
            pyplot.ylabel("Frequency")
            pyplot.xlabel(attr)
            name = f.split('.')[0].split('_')[0]
            title = name + " " + attr + " distribution"
            pyplot.title(title)
            pyplot.savefig(name + "_" + attr + "_cum_distribution.png")
Example #7
def analyze(dirname, directory):
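    """Walk `directory`, lightly tokenize every .py file to collect classes,
    functions, and imports, build caller/callee graphs, write the results to
    Excel workbooks, and cluster the call paths. Returns the file tree root
    and the path similarity matrix."""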
    start = TimeUtility.start()
    anu = an.Abbreviations()
    rootNode = Node(dirname)
    file_paths = []  # List which will store all of the full file paths.
    fileFxnDictionary = {}
    fileImportDictionary = {}
    fileClassDictionary = {}
    callerFxnArgumentsDictionary = {}
    callerCalleeFxn = {}
    calleFxnArguments = {}
    fileFxnCount = {}
    fileImportCount = {}
    fileClassCount = {}
    uniqueImports = []
    callerCalleePath = []
    fxnGraph = nx.DiGraph()
    fxnGraphFull = nx.DiGraph()

    fxnList = []
    classList = []
    importList = []

    for root, directories, files in os.walk(directory):  # Walk the tree.
        for filename in files:
            # Join root and file name to form the full file path.
            filepath = root.replace("\\", "/") + "/" + filename
            file_paths.append(filepath)  # Add it to the list.
            if not filepath.endswith(".py"):
                continue
            localFileNode = Node(filepath.replace(directory, ""), parent=rootNode)
            file = open(filepath, "r", encoding="utf8")
            variable = []
            functionName = []
            className = []
            importModules = []
            isNextWordfxn = False
            isNextWordclass = False
            isNextWordImport = False
            isNewWord = False
            classChildren = False
            fxnChildren = False
            mainFxnNode = None
            clsNode = None

            for word in file.read().split():
                if stopWordsRemoval.isStopWord(word) or len(word) <= 2:
                    continue
                if isNextWordclass:
                    if "(" in word:
                        className.append(word)
                        classChildren = True
                        clsNode = Node("class:" + word, parent=localFileNode)
                    isNextWordclass = False
                elif isNextWordfxn:
                    print("fxnname", word)
                    if "(" in word:
                        arg = word.split("(")[1]
                        word = word.split("(")[0]

                    functionName.append(word.lower())
                    isNextWordfxn = False

                    if classChildren and not fxnChildren:
                        mainFxnNode = Node("Fxn:" + word, parent=clsNode)
                    elif fxnChildren and mainFxnNode is not None:
                        fxnNode = Node("Fxn:" + word, parent=mainFxnNode)
                        # Record this nested function under its enclosing one.
                        callerCalleeFxn.setdefault(mainFxnNode.name, []).append(word)
                    else:
                        fxnNode = Node("Fxn:" + word, parent=localFileNode)
                    fxnChildren = True
                elif isNextWordImport:
                    importModules.append(word)
                    isNextWordImport = False
                    importNode = Node("Import:" + word, parent=localFileNode)

                if word == "def":
                    isNextWordfxn = True
                    isNewWord = True
                    fxnChildren = False
                elif word == "class":
                    isNextWordclass = True
                    isNewWord = True
                    fxnChildren = False
                elif word == "import":
                    isNextWordImport = True
                    isNewWord = True
                elif checkWordForValidFunction(word) and fxnChildren:
                    print("got new function lets see::::", word)
                    if fxnChildren and mainFxnNode is not None:
                        fxnNode = Node("Fxn:" + word, parent=mainFxnNode)
                        callerCalleeFxn[mainFxnNode.name] = [word]
                        fxnGraph.add_edge(anu.get(su.sanitize(mainFxnNode.name)),
                                          anu.get(su.sanitize(word)))
                        fxnGraphFull.add_edge(su.sanitize(mainFxnNode.name),
                                              su.sanitize(word))


            file.close()

            if len(functionName) != 0:
                fileFxnDictionary.update({filepath.replace(directory, ""): set(functionName)})
                fileFxnCount[filepath.replace(directory, "")] = len(set(functionName))
            if len(className) != 0:
                fileClassDictionary.update({filepath.replace(directory, ""): set(className)})
                fileClassCount[filepath.replace(directory, "")] = len(set(className))
            if len(importModules) != 0:
                fileImportDictionary.update({filepath.replace(directory, ""): set(importModules)})
                fileImportCount[filepath.replace(directory, "")] = len(set(importModules))
                uniqueImports.append(importModules)

    workbook = xlsxwriter.Workbook(dirname + "data" + ".xlsx")
    workbook1 = xlsxwriter.Workbook(dirname + "function" + ".xlsx")
    workbook2 = xlsxwriter.Workbook(dirname + "count" + ".xlsx")

    ExcelUtility.writeToExcel(callerCalleeFxn, "CallerCalleFxn", workbook1)
    ExcelUtility.writeToExcel(anu.shortNames, "FxnAbbre.", workbook1)
    ExcelUtility.writeToExcel(fileFxnDictionary, "functionInfo", workbook)
    ExcelUtility.writeToExcel(fileClassDictionary, "classInfo", workbook)
    ExcelUtility.writeToExcel(fileImportDictionary, "importInfo", workbook)
    ExcelUtility.writeToExcelCount(fileFxnCount, "fxncount", workbook2)
    ExcelUtility.writeToExcelCount(fileImportCount, "importcount", workbook2)
    ExcelUtility.writeToExcelCount(fileClassCount, "classcount", workbook2)
    # xlsxwriter workbooks must be closed for the files to be written.
    workbook.close()
    workbook1.close()
    workbook2.close()
    dumpclean(callerCalleeFxn)
    print("tree:")
    PrintUtility.printTree(rootNode)
    print("Unique Imports are:", len(uniqueImports))

    gu.getAllPaths(fxnGraph, True, dirname)
    gu.getAllPathsWithoutAbbreviations(fxnGraphFull, True, dirname)
    # Visualize the paths and collect every optimized path of nodes, i.e.
    # which function calls which function and so on (path1: f1 f2 f3 f4 f5
    # f6 f7); the results are saved to an Excel file.
    filename = dirname + 'pathsoptimized.xlsx'
    all_paths = gu.getAllOptimizedPaths(fxnGraph, True, filename, dirname)
    gu.getAllOptimizedPathsWithoutAbbreviations(fxnGraphFull, True,
                                                dirname + "pathsoptimizedNoAbbre.xlsx",
                                                dirname)
    print("anu.counter:", anu.counter)
    callerMatrix = ceu.encodeValues(all_paths, anu.counter, dirname)
    pathMatrix = cs.getResultSimilarityMatrix(callerMatrix, dirname + "similarity.csv")
    cluster.test(dirname, pathMatrix)
    TimeUtility.end(start)

    return rootNode, pathMatrix
Example #8
def credit_experiment():
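    """Compare a single decision tree, AdaBoost, bagged trees, and a random
    forest on the credit-card default data, using a random 6000-instance
    test split, and plot each ensemble's train/test error against the
    single tree's."""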

    file_path = "/home/john/PycharmProjects/u1201441_Private_Repository/CS6350_Files/HW2/credit/default of credit card clients.csv"

    numeric_cols = [0, 4, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
    missing_identifier = None
    training_data = []
    test_data = []

    data = ID3.data_parsing(file_path, numeric_cols)

    LABEL_INDEX = len(data[0]) - 2

    for instance in data:
        if instance[LABEL_INDEX] == '1':
            instance[LABEL_INDEX] = "yes"
        else:
            instance[LABEL_INDEX] = "no"

    # Shuffle the instance order, then send the first 6000 shuffled
    # instances to the test set and the rest to training.
    shuffled_indices = random.sample(range(len(data)), len(data))
    for position, i in enumerate(shuffled_indices):
        if position < 6000:
            test_data.append(data[i])
        else:
            training_data.append(data[i])

    iterations = 100

    decision_tree = ID3.build_decision_tree(
        training_data,
        max_depth=-1,
        info_gain_type=1,
        numeric_cols=numeric_cols,
        missing_identifier=missing_identifier)
    adaboost = AdaBoost.ada_boost(training_data,
                                  iterations=iterations,
                                  numeric_cols=numeric_cols,
                                  missing_identifier=missing_identifier)
    bagged_tree = BaggedTrees.bagged_trees(
        training_data,
        iterations=iterations,
        sample_size=100,
        numeric_cols=numeric_cols,
        missing_identifier=missing_identifier)
    forest = RandomForest.random_forest(training_data,
                                        iterations=iterations,
                                        sample_size=100,
                                        numeric_cols=numeric_cols,
                                        missing_identifier=missing_identifier,
                                        feature_size=4)

    # Decision Tree results

    tree_results = ID3.test_tree(decision_tree, training_data, numeric_cols,
                                 missing_identifier)
    tree_train = 1 - tree_results[0] / tree_results[1]
    tree_results = ID3.test_tree(decision_tree, test_data, numeric_cols,
                                 missing_identifier)
    tree_test = 1 - tree_results[0] / tree_results[1]

    # Constant lines so the single tree's error can be overlaid on each
    # ensemble curve.
    tree_train_ln = [tree_train] * iterations
    tree_test_ln = [tree_test] * iterations

    # AdaBoost results
    ada_results_train = AdaBoost.test_ada_boost_hypothesis(
        adaboost, training_data, numeric_cols, missing_identifier)
    ada_results_test = AdaBoost.test_ada_boost_hypothesis(
        adaboost, test_data, numeric_cols, missing_identifier)

    ada_train = []
    ada_test = []

    for t in range(iterations):
        ada_train.append(1 - ada_results_train[t][0] / ada_results_train[t][1])
        ada_test.append(1 - ada_results_test[t][0] / ada_results_test[t][1])

    ada_graph = [
        (ada_train, "AdaBoost Train"),
        (ada_test, "AdaBoost Test"),
        (tree_train_ln, "Tree Train"),
        (tree_test_ln, "Tree Test")
    ]

    GraphUtility.graph(ada_graph, "AdaBoost Data", "Iterations", "Error")

    # Bagging results
    results_train = BaggedTrees.test_bagged_tree_hypothesis(
        bagged_tree, training_data, numeric_cols, missing_identifier)
    results_test = BaggedTrees.test_bagged_tree_hypothesis(
        bagged_tree, test_data, numeric_cols, missing_identifier)

    # Charts
    bag_train = []
    bag_test = []

    for t in range(iterations):
        bag_train.append(1 - results_train[t][0] / results_train[t][1])
        bag_test.append(1 - results_test[t][0] / results_test[t][1])

    bag_graph = [
        (bag_train, "Bagging Train"),
        (bag_test, "Bagging Test"),
        (tree_train_ln, "Tree Train"),
        (tree_test_ln, "Tree Test")
    ]
    GraphUtility.graph(bag_graph, "Bagged Tree Data", "Num Trees", "Error")

    # Forest Results
    results_train = RandomForest.test_random_forest_hypothesis(
        forest, training_data, numeric_cols, missing_identifier)
    results_test = RandomForest.test_random_forest_hypothesis(
        forest, test_data, numeric_cols, missing_identifier)

    # Charts
    forest_train = []
    forest_test = []
    for t in range(iterations):
        forest_train.append(1 - results_train[t][0] / results_train[t][1])
        forest_test.append(1 - results_test[t][0] / results_test[t][1])

    forest_graph = [
        (forest_train, "Forest Train - 4 features"),
        (forest_test, "Forest Test - 4 features"),
        (tree_train_ln, "Tree Train"),
        (tree_test_ln, "Tree Test")
    ]
    GraphUtility.graph(forest_graph, "Random Forest Data", "Num Trees",
                       "Error")
Example #9
def random_forest_experiment():
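    """Train random forests with 2, 4, and 6 features per split and plot
    their train/test error, then estimate the bias and variance of single
    trees versus full forests over repeated 1000-instance samples."""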
    examples = ID3.data_parsing(FILE_PATH_TRAIN, numeric_cols)

    LABEL_INDEX = len(examples[0]) - 2

    iterations = 100
    sample_size = int(len(examples) / 4)

    for feature_size in [2, 4, 6]:
        hypothesis = RandomForest.random_forest(examples, iterations,
                                                sample_size, numeric_cols,
                                                missing_identifier,
                                                feature_size)
        results_train = RandomForest.test_random_forest_hypothesis(
            hypothesis, FILE_PATH_TRAIN, numeric_cols, missing_identifier)
        results_test = RandomForest.test_random_forest_hypothesis(
            hypothesis, FILE_PATH_TEST, numeric_cols, missing_identifier)

        # Charts
        forest_train = []
        forest_test = []
        for t in range(iterations):
            forest_train.append(1 - results_train[t][0] / results_train[t][1])
            forest_test.append(1 - results_test[t][0] / results_test[t][1])

        forest_graph = [
            (forest_train,
             "Forest Train - " + str(feature_size) + " features"),
            (forest_test,
             "Forest Test - " + str(feature_size) + " features")
        ]
        GraphUtility.graph(forest_graph, "Random Forest Data", "Num Trees",
                           "Error")

    # Bias/Variance
    iterations = 100
    forest = []
    trees = []
    sample_size = 100
    feature_size = 2

    for t in range(iterations):
        data = random.sample(examples, 1000)
        forest.append(
            RandomForest.random_forest(data, iterations, sample_size,
                                       numeric_cols, missing_identifier,
                                       feature_size))
        trees.append(forest[t][0][0])  # keep the first tree of the t-th forest

    # Bias/Variance of individual trees.
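    # Per instance: the tree predictions are averaged; squared bias is
    # (average prediction - true label)^2 and variance is the spread of the
    # predictions. Both are then averaged over all training instances.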
    biases = []
    variances = []
    for instance in examples:
        labels = []  # predictions for this instance only
        for tree in trees:
            label = ID3.get_label(tree, instance, numeric_cols,
                                  missing_identifier)
            labels.append(1 if label == "yes" else -1)

        if instance[LABEL_INDEX] == "yes":
            true_label = 1
        else:
            true_label = -1

        avg = numpy.average(labels)
        biases.append((avg - true_label)**2)
        variances.append(numpy.var(labels))

    tree_bias = numpy.average(biases)
    tree_variance = numpy.average(variances)

    # Bias/Variance of the full forests.
    biases = []
    variances = []
    for instance in examples:
        labels = []  # predictions for this instance only
        for hypothesis in forest:
            label = RandomForest.get_label(hypothesis, instance, numeric_cols,
                                           missing_identifier)
            labels.append(1 if label == "yes" else -1)

        true_label = 1 if instance[LABEL_INDEX] == "yes" else -1

        avg = numpy.average(labels)
        biases.append((avg - true_label)**2)
        variances.append(numpy.var(labels))

    forest_bias = numpy.average(biases)
    forest_variance = numpy.average(variances)

    print("Tree Bias:", "{0:.3}".format(tree_bias), "Tree Variance:",
          "{0:.3}".format(tree_variance))
    print("Forest Bias:", "{0:.3}".format(forest_bias), "Forest Variance:",
          "{0:.3}".format(forest_variance))
Example #10
def bagged_trees_experiment():
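    """Train bagged trees and plot train/test error against the number of
    trees, then estimate the bias and variance of single trees versus bagged
    ensembles over repeated 1000-instance samples."""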
    examples = ID3.data_parsing(FILE_PATH_TRAIN, numeric_cols)

    LABEL_INDEX = len(examples[0]) - 2

    iterations = 100
    sample_size = int(len(examples) / 2)

    hypothesis = BaggedTrees.bagged_trees(examples, iterations, sample_size,
                                          numeric_cols, missing_identifier)
    results_train = BaggedTrees.test_bagged_tree_hypothesis(
        hypothesis, FILE_PATH_TRAIN, numeric_cols, missing_identifier)
    results_test = BaggedTrees.test_bagged_tree_hypothesis(
        hypothesis, FILE_PATH_TEST, numeric_cols, missing_identifier)

    # Charts
    bag_train = []
    bag_test = []

    for t in range(iterations):
        bag_train.append(1 - results_train[t][0] / results_train[t][1])
        bag_test.append(1 - results_test[t][0] / results_test[t][1])

    bag_graph = [
        (bag_train, "Bagging Train"),
        (bag_test, "Bagging Test")
    ]
    GraphUtility.graph(bag_graph, "Bagged Tree Data", "Num Trees", "Error")

    # Bias/Variance calculations.
    iterations = 100
    bagged_trees = []
    trees = []
    sample_size = 100

    for t in range(iterations):
        data = random.sample(examples, 1000)
        bagged_trees.append(
            BaggedTrees.bagged_trees(data, iterations, sample_size,
                                     numeric_cols, missing_identifier))
        trees.append(bagged_trees[t][0][0])  # first tree of the t-th ensemble

    # Bias/Variance of individual trees.
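    # Same per-instance decomposition as in the random forest experiment:
    # squared bias (mean prediction minus true label, squared) and variance
    # of the tree predictions, averaged over all instances.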
    biases = []
    variances = []
    for instance in examples:
        labels = []  # predictions for this instance only
        for tree in trees:
            label = ID3.get_label(tree, instance, numeric_cols,
                                  missing_identifier)
            labels.append(1 if label == "yes" else -1)

        if instance[LABEL_INDEX] == "yes":
            true_label = 1
        else:
            true_label = -1

        avg = numpy.average(labels)
        biases.append((avg - true_label)**2)
        variances.append(numpy.var(labels))

    tree_bias = numpy.average(biases)
    tree_variance = numpy.average(variances)

    # Bias/Variance of bagged trees.
    biases = []
    variances = []
    for instance in examples:
        labels = []  # predictions for this instance only
        for hypothesis in bagged_trees:
            label = BaggedTrees.get_label(hypothesis, instance, numeric_cols,
                                          missing_identifier)
            labels.append(1 if label == "yes" else -1)

        true_label = 1 if instance[LABEL_INDEX] == "yes" else -1

        avg = numpy.average(labels)
        biases.append((avg - true_label)**2)
        variances.append(numpy.var(labels))

    bag_bias = numpy.average(biases)
    bag_variance = numpy.average(variances)

    print("Tree Bias:", "{0:.3}".format(tree_bias), "Tree Variance:",
          "{0:.3}".format(tree_variance))
    print("Bagged Bias:", "{0:.3}".format(bag_bias), "Bagged Variance:",
          "{0:.3}".format(bag_variance))
Example #11
import networkx as nx
import StringUtility as su
import AbbreviatedNamesUtility as AN
import GraphUtility as gu
import OneHotEncodingUtility as heu
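
# Smoke test for the graph utilities: build a toy call graph with
# abbreviated, sanitized node names, render it to DOT and PNG, then
# enumerate and label-encode its paths.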

fxnGraph = nx.DiGraph()
anu = AN.Abbreviations()
fxnGraph.add_edge(anu.get(su.sanitize("abc")), anu.get(su.sanitize("def")))
fxnGraph.add_edge(anu.get(su.sanitize("def")), anu.get(su.sanitize("ghj")))

fxnGraph.add_edge(anu.get(su.sanitize("asd")), anu.get(su.sanitize("dds")))

fxnGraph.add_edge(anu.get(su.sanitize("sas")), anu.get(su.sanitize("sada")))

fxnGraph.add_edge(anu.get(su.sanitize("sas")), anu.get(su.sanitize("asdx")))

gu.visualize_to_dot(fxnGraph, "test2.dot")

gu.visualize_to_png("test2.dot", "test2.png")

all_paths = gu.getAllPaths(fxnGraph, True)
#all_paths=gu.getAllOptimizedPaths(fxnGraph,True)

print("List size:", len(all_paths))
for sublist in all_paths:
    print(sublist)
    heu.getLabelEncoder(sublist)