Example #1
def run(cross=True, verbose=False, xml="../xml/RTE2_dev.xml", pre_processed_xml="../xml/RTE2_dev.preprocessed.xml"):
    learning_data = "learningdata.tab"  # feature table produced by an earlier run of features.run
    filename = "results_part3.txt"
    clean_file(filename)
    if cross: features.run(xml, pre_processed_xml)  # extract the features
    data = orange.ExampleTable(learning_data)
    l = orange.BayesLearner(data)
    if cross:
        if verbose:
            print "result: ", validation(data)
            for item in data:
                if item.getclass() != l(item):  # misclassified: highlight in red
                    print '\033[1;41m'
                    print item, l(item),
                    print '\033[1;m'  # reset the terminal colour
                    print
                else:
                    print item, l(item)
        else:
            print "result: ", validation(data)
    else:
        # open() raises IOError on failure rather than returning None, so the
        # old "if file: ... else: print error" branch was dead code; a with
        # block also guarantees the file is closed.
        with open(filename, "a") as results:
            results.write("ranked: no\n")
            for item in data:
                s = str(item['id']) + " " + str(l(item))
                results.write(s + "\n")
        print "finished writing to results_part3"

#run(True, False, "../xml/blind-test-data.xml") # run learning on the blind test data
#run(False) # write the predictions to the results file
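Example #1 also calls a validation helper that is not shown. A plausible sketch using the old Orange 2.x orngTest/orngStat helpers (this reconstruction is an assumption; only the name validation comes from the original):

import orange, orngTest, orngStat

def validation(data, folds=10):
    # Hypothetical helper: k-fold cross-validation of a naive Bayes learner,
    # returning the classification accuracy.
    results = orngTest.crossValidation([orange.BayesLearner()], data, folds=folds)
    return orngStat.CA(results)[0]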
Example #2
    if cfg.quick:
        g.run(cfg.name, quick=True)
    else:
        g.run(cfg.name)

if cfg.roc:
    import roccurves as rc
    if cfg.quick:
        rc.run(cfg.name + str(cfg.maxdepth), quick=True)
    else:
        rc.run(cfg.name + str(cfg.maxdepth))

if cfg.features:
    import features as f
    if cfg.quick:
        f.run(cfg.name + str(cfg.maxdepth), quick=True)
    else:
        f.run(cfg.name + str(cfg.maxdepth))

if cfg.checksignal:
    import checksignal as cs
    if cfg.quick:
        cs.run(cfg.name + str(cfg.maxdepth), quick=True)
    else:
        cs.run(cfg.name + str(cfg.maxdepth))

if cfg.crossvalidation:
    import crossvalidation as cv
    if cfg.quick:
        cv.run(cfg.name + str(cfg.maxdepth), quick=True)
    else:
        cv.run(cfg.name + str(cfg.maxdepth))
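Example #2 dispatches on boolean flags of a cfg object that the snippet never defines. A minimal sketch of such a config (the attribute names come from the snippet; the argparse setup and defaults are assumptions):

import argparse

# Hypothetical config object: one boolean flag per analysis stage, mirroring
# the cfg.* attributes the dispatch code reads.
parser = argparse.ArgumentParser()
parser.add_argument("--name", default="run")
parser.add_argument("--maxdepth", type=int, default=5)
parser.add_argument("--quick", action="store_true")
parser.add_argument("--roc", action="store_true")
parser.add_argument("--features", action="store_true")
parser.add_argument("--checksignal", action="store_true")
parser.add_argument("--crossvalidation", action="store_true")
cfg = parser.parse_args()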
Example #3

import random

import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

import features  # project-local module that provides features.run

def single_run(data,
               split_rate,
               threshold=0,
               first_N=300,
               window_size=50,
               with_pca=False,
               n_components=20):

    ds = data['ds']
    us = data['us']
    random.shuffle(ds)
    random.shuffle(us)
    data['ds'] = ds
    data['us'] = us

    total_set = []
    total_label = []

    flag_f1 = False
    flag_f2 = False

    # Walk both classes in parallel; each flag flips once the index runs past
    # the end of its list, and the loop stops when both classes are exhausted.
    for i in range(1000):

        try:
            if len(data['ds'][i][1]) > threshold:
                if len(data['ds'][i][3]) > threshold:
                    total_set.append([
                        features.run("all", data['ds'][i][1], window_size,
                                     first_N),
                        features.run("all", data['ds'][i][3], window_size,
                                     first_N)
                    ])
                    total_label.append(1)
        except IndexError:  # ran past the end of data['ds']
            flag_f1 = True

        try:
            if len(data['us'][i][1]) > threshold:
                if len(data['us'][i][3]) > threshold:
                    total_set.append([
                        features.run("all", data['us'][i][1], window_size,
                                     first_N),
                        features.run("all", data['us'][i][3], window_size,
                                     first_N)
                    ])
                    total_label.append(0)
        except IndexError:  # ran past the end of data['us']
            flag_f2 = True

        if flag_f1 and flag_f2:
            break

    # train/test split
    train = total_set[:int(split_rate * len(total_set))]
    train_label = total_label[:int(split_rate * len(total_set))]
    test = total_set[int(split_rate * len(total_set)):]
    test_label = total_label[int(split_rate * len(total_set)):]

    # flatten each sample into a single 1-D feature vector
    train = np.reshape(
        np.asarray(train),
        (len(train), len(train[0]) * len(train[0][0]) * len(train[0][0][0])))
    test = np.reshape(
        np.asarray(test),
        (len(test), len(test[0]) * len(test[0][0]) * len(test[0][0][0])))

    if with_pca:
        pca = PCA(n_components=n_components, svd_solver="arpack")
        pca.fit(train)
        train = pca.transform(train)
        test = pca.transform(test)

    print("SVM training started...")
    svm = SVC(kernel='linear')
    svm.fit(np.asarray(train), np.asarray(train_label))

    print("SVM prediction started...")
    predictions = svm.predict(np.asarray(test))
    accuracy = accuracy_score(test_label, predictions)
    print("Accuracy: " + str(accuracy))
    print("Train split size: " + str(len(train)))
    print("Test split size: " + str(len(test)))
Example #4

import random

import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

import features  # project-local module that provides features.run

def k_fold(data,
           K,
           threshold=0,
           first_N=300,
           window_size=50,
           with_pca=False,
           n_components=20):
    test_split_rate = 1 / K  # fraction of the data held out as the test fold
    accuracies = []

    ds = data['ds']
    us = data['us']
    random.shuffle(ds)
    random.shuffle(us)
    data['ds'] = ds
    data['us'] = us

    total_set = []
    total_label = []

    flag_f1 = False
    flag_f2 = False

    # Try to add every sample from both classes; once both flags are set,
    # all available data has been collected.
    for i in range(1000):

        try:
            if len(data['ds'][i][1]) > threshold:
                if len(data['ds'][i][3]) > threshold:
                    total_set.append([
                        features.run("all", data['ds'][i][1], window_size,
                                     first_N),
                        features.run("all", data['ds'][i][3], window_size,
                                     first_N)
                    ])
                    total_label.append(1)
        except IndexError:  # ran past the end of data['ds']
            flag_f1 = True

        try:
            if len(data['us'][i][1]) > threshold:
                if len(data['us'][i][3]) > threshold:
                    total_set.append([
                        features.run("all", data['us'][i][1], window_size,
                                     first_N),
                        features.run("all", data['us'][i][3], window_size,
                                     first_N)
                    ])
                    total_label.append(0)
        except IndexError:  # ran past the end of data['us']
            flag_f2 = True

        if flag_f1 and flag_f2:
            break

    for i in range(K):

        if i + 1 == K:  # last fold: the test split is the tail of the list, so train needs no concatenation
            train = total_set[:int(i * test_split_rate * len(total_set))]
            train_label = total_label[:int(i * test_split_rate *
                                           len(total_set))]

            test = total_set[int(i * test_split_rate * len(total_set)):]
            test_label = total_label[int(i * test_split_rate *
                                         len(total_set)):]

        else:  # the test split is an interior (or leading) slice; train concatenates the pieces on either side
            train = total_set[:int(i * test_split_rate * len(total_set))] + \
              total_set[int((i+1) * test_split_rate * len(total_set)):]

            train_label = total_label[:int(i * test_split_rate * len(total_set))] + \
                 total_label[int((i+1) * test_split_rate * len(total_set)):]

            test = total_set[int(i * test_split_rate *
                                 len(total_set)):int((i + 1) *
                                                     test_split_rate *
                                                     len(total_set))]
            test_label = total_label[int(i * test_split_rate *
                                         len(total_set)):int((i + 1) *
                                                             test_split_rate *
                                                             len(total_set))]

        # flatten each sample into a single 1-D feature vector
        train = np.reshape(np.asarray(train),
                           (len(train), len(train[0]) * len(train[0][0]) *
                            len(train[0][0][0])))
        test = np.reshape(
            np.asarray(test),
            (len(test), len(test[0]) * len(test[0][0]) * len(test[0][0][0])))

        if with_pca:
            pca = PCA(n_components=n_components, svd_solver="arpack")
            pca.fit(train)
            train = pca.transform(train)
            test = pca.transform(test)

        print("SVM training started...")
        svm = SVC(kernel='linear')
        svm.fit(np.asarray(train), np.asarray(train_label))

        print("SVM prediction started...")
        predictions = svm.predict(np.asarray(test))
        accuracy = accuracy_score(test_label, predictions)
        accuracies.append(accuracy)

    print("Accuracies: ", end='')
    print(accuracies)
    print("Average accuracy: ", end='')
    print(sum(accuracies) / len(accuracies))
    print("Train split size: " + str(len(train)))
    print("Test split size: " + str(len(test)))

    return accuracies
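The fold boundaries above are plain proportional slices. A small standalone sketch of the same index arithmetic (the sizes are hypothetical):

# With K = 5 folds over 100 samples, fold i tests on [20*i, 20*(i+1))
# and trains on everything else; the last fold's test slice runs to the end.
K, n = 5, 100
rate = 1 / K
for i in range(K):
    lo = int(i * rate * n)
    hi = n if i + 1 == K else int((i + 1) * rate * n)
    print("fold %d: test [%d:%d), train %d samples" % (i, lo, hi, n - (hi - lo)))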