Ejemplo n.º 1
0
Archivo: core.py Proyecto: itaicaspi/AI
def get_ad_dataset(noise=0.3):
    """Load the Internet-ads dataset and split it into noisy CV folds.

    Reads 'ad-dataset/ad.data', maps the textual class column to a binary
    label, keeps only the project-selected feature columns (plus the label),
    and hands the integer rows to get_noisy_folds.
    """
    raw = np.genfromtxt('ad-dataset/ad.data', delimiter=',', dtype=str)
    # Encode the class labels in place: 'ad.' -> 1, 'nonad.' -> 0
    # (still stored as strings until the astype below).
    raw[raw == 'ad.'] = 1
    raw[raw == 'nonad.'] = 0
    columns = get_ads_features(201239480, 302629605)
    # Keep the label column (last column) alongside the chosen features.
    columns.append(raw.shape[1] - 1)
    samples = raw[:, columns].astype(int).tolist()
    return get_noisy_folds(samples, noise)
Ejemplo n.º 2
0
Archivo: core.py Proyecto: itaicaspi/AI
def get_har_dataset(noise=0.3):
    """Load the UCI HAR training split and return noisy CV folds.

    Activity labels 1-3 are collapsed to 1 ("moving") and 4-6 to 0
    ("resting"); the binary label is appended as a final column to the
    feature matrix before folding.
    """
    features = np.genfromtxt('UCI HAR Dataset/train/X_train.txt', dtype=float)
    labels = np.genfromtxt('UCI HAR Dataset/train/y_train.txt', dtype=int)
    # Binarize the six activity codes into moving (1) vs. resting (0).
    labels[labels <= 3] = 1
    labels[labels >= 4] = 0
    # Attach the labels as the last column of each sample row.
    combined = np.concatenate((features, labels.reshape(-1, 1)), axis=1)
    return get_noisy_folds(combined.tolist(), noise)
Ejemplo n.º 3
0
def check():
    """Run 10-fold cross-validation on the ads dataset with noisy training.

    Parses 'ad.data', keeps the project-selected feature columns plus a
    binary label (0 for 'nonad.', 1 for 'ad.'), splits the rows into
    clean/noisy folds, pickles both splits, then for each fold trains an
    entropy decision tree on the other nine NOISY folds and scores it on
    the CLEAN held-out fold.

    Returns:
        float: mean accuracy over the 10 folds.
    """
    feature_list = get_ads_features(313542516, 208346379)
    data = []
    # `with` guarantees the file is closed even if parsing raises
    # (the original open/close pair leaked the handle on error).
    with open("ad.data", 'r') as file:
        for line in file:
            out = line.split(",")
            item = [out[index] for index in feature_list]
            # Last CSV field carries the class name; encode it as "0"/"1".
            item.append(str(0 if "nonad." in out[-1] else 1))
            data.append(item)
    noisyfolds, folds = get_noisy_folds(data)
    # Persist both splits so later experiments can reuse the exact folds.
    with open("folds.pkl", 'wb') as output:
        pickle.dump(folds, output)
    with open("noisyfolds.pkl", 'wb') as output1:
        pickle.dump(noisyfolds, output1)
    sumacc = 0
    for i in range(10):
        # Training set: every noisy fold except the held-out fold i.
        train = []
        for j in range(10):
            if i != j:
                train.extend(noisyfolds[j])
        train_samples = [sample[:-1] for sample in train]
        train_labels = [sample[-1] for sample in train]
        tree = DecisionTreeClassifier(criterion="entropy", splitter="best",
                                      min_samples_split=4)
        tree = tree.fit(train_samples, train_labels)
        # Evaluate on the clean (noise-free) version of fold i.
        predict_samples = [row[:-1] for row in folds[i]]
        predict_results = [row[-1] for row in folds[i]]
        sumacc += tree.score(predict_samples, predict_results)
    return sumacc / 10
Ejemplo n.º 4
0
        current_accuracy /= float(len(fold_semi[k]))
        accuracy += current_accuracy
    accuracy /= float(len(noisy_fold_semi))
    print('committee semi-random: subset:{} | size: {} | acc: {}'.format(
            'examples' if is_subset_of_examples else 'features', committee_size, accuracy))


def all_semi_random_sub_examples(noisy_fold_semi, fold_semi, features):
    """Evaluate example-subset semi-random committees at every configured size."""
    for committee_size in sizes:
        # True -> each committee member sees a subset of the EXAMPLES.
        calculate_semi_random_committee(
            noisy_fold_semi, fold_semi, features, committee_size, True)


def all_semi_random_sub_features(noisy_fold_semi, fold_semi, features):
    """Evaluate feature-subset semi-random committees at every configured size."""
    for committee_size in sizes:
        # False -> each committee member sees a subset of the FEATURES.
        calculate_semi_random_committee(
            noisy_fold_semi, fold_semi, features, committee_size, False)


if __name__ == '__main__':
    # One-time setup: build labeled samples and split them into folds.
    x, y = extract_data_from_ads()
    labeled = copy.deepcopy(x)
    # Append each sample's label as its final column.
    for sample, label in zip(labeled, y):
        sample.append(label)
    noisy_folds, folds = noise.get_noisy_folds(labeled)

    # Arye's experiment: committees over feature subsets.
    all_semi_random_sub_features(
        noisy_folds, folds, list(range(len(x[0]) - 1)))
    # Max's experiment: committees over example subsets.
    all_semi_random_sub_examples(
        noisy_folds, folds, list(range(len(x[0]) - 1)))