Ejemplo n.º 1
0
Archivo: core.py Proyecto: itaicaspi/AI
def get_ad_dataset(noise=0.3):
    """Load the ad dataset, keep the selected columns, and split it into folds.

    The CSV at ``ad-dataset/ad.data`` is read as strings, the textual class
    labels ``'ad.'`` / ``'nonad.'`` are replaced by 1 / 0, and only the
    columns returned by ``get_ads_features`` plus the last column are kept.
    The retained values are converted to ``int`` and handed to
    ``get_noisy_folds`` as a list of rows.

    Args:
        noise: Forwarded unchanged as the second argument of
            ``get_noisy_folds``.

    Returns:
        Whatever ``get_noisy_folds(rows, noise)`` returns.
    """
    raw = np.genfromtxt('ad-dataset/ad.data', delimiter=',', dtype=str)

    # The array holds strings, so the numeric codes are stored as '1' / '0'
    # and only become integers in the astype(int) conversion below.
    for label_text, label_code in (('ad.', 1), ('nonad.', 0)):
        raw[raw == label_text] = label_code

    columns = get_ads_features(201239480, 302629605)
    # Also keep the last column of the file alongside the selected features.
    last_column = raw.shape[1] - 1
    columns += [last_column]

    rows = raw[:, columns].astype(int).tolist()
    return get_noisy_folds(rows, noise)
Ejemplo n.º 2
0
def extract_data_from_ads():
    """Read ``ads/ad.data`` and split it into feature rows and binary labels.

    Each line of the file is split on commas. The fields at the positions
    returned by ``get_features.get_ads_features`` form one feature row, and
    the label is 1 when ``current_sample_labeled_ad`` is truthy for the split
    line, otherwise 0.

    Returns:
        A tuple ``(x_val, y_val)``: ``x_val`` is a list of feature rows (each
        a list of the raw string fields) and ``y_val`` is the parallel list
        of 0/1 labels.
    """
    indices = get_features.get_ads_features(317390805, 317390789)

    x_val = []
    y_val = []

    # Use a context manager so the file handle is always released; the
    # previous version opened the file and never closed it.
    with open('ads/ad.data') as data_file:
        for line in data_file:
            list_line = line.split(',')
            x_val.append([list_line[i] for i in indices])
            y_val.append(1 if current_sample_labeled_ad(list_line) else 0)

    return x_val, y_val
Ejemplo n.º 3
0
def check():
    """Cross-validate a decision tree on the ad data and return mean accuracy.

    Steps:
      1. Read ``ad.data``, keep the fields selected by ``get_ads_features``
         and append a string label (``"0"`` if the last field contains
         ``"nonad."``, otherwise ``"1"``).
      2. Split the rows with ``get_noisy_folds`` and pickle both results to
         ``folds.pkl`` and ``noisyfolds.pkl`` in the working directory.
      3. For each of the 10 folds, train an entropy-based
         ``DecisionTreeClassifier`` on the other nine *noisy* folds and score
         it on the corresponding fold from ``folds``.

    Returns:
        The average of the 10 per-fold ``score`` values.
    """
    num_folds = 10
    featureList = get_ads_features(313542516, 208346379)

    data = []
    # Context managers guarantee the handles are closed even if parsing or
    # pickling raises; the file is streamed rather than loaded via readlines().
    with open("ad.data", 'r') as data_file:
        for line in data_file:
            out = line.split(",")
            item = [out[index] for index in featureList]
            item.append(str(0 if "nonad." in out[-1] else 1))
            data.append(item)

    noisyfolds, folds = get_noisy_folds(data)

    with open("folds.pkl", 'wb') as folds_out, \
            open("noisyfolds.pkl", 'wb') as noisy_out:
        pickle.dump(folds, folds_out)
        pickle.dump(noisyfolds, noisy_out)

    sumacc = 0
    for i in range(num_folds):
        # Training set: every noisy fold except the one held out for scoring.
        train = []
        for j in range(num_folds):
            if i != j:
                train.extend(noisyfolds[j])

        # The label is the last element of every sample.
        train_samples = [sample[:-1] for sample in train]
        train_labels = [sample[-1] for sample in train]

        tree = DecisionTreeClassifier(criterion="entropy", splitter="best",
                                      min_samples_split=4)
        tree = tree.fit(train_samples, train_labels)

        # Score on the held-out fold taken from `folds`, not `noisyfolds`.
        predict_samples = [sample[:-1] for sample in folds[i]]
        predict_labels = [sample[-1] for sample in folds[i]]
        sumacc += tree.score(predict_samples, predict_labels)

    return sumacc / num_folds