Code Example #1
File: main.py Project: yinliu13/MarketBasket
def run():
    embedding_wrapper = EmbeddingWrapper('product')
    bc = BasketConstructor('./data/', './data/')
    ub_basket = bc.get_baskets('prior', reconstruct=False)
    _, ub_basket = train_test_split(ub_basket, test_size=0.20, random_state=0)  # keep only a 20% sample of the baskets
    #embedding_wrapper = EmbeddingWrapper('tafeng_products')
    print(ub_basket)

    all_baskets = ub_basket.basket.values
    print(all_baskets)
    # convert every item to a string
    print("nested change")
    all_baskets = nested_change(list(all_baskets), str)
    print("embedding_wrapper.remove_products_wo_embeddings(all_baskets)")
    all_baskets = embedding_wrapper.remove_products_wo_embeddings(all_baskets)
    print("uncommon products")
    all_baskets = remove_products_which_are_uncommon(all_baskets)
    print("short baskets")
    medium_baskets, all_baskets = remove_short_baskets(all_baskets)
    print(medium_baskets , all_baskets)
    print("nested change")
    all_baskets = nested_change(all_baskets, embedding_wrapper.lookup_ind_f)
    print("split_data")
    train_ub, val_ub_input, val_ub_target, test_ub_input, test_ub_target = split_data(all_baskets)
    print('knndtw')
    knndtw = KnnDtw(n_neighbors=[5])
    preds_all, distances = knndtw.predict(train_ub, val_ub_input, embedding_wrapper.basket_dist_EMD, 
                                          embedding_wrapper.basket_dist_REMD)
    print(preds_all)
    print(distances)
    #print("Wasserstein distance", sum(distances)/len(distances))
    return preds_all, distances
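The helper nested_change is not shown in these snippets. A minimal sketch consistent with how it is used here (applying a function such as str or embedding_wrapper.lookup_ind_f to every item in every basket) might look like this; the two-level nesting is an assumption:

def nested_change(baskets, f):
    # assumed shape: a list of baskets, each basket a list of items
    return [[f(item) for item in basket] for basket in baskets]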
Code Example #2
def machine_learning(x_test_final, x_train_final, y_train_final, labels,
                     param_weights, para_weights, param_list, k_value):
    # classify each parameter's dataset with KNN-DTW, then aggregate the
    # per-parameter labels with an (optionally weighted) majority vote
    param_labels = []
    for i in range(len(x_train_final)):
        #Analyze dataset

        x_train_final2 = np.array(x_train_final[i])
        y_train_final2 = np.array(y_train_final[i])
        x_test_final2 = np.array(x_test_final[i])

        m = KnnDtw(n_neighbors=k_value, max_warping_window=100)
        m.fit(x_train_final2, y_train_final2)
        label, proba = m.predict(x_test_final2)
        #get the weight for this parameter
        if param_weights is None:
            param_labels.append(label)  # no weights: keep the raw labels
        else:
            weight = [para_weights[param_list[i]]]
            param_labels.append(list(zip(
                label, weight * len(label))))  # a list of (label, weight) tuples

    param_labels = np.array(param_labels)
    if param_weights is None:
        para_mode, para_count = stats.mode(param_labels)
        para_mode = np.reshape(para_mode, (para_mode.shape[1], ))
    else:  # with weights
        para_mode = [0] * param_labels.shape[1]  # final label for each time series
        for i in range(param_labels.shape[1]):
            mode_count = [0] * len(labels)  # accumulated weight per label for this time series
            col = param_labels[:, i]
            for p in col:
                mode_count[p[0] - 1] += p[1]
            para_mode[i] = mode_count.index(
                max(mode_count)
            ) + 1  # the label that accumulated the most weight becomes the overall label
            #para_mode = np.reshape(para_mode,(para_mode.shape[1],))

    #Using mode to see which classification was the most frequent for each data from all parameters used
    #k_val = list(range(1,11))
    #k_fold_cross_val(k_val,x_train,y_train,6)
    return param_labels, para_mode
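To make the weighted branch above concrete, here is a small self-contained sketch of the same (label, weight) voting scheme on made-up data (three parameters with weights 0.5 / 0.3 / 0.2, four time series, labels 1-3):

import numpy as np

# rows: parameters, columns: time series; each entry is (label, weight)
param_labels = np.array([
    [(1, 0.5), (2, 0.5), (3, 0.5), (1, 0.5)],
    [(2, 0.3), (2, 0.3), (1, 0.3), (1, 0.3)],
    [(1, 0.2), (3, 0.2), (3, 0.2), (2, 0.2)],
])

n_labels = 3
para_mode = [0] * param_labels.shape[1]
for i in range(param_labels.shape[1]):
    mode_count = [0.0] * n_labels  # accumulated weight per label
    for label, weight in param_labels[:, i]:
        mode_count[int(label) - 1] += weight
    para_mode[i] = mode_count.index(max(mode_count)) + 1

print(para_mode)  # [1, 2, 3, 1]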
Code Example #3
File: main.py Project: MathiasKraus/MarketBasket
def run():
    embedding_wrapper = EmbeddingWrapper('product')
    bc = BasketConstructor('./data/', './data/')
    ub_basket = bc.get_baskets('prior', reconstruct=False)

    all_baskets = ub_basket.basket.values
    all_baskets = nested_change(list(all_baskets), str)

    all_baskets = embedding_wrapper.remove_products_wo_embeddings(all_baskets)
    all_baskets = remove_products_which_are_uncommon(all_baskets)
    all_baskets = remove_short_baskets(all_baskets)
    all_baskets = nested_change(all_baskets, embedding_wrapper.lookup_ind_f)

    train_ub, val_ub_input, val_ub_target, test_ub_input, test_ub_target = split_data(
        all_baskets)

    knndtw = KnnDtw(n_neighbors=[5])
    preds_all, distances = knndtw.predict(train_ub, val_ub_input,
                                          embedding_wrapper.basket_dist_EMD,
                                          embedding_wrapper.basket_dist_REMD)
    return preds_all, distances
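split_data is also not shown in these listings. A hypothetical sketch, assuming a simple ordered split into train/validation/test where each held-out user's final basket is the prediction target (the fractions and the hold-out rule are assumptions, not taken from the project):

def split_data(all_baskets, val_frac=0.1, test_frac=0.1):
    n = len(all_baskets)
    n_val, n_test = int(n * val_frac), int(n * test_frac)
    train = all_baskets[:n - n_val - n_test]
    val = all_baskets[n - n_val - n_test:n - n_test]
    test = all_baskets[n - n_test:]
    # inputs are all but the last basket; the last basket is the target
    val_input, val_target = [b[:-1] for b in val], [b[-1] for b in val]
    test_input, test_target = [b[:-1] for b in test], [b[-1] for b in test]
    return train, val_input, val_target, test_input, test_target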
Code Example #4
def param_ranking(param_list, k_val, warp_val, datapath, avg_type):
    start_time = time.time()
    p = []
    r = []
    f = []
    for dataparam in param_list:
        trainingdatafile = datapath + 'train_' + dataparam + '.txt'
        traininglabelfile = datapath + 'train_labels.txt'

        testdatafile = datapath + 'test_' + dataparam + '.txt'
        testlabelfile = datapath + 'test_labels.txt'

        # Open training data file, x:data, y:label
        x_train_file = open(trainingdatafile, 'r')
        y_train_file = open(traininglabelfile, 'r')

        #Open test data file, x:data, y:label
        x_test_file = open(testdatafile, 'r')
        y_test_file = open(testlabelfile, 'r')

        # Create empty lists
        x_train = []
        y_train = []
        x_test = []
        y_test = []

        # Mapping table for classes
        labels = {
            1: 'Hover',
            2: 'Impact (Front Left)',
            3: 'Impact (Front Right)',
            4: 'Impact (Back Left)',
            5: 'Impact (Back Right)',
            6: 'Gust (from Left)',
            7: 'Gust (from Right)',
            8: 'Gust (from front)'
        }

        # Loop through datasets
        for x in x_train_file:
            x_train.append([float(ts) for ts in x.split()])
        for y in y_train_file:
            y_train.append(int(y.rstrip('\n')))

        for x in x_test_file:
            x_test.append([float(ts) for ts in x.split()])

        for y in y_test_file:
            y_test.append(int(y.rstrip('\n')))

        #close data files
        x_train_file.close()
        y_train_file.close()
        x_test_file.close()
        y_test_file.close()

        # Convert to numpy for efficiency

        x_train = np.array(x_train)
        y_train = np.array(y_train)
        x_test = np.array(x_test)
        y_test = np.array(y_test)

        m = KnnDtw(n_neighbors=k_val, max_warping_window=warp_val)
        m.fit(x_train, y_train)
        label, proba = m.predict(x_test)

        # `score` is sklearn's precision_recall_fscore_support (imported elsewhere)
        precision, recall, f_score, _ = score(y_test, label, average=avg_type)
        p.append(precision)
        r.append(recall)
        f.append(f_score)

    precision_rank = sorted(list(zip(param_list, p)), key=lambda x: x[1])
    recall_rank = sorted(list(zip(param_list, r)), key=lambda x: x[1])
    fscore_rank = sorted(list(zip(param_list, f)), key=lambda x: x[1])
    #("Parameter rank by precision is:",precision_rank)
    print('Ranking for k = %s, max warping window = %s' % (k_val, warp_val))
    for rank in precision_rank[::-1]:
        print(rank[0], ": ", rank[1])
    #print("Parameter rank by recall is:",recall_rank)
    #print("Parameter rank by f-score is:",fscore_rank)
    print("--- %s seconds ---" %
          (time.time() - start_time))  #let's see how long this takes...
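A hypothetical invocation (the parameter names, path, and averaging mode below are placeholders, not taken from the project):

param_ranking(param_list=['roll', 'pitch', 'yaw'],
              k_val=3,
              warp_val=100,
              datapath='./data/',
              avg_type='weighted')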
Code Example #5
File: Learn.py Project: CS501ProSCE/DroneProject
if plotdata:
    plt.figure(figsize=(11,7))
    colors = ['#D62728','#2C9F2C','#FD7F23','#1F77B4','#9467BD',
              '#8C564A','#7F7F7F','#1FBECF','#E377C2','#BCBD27']
    for i, r in enumerate([0,1,2,3,4,5,6,7,8,9]):
        plt.subplot(5,2,i+1)
        plt.plot(x_test[r], label=labels[y_test[r]], color=colors[i], linewidth=2)
        plt.xlabel('Samples @50Hz')
        plt.legend(loc='upper left')
        plt.tight_layout()
        

#Analyze dataset
m = KnnDtw(n_neighbors, max_warping_window)
m.fit(x_train,y_train)
label, proba = m.predict(x_test)

#Classification report (sklearn expects the true labels first, then the predictions)
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, label,
                            target_names=list(labels.values())))


#Confusion Matrix (true labels first, predictions second)
conf_mat = confusion_matrix(y_test, label)

fig = plt.figure(figsize=(8,8))
width = np.shape(conf_mat)[1]
height = np.shape(conf_mat)[0]

res = plt.imshow(np.array(conf_mat), cmap=plt.cm.summer, interpolation='nearest')
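The snippet is cut off after the imshow call; the annotation and tick-labelling steps would typically continue as in Code Example #7 below:

for i, row in enumerate(conf_mat):
    for j, c in enumerate(row):
        if c > 0:
            plt.text(j - .2, i + .1, c, fontsize=16)

_ = plt.xticks(range(len(labels)), list(labels.values()), rotation=90)
_ = plt.yticks(range(len(labels)), list(labels.values()))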
Code Example #6
 print("Searching for KNN-DTW...")
 print(" ")
 with open(path, "r") as f:
     reader = csv.DictReader(f)
     freq_list = []
     mass_list = []
     for row in reader:
         freq = float(row['Freq'])
         freq_list.append(freq)
         mass = float(row['Mass'])
         mass_list.append(mass)
 freq_list = [freq_list]
 mass_list = [mass_list]
 freq_list = np.array(freq_list)
 mass_list = np.array(mass_list)
 _, _, freq_knn = freq_model.predict(freq_list)
 _, _, mass_knn = mass_model.predict(mass_list)
 freq_knn = freq_knn[0]
 mass_knn = mass_knn[0]
 print("Freq:", freq_knn)
 print("Mass:", mass_knn)
 print(" ")
 freq_probability = {}
 freq_probability['plastic'] = int((freq_knn == 0).sum()) / 10
 freq_probability['metal'] = int((freq_knn == 1).sum()) / 10
 freq_probability['glass'] = int((freq_knn == 2).sum()) / 10
 #freq_probability['paper'] = int((freq_knn == 3).sum())/10
 mass_probability = {}
 mass_probability['plastic'] = int((mass_knn == 0).sum()) / 10
 mass_probability['metal'] = int((mass_knn == 1).sum()) / 10
 mass_probability['glass'] = int((mass_knn == 2).sum()) / 10
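The snippet stops after building the two per-sensor probability tables. A hypothetical final step (not part of the original) could fuse them by averaging and pick the most likely material:

combined = {
    material: (freq_probability[material] + mass_probability[material]) / 2
    for material in freq_probability
}
prediction = max(combined, key=combined.get)
print("Predicted material:", prediction)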
Code Example #7
def multi_param_learn(param_list, param_weights, k_value, datapath):

    start_time = time.time()
    #this is the list that will store labels returned from each param before aggregating
    param_labels = []
    if param_weights is not None:
        if len(param_list) != len(param_weights):
            raise Exception(
                'When using weights, there must be one weight for each parameter!'
            )
        para_weights = dict(zip(param_list, param_weights))

    para_k = dict(zip(param_list, k_value))

    for dataparam in param_list:

        trainingdatafile = datapath + 'train_' + dataparam + '.txt'
        traininglabelfile = datapath + 'train_labels.txt'

        testdatafile = datapath + 'test_' + dataparam + '.txt'
        testlabelfile = datapath + 'test_labels.txt'

        # Open training data file, x:data, y:label
        x_train_file = open(trainingdatafile, 'r')
        y_train_file = open(traininglabelfile, 'r')

        #Open test data file, x:data, y:label
        x_test_file = open(testdatafile, 'r')
        y_test_file = open(testlabelfile, 'r')

        # Create empty lists
        x_train = []
        y_train = []
        x_test = []
        y_test = []

        # Mapping table for classes
        labels = {
            1: 'Hover',
            2: 'Impact (Front Left)',
            3: 'Impact (Front Right)',
            4: 'Impact (Back Left)',
            5: 'Impact (Back Right)',
            6: 'Gust (from Left)',
            7: 'Gust (from Right)',
            8: 'Gust (from front)'
        }

        # Loop through datasets
        for x in x_train_file:
            x_train.append([float(ts) for ts in x.split()])
        for y in y_train_file:
            y_train.append(int(y.rstrip('\n')))

        for x in x_test_file:
            x_test.append([float(ts) for ts in x.split()])

        for y in y_test_file:
            y_test.append(int(y.rstrip('\n')))

        #close data files
        x_train_file.close()
        y_train_file.close()
        x_test_file.close()
        y_test_file.close()

        # Convert to numpy for efficiency

        x_train = np.array(x_train)
        y_train = np.array(y_train)
        x_test = np.array(x_test)
        y_test = np.array(y_test)

        ##plot train data
        #plt.figure(figsize=(11,7))
        #colors = ['#D62728','#2C9F2C','#FD7F23','#1F77B4','#9467BD',
        #          '#8C564A','#7F7F7F','#1FBECF','#E377C2','#BCBD27',
        #          '#D62728','#2C9F2C']
        #for i, r in enumerate([0,1,2,3,5,6,7,8,9,10,11,12]):
        #    plt.subplot(7,2,i+1)
        #    plt.plot(x_train[r], label=labels[y_train[r]], color=colors[i], linewidth=2)
        #    plt.xlabel('Samples @50Hz')
        #    plt.legend(loc='upper left')
        #    plt.tight_layout()
        #
        ##Plot Test data
        #plt.figure(figsize=(11,7))
        #colors = ['#D62728','#2C9F2C','#FD7F23','#1F77B4','#9467BD',
        #          '#8C564A','#7F7F7F','#1FBECF','#E377C2','#BCBD27']
        #for i, r in enumerate([0,1,2,3,4,5]):
        #    plt.subplot(3,2,i+1)
        #    plt.plot(x_test[r], label=labels[y_test[r]], color=colors[i], linewidth=2)
        #    plt.xlabel('Samples @50Hz')
        #    plt.legend(loc='upper left')
        #    plt.tight_layout()

        #Analyze dataset
        print('Algorithm running for param %s with k value %i' %
              (dataparam, para_k[dataparam]))
        m = KnnDtw(n_neighbors=para_k[dataparam], max_warping_window=100)
        m.fit(x_train, y_train)
        label, proba = m.predict(x_test)
        #get the weight for this parameter
        if param_weights is None:
            param_labels.append(label)  # no weights: keep the raw labels
        else:
            weight = [para_weights[dataparam]]
            param_labels.append(list(zip(
                label, weight * len(label))))  # a list of (label, weight) tuples

    param_labels = np.array(param_labels)
    if param_weights is None:
        para_mode, para_count = stats.mode(param_labels)
        para_mode = np.reshape(para_mode, (para_mode.shape[1], ))
    else:  # with weights
        para_mode = [0] * param_labels.shape[1]  # final label for each time series
        for i in range(param_labels.shape[1]):
            mode_count = [0] * len(labels)  # accumulated weight per label for this time series
            col = param_labels[:, i]
            for p in col:
                mode_count[int(p[0] - 1)] += p[1]
            para_mode[i] = mode_count.index(
                max(mode_count)
            ) + 1  # the label that accumulated the most weight becomes the overall label
            #para_mode = np.reshape(para_mode,(para_mode.shape[1],))

    #Using mode to see which classification was the most frequent for each data from all parameters used
    #k_val = list(range(1,11))
    #k_fold_cross_val(k_val,x_train,y_train,6)

    #Classification report
    """ASSUMPTION: 
        We're trying to see accuracy of labelling as a result of multi param voting, but 
        we are only comparing to one y_test belonging to one (last) parameter with the current implementation
        we're assuming that y_test is the same across all param which builds on the assumption that
        train/test data for all param are from the same time period!
    """
    from sklearn.metrics import classification_report, confusion_matrix
    from sklearn.metrics import precision_recall_fscore_support as score
    print(
        classification_report(y_test,
                              para_mode,
                              target_names=list(labels.values())))

    #Confusion Matrix
    conf_mat = confusion_matrix(para_mode, y_test)  # rows: predictions, columns: true labels

    fig = plt.figure(figsize=(8, 8))
    width = np.shape(conf_mat)[1]
    height = np.shape(conf_mat)[0]

    res = plt.imshow(np.array(conf_mat),
                     cmap=plt.cm.summer,
                     interpolation='nearest')
    for i, row in enumerate(conf_mat):
        for j, c in enumerate(row):
            if c > 0:
                plt.text(j - .2, i + .1, c, fontsize=16)

    #cb = fig.colorbar(res)
    plt.title('Confusion Matrix for ' + ', '.join(param_list))
    plt.xlabel('Data')
    plt.ylabel('ML Identification')
    _ = plt.xticks(range(len(labels)), list(labels.values()), rotation=90)
    _ = plt.yticks(range(len(labels)), list(labels.values()))
    #print how long this function ran
    print("Runtime was %s seconds" % (time.time() - start_time))
Code Example #8
labels.extend([NOT_FALL] * (len(test_files) - len(labels)))  # pad so every test file has a label

# one bucket per subsample_step value (1 through 5)
results = [[] for _ in range(5)]
certainty = [[] for _ in range(5)]
times = [[] for _ in range(5)]
for i, file in enumerate(test_files):
    print("Predicting for", file)
    data = IMUData()
    with open(file, 'r') as f:
        for line in f:
            data.append(line)
    a = [array(data.kalmanX), array(data.kalmanY), array(data.x), array(data.y), array(data.z)]
    for j in range(1,6):    # testing subsample_step 1 through 5
        model.subsample_step = j
        t = time.time()
        result = model.predict(a)
        times[j-1].append(time.time() - t)
        results[j-1].append(labelResult(labels[i], result.mode[0]))
        certainty[j-1].append(result.count[0][0] / 5)  # fraction of the (presumably 5) neighbours agreeing

csv_name = ".\\test results\\test" + str(round(time.time())) + ".csv"
headers = ["train","test","step","accuracy","precision","recall","certainty","time"]
with open(csv_name, "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    for i in range(0,5):
        acc, pre, rec = accuracy(results[i])
        writer.writerow([
            str(len(train_files)),
            str(len(test_files)),
            str(i + 1),