Example #1
def run_knn_classifier(knn_ctor,
                       k: int,
                       m=0,
                       v=-1,
                       normalize_data=False,
                       features_subset=None) -> KNNResults:
    train_dataset_path = "train.csv"
    test_dataset_path = "test.csv"
    train_samples, train_labels, _ = dl.extract_dataset(train_dataset_path)
    test_samples, test_labels, _ = dl.extract_dataset(test_dataset_path)
    if normalize_data:
        minmaxNormalization(train_samples, test_samples)  # Data Normalization
    knn_classifier = None
    if m == 0:
        if v == -1:
            knn_classifier = knn_ctor(k)
        else:
            knn_classifier = knn_ctor(k, 0, v)
    else:
        if v == -1:
            knn_classifier = knn_ctor(k, m)
        else:
            knn_classifier = knn_ctor(k, m, v)
    knn_classifier.fit(train_samples, train_labels, features=features_subset)
    knn_predict = knn_classifier.predict(test_samples,
                                         features=features_subset)
    _accuracy = metrics.accuracy_score(test_labels, knn_predict)
    _c_m = metrics.confusion_matrix(test_labels, knn_predict)
    _error_w = Error_w(_c_m)
    return KNNResults(accuracy=_accuracy,
                      confusion_matrix=_c_m,
                      error_w=_error_w)
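The `minmaxNormalization` helper called above is not shown; a minimal sketch of the usual pattern it implies, assuming NumPy arrays, fits the min/max on the training samples and applies the same scaling to the test samples:

import numpy as np

def minmax_normalize(train, test, eps=1e-12):
    """Scale each feature to [0, 1] using the training min/max,
    then apply the same transform to the test set."""
    train = np.asarray(train, dtype=float)
    test = np.asarray(test, dtype=float)
    f_min = train.min(axis=0)
    f_range = train.max(axis=0) - f_min
    f_range[f_range == 0] = eps  # guard against constant features
    return (train - f_min) / f_range, (test - f_min) / f_range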
Example #2
def main():
    # training parameter
    result_path = 'results/housingLiR_1.mse'
    model_name = 'housing_shiftAndScale'
    # normalization = Preprocess.zero_mean_unit_var
    normalization = Preprocess.shift_and_scale
    # cols_not_norm = (0,7,12)
    cols_not_norm = []

    # load and preprocess training data
    training_data = loader.load_dataset('data/housing_train.txt')
    testing_data = loader.load_dataset('data/housing_test.txt')
    Preprocess.normalize_features_all(normalization, training_data[0], testing_data[0], cols_not_norm)


    # start training
    model = rm.LinearRegression()
    model.build(training_data[0], training_data[1])
    training_mse = model.test(training_data[0], training_data[1], util.mse)
    testing_mse = model.test(testing_data[0], testing_data[1], util.mse)
    print('Error for training data is:')
    print(training_mse)
    print('Error for testing data is:')
    print(testing_mse)

    result = {}
    result['TrainingMSE'] = str(training_mse)
    result['TestingMSE'] = str(testing_mse)
    result['Theta'] = str(model.theta)

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result)
def get_data():
    file_paths = DataLoader.get_all_files('Data')

    X_list, y_list = [], []

    for file_path in file_paths:
        data_frame = pd.read_csv(file_path)

        abstract_text, abstract_labels = DataLoader.extract_abstract_and_labels(data_frame)
        mesh_terms, title = DataLoader.extract_mesh_and_title(data_frame)

        X = []
        y = []

        for i in range(abstract_text.shape[0]):
            abstract_str = abstract_text[i]
            mesh_str = mesh_terms[i]
            title_str = title[i]
            label = abstract_labels[i]

            text = "".join([abstract_str, " ", mesh_str, " ", title_str])

            X.append(text)
            y.append(label)
        X_list.append(X)
        y_list.append(y)

    return X_list, y_list
def get_term_structure_df(current_date:str, start_date:str, end_date:str, root:str, num_contracts:int):
    prices = {}
    syms, relative_map = dl.get_recent_symbols(current_date, root, num_contracts)

    data = {}
    for sym in syms:
        data[sym] = dl.read_data(sym, 'D', None, None)

    tds = pd.date_range(start_date, end_date)

    skipped_days = set({})

    for td in tds:
        td = str(td.date())
        price_day = {}
        for sym in syms:
            relative_sym = sym.split('.')[1][:-4] + '!' + str(relative_map[sym])
            try:
                price_day[relative_sym] = (data[sym].loc[td][sym + '.close'])
            except Exception as inst:
                #print(inst)
                if inst not in skipped_days:
                    skipped_days.add(inst)
                else:
                    continue
        if price_day:
            prices[td] = price_day
    print(skipped_days)
    _df = pd.DataFrame(prices)
    df = _df.transpose()
    return df
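`pd.DataFrame(prices)` turns the outer dict keys (dates) into columns and the inner keys (relative symbols) into the row index, so the transpose puts dates on the rows. A tiny illustration with made-up symbols and values:

import pandas as pd

prices = {'2020-01-02': {'CL!1': 61.2, 'CL!2': 61.0},
          '2020-01-03': {'CL!1': 63.1, 'CL!2': 62.8}}
df = pd.DataFrame(prices).transpose()  # dates become the row index
print(df)
#             CL!1  CL!2
# 2020-01-02  61.2  61.0
# 2020-01-03  63.1  62.8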
Example #5
File: PB6_test.py  Project: Juncai/CS6140
def main():

    target = 'v2'
    # training parameter
    k = 10  # fold
    layer_thresh = 2
    T = 50
    threshes_path = 'data/spambase.threshes'

    # load and preprocess training data
    training_data = loader.load_dataset('data/spambase.data')

    # load thresholds
    threshes = loader.load_pickle_file(threshes_path)

    # start training
    k_folds = Preprocess.prepare_k_folds(training_data, k)
    tr_data, te_data = Preprocess.get_i_fold(k_folds, 0)
    f_cur = [x[0] for x in tr_data[0]]

    t = dt.DecisionTree()
    if target == 'v1':
        for i in range(100):
            h_y = t.compute_entropy(tr_data[1])
            thresh = threshes[0][30]
            ig = t.compute_ig(f_cur, tr_data[1], thresh, h_y)
    else:
        h_y = t.compute_entropy_v2(tr_data[1])
        thresh = threshes[0][0]
        ig = t.compute_ig_v2(f_cur, tr_data[1], thresh, h_y)
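The `compute_entropy`/`compute_ig` methods used above are not shown; a minimal NumPy sketch of the standard definitions (Shannon entropy of the labels and the information gain of a threshold split) looks roughly like this:

import numpy as np

def entropy(labels):
    """Shannon entropy of a 1-D label array."""
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))

def info_gain(feature, labels, thresh):
    """Information gain of splitting `feature` at `thresh`."""
    feature, labels = np.asarray(feature), np.asarray(labels)
    left, right = labels[feature <= thresh], labels[feature > thresh]
    weighted = (len(left) * entropy(left) + len(right) * entropy(right)) / len(labels)
    return entropy(labels) - weighted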
Example #6
 def topNPerClass(self,
                  datasetGenerator,
                  numImages,
                  nList=[1, 3, 5],
                  savePath="./topN.csv"):
     batchGenerator = DataLoader.oneHotWrapper(
         DataLoader.batchLoader(datasetGenerator, batchSize=16))
     numBatches = int(ceil(numImages / 16.0))
     successCount = {}
     totalCount = {}
     percentCount = {}
     for i in range(numBatches):
         imgMat, labMat = next(batchGenerator)
         predMat = self.predict(imgMat, True)
         for j in range(16):
             truth = np.argmax(labMat[j])
             for n in nList:
                 thisNSuccess = successCount.get(n, {})
                 topN = np.argpartition(-predMat[j], n)[:n]
                 if truth in topN:
                     thisNSuccess[truth] = thisNSuccess.get(truth, 0) + 1
                     successCount[n] = thisNSuccess
             totalCount[truth] = totalCount.get(truth, 0) + 1
     #find the percentages
     for key in totalCount:
         for n in nList:
             thisNPercent = percentCount.get(n, {})
             thisNPercent[key] = float((successCount.get(n, {})).get(
                 key, 0.0)) / totalCount.get(key, 0.0)
             percentCount[n] = thisNPercent
     df = pd.DataFrame.from_dict(percentCount)
     df.to_csv(savePath)
     return df
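The core of the method above is `np.argpartition(-predMat[j], n)[:n]`, which picks the indices of the n largest scores without fully sorting. A small self-contained illustration (the names and scores here are made up):

import numpy as np

def top_n_hit(scores, truth, n):
    """True if class `truth` is among the n highest-scoring indices."""
    top_n = np.argpartition(-np.asarray(scores), n)[:n]  # n best, unordered
    return truth in top_n

scores = np.array([0.1, 0.5, 0.05, 0.3, 0.05])
print(top_n_hit(scores, truth=3, n=3))  # True: index 3 is in the top 3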
Example #7
def __getDataName(numInputErrors):
    """
    Function asks the user for the Data Set on which learning has to be done
    :return: Returns the DataLoader Object of specific Data set
    """

    while True:
        if numInputErrors > MAX_INPUT_ERROR_ALLOWED:
            raise TooManyInputException()
        try:
            dataChoice = input(Constants.inputDataChoice)
        except SyntaxError:
            continue
        if dataChoice == 1:
            data = DataLoader.MnistDataLoader()
            break
        elif dataChoice == 2:
            data = DataLoader.MnistRotated()
            break
        elif dataChoice == 3:
            data = DataLoader.MnistBackground()
            break
        elif dataChoice == 4:
            data = DataLoader.MnistRandomBackground()
            break
        elif dataChoice == 5:
            data = DataLoader.Cifar10DataLoader()
            break
        else:
            numInputErrors += 1
            print(Constants.inputDataChoiceError)
    return data
Example #8
def main():
    kernel = c.COSINE
    # training parameter
    result_path = 'results/PB2_spam.acc'
    model_name = 'digits_' + kernel

    tr_data_path = 'data\\digits\\tr_f_l_10.pickle'
    te_data_path = 'data\\digits\\te_f_l_10.pickle'
    # load and preprocess training data
    tr_data = loader.load_pickle_file(tr_data_path)
    te_data = loader.load_pickle_file(te_data_path)

    # transpose label
    tr_data[1] = np.transpose(tr_data[1])[0]
    te_data[1] = np.transpose(te_data[1])[0]

    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, tr_data[0])
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, te_data[0])
    # start training

    st = time.time()

    # start training
    print('{:.2f} Start training.'.format(time.time() - st))

    for r in (0.15, 0.1):
        clf = kNN.kNN(kernel=kernel, dataset=c.DS_DIGITS)
        clf.fit(tr_data[0], tr_data[1])
        tr_pred = clf.predict(tr_data[0], r=r)
        te_pred = clf.predict(te_data[0], r=r)

        tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
        te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]

        print('{} Final results with kernel {} and r={}. Train acc: {}, Test acc: {}'.format(time.time() - st, kernel, r, tr_acc, te_acc))
Example #9
 def test(self, datasetGenerator, numImages):
     batchGenerator = DataLoader.oneHotWrapper(
         DataLoader.batchLoader(datasetGenerator, batchSize=1))
     results = self.model.evaluate_generator(batchGenerator, numImages)
     print("testing complete")
     for i in range(len(results)):
         print("\t{}: {}".format(self.model.metrics_names[i], results[i]))
Example #10
def construct_dataloader_disk():
    # Construct DataLoader
    opt_data_train = {
        #'data_h5': 'miniplaces_128_train.h5',
        'data_root': '../../data/images/',  # MODIFY PATH ACCORDINGLY
        'data_list': '../../data/train.txt',  # MODIFY PATH ACCORDINGLY
        'load_size': load_size,
        'fine_size': fine_size,
        'data_mean': data_mean,
        'randomize': True
    }
    opt_data_val = {
        #'data_h5': 'miniplaces_128_val.h5',
        'data_root': '../../data/images/',  # MODIFY PATH ACCORDINGLY
        'data_list': '../../data/val.txt',  # MODIFY PATH ACCORDINGLY
        'load_size': load_size,
        'fine_size': fine_size,
        'data_mean': data_mean,
        'randomize': False
    }

    loader_train = DataLoader.DataLoaderDisk(**opt_data_train)
    loader_val = DataLoader.DataLoaderDisk(**opt_data_val)

    return (loader_train, loader_val)
Example #11
    def __init__(self, class_num, batch_size, iters, learning_rate, param):
        self.ClassNum = class_num
        self.BatchSize = batch_size
        self.Iters = iters
        self.LearningRate = learning_rate
        self.target_loss_param = param[0]
        self.domain_loss_param = param[1]
        self.adver_loss_param = param[2]
        Data = DataLoader("office31", source="Amazon", target="Webcam")
        self.SourceData, self.SourceLabel = Data.LoadSource()
        self.TargetData, self.TestData, self.TestLabel = Data.LoadTarget()

        #######################################################################################
        self.source_image = tf.placeholder(tf.float32,
                                           shape=[self.BatchSize, 227, 227, 3],
                                           name="source_image")
        self.source_label = tf.placeholder(
            tf.float32,
            shape=[self.BatchSize, self.ClassNum],
            name="source_label")
        self.target_image = tf.placeholder(tf.float32,
                                           shape=[self.BatchSize, 227, 227, 3],
                                           name="target_image")
        self.Training_flag = tf.placeholder(tf.bool,
                                            shape=None,
                                            name="Training_flag")
        self.KeepProb = tf.placeholder(tf.float32, name='keep_prob')
Example #12
def run():
    n = 100000
    df_android = DataLoader.load_data(r'Data\df_Ready_Data2.csv')
    print('finish read data.')
    # df_android = DataPreProcessor.drop_columns(df_android, ['user_isp_new'])
    df_android = df_android.dropna(how='any')
    col_label_encode = ['user_state', 'user_isp', 'app_cat', 'app_domain']
    df_android = DataEncoder.label_encoder(df_android, col_label_encode)
    col_one_hot_encode = [
        'device_maker', 'geo_location', 'day_of_week', 'part_of_day'
    ]
    # df_android = DataEncoder.encode_one_hot(df_android, list(df_android.columns[1:]))
    df_android = DataEncoder.encode_one_hot(df_android, col_one_hot_encode)
    print('finish encode data.')
    x_train, x_test, y_train, y_test = DataLoader.split_data(
        df_android, 'click')
    print('finish split to train and test')
    x_train, y_train = DataLoader.under_sampling_majority(
        x_train, y_train, 'click')
    print('finish sample data')

    print('start training classifiers...')
    for classifier_name, classifier in eval_classifiers.items():
        trained_model = ModelProcessor.train_model(classifier_name, classifier,
                                                   x_train, y_train)
        trained_models[classifier_name] = trained_model
    print('Finish train classifiers.')
    print('start evaluating classifiers...')
    for model_name, model in trained_models.items():
        predictions = ModelProcessor.predict_samples(model_name, model, x_test)
        score = Evaluator.evaluate_performance_metric('auc', predictions,
                                                      y_test)
        print(f'metric auc- score for [{model_name}]: [{score}]')
Example #13
def main():
    st = time.time()
    # training parameter
    result_path = 'results/PB2_A_spam_polluted_NB_Gaussian.acc'
    model_name = 'spam_'
    train_data_path = 'data/spam_polluted/train/data.pickle'
    test_data_path = 'data/spam_polluted/test/data.pickle'

    tr_data = loader.load_pickle_file(train_data_path)
    te_data = loader.load_pickle_file(test_data_path)
    print('{:.2f} Data loaded!'.format(time.time() - st))

    # start training
    print('{:.2f} Building model...'.format(time.time() - st))
    model = m.NBGaussian()
    model.build(tr_data[0], tr_data[1])

    print('{:.2f} Predicting...'.format(time.time() - st))
    tr_pred = model.predict(tr_data[0])
    te_pred = model.predict(te_data[0])

    print('{:.2f} Calculating results...'.format(time.time() - st))
    tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
    te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]


    print('{} Final results. Train acc: {}, Test acc: {}'.format(time.time() - st, tr_acc, te_acc))

    result = {}
    result['TrainingAcc'] = tr_acc
    result['TestingAcc'] = te_acc

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result, True)
Example #14
def hillClimb(domain, costf):
    tmp = DataLoader.Result("")
    tmp.red = random.sample(RedBalls, 6)
    tmp.blue = random.sample(BlueBalls, 1)[0]
    while 1:
        neighbors = []
        for j in range(len(domain)):
            if tmp.blue > domain[j].blue:
                n = DataLoader.Result("")
                n.blue = tmp.blue - 1
                for i in range(len(tmp.red)):
                    n.red[i] = tmp.red[i] - 1
                neighbors.append(n)
            if tmp.blue < domain[j].blue:
                n = DataLoader.Result("")
                n.blue = tmp.blue + 1
                for i in range(len(tmp.red)):
                    n.red[i] = tmp.red[i] + 1
                neighbors.append(n)
        current = costf(tmp, domain)
        best = current
        for j in range(len(neighbors)):
            cost = costf(neighbors[j], domain)
            if cost > best:
                best = cost
                tmp = neighbors[j]

        if best == current:
            break
    print(best)
    print(tmp)
    return tmp
Example #15
 def __set_stopwords(self):
     data_loader = DataLoader('./vietnamese-stopwords.txt')
     content = data_loader.read_file()
     stop_words_set = set(line.strip() for line in content.split("\n"))
     stop_words_set = set(stop_word.strip().replace(" ", "_")
                          for stop_word in stop_words_set)
     self.stop_words = stop_words_set
Example #16
def run_pipeline_with_pretrained_doc2vec():
    all_documents = DataLoader.load_all_documents()
    adjacency_matrix_references_all_documents = DataLoader.load_adjacency_matrix_all_documents(
    )
    # Load model
    model = DataLoader.load_model()
    # Runs HDBSCAN, returns a list of labels (a label for each documents. -1 == outlier)
    labels = Clustering.run_hdbscan(model=model,
                                    min_cluster_size=4,
                                    min_samples=4)

    # Extracts the documents which have been clustered such that we have no outliers
    # Mask denotes the ones to include and exclude. Labels of the clustered documents and the clustered documents
    mask, labels_subset, clustered_documents = Clustering.extract_clustered_documents(
        all_documents, labels)

    # Creates the adjacency matrix for references between clusters
    cluster_references_adjacency = Clustering.create_adjacency_matrix_for_clusters(
        mask=mask,
        labels=labels_subset,
        adjacency_references_all_documents=
        adjacency_matrix_references_all_documents)

    # k-nearest undirected adjacency
    cluster_references_adjacency = Clustering.make_adjacency_matrix_undirected(
        cluster_references_adjacency, k=3)
    DataLoader.save_data(cluster_references_adjacency, clustered_documents,
                         labels_subset)

    # Creates the graph and sets up the interactive webpage showing the graph
    visualization.doc_to_vec_visualize(documents=clustered_documents,
                                       adj_matrix=cluster_references_adjacency,
                                       labels=labels_subset)
Example #17
def loadData(dir,t,o,l):
        
        # Loading Twitter training features
        global tw_train_all
        global tw_train_all_recent
        global tw_train_base
        global tw_train_base_recent

        tw_train_all = dl.load_tw_train_all(dir,t,o,l)
        tw_train_all_recent = dl.load_tw_train_all_recent(dir,t,o,l)
        tw_train_base = dl.load_tw_train_base(dir,t,o,l)
        tw_train_base_recent = dl.load_tw_train_base_recent(dir,t,o,l)


        # Loading Youtube training features
        global yt_train_all
        global yt_train_all_recent
        global yt_train_base
        global yt_train_base_recent
        
        yt_train_all = dl.load_yt_train_all(dir,t,o,l)
        yt_train_all_recent = dl.load_yt_train_all_recent(dir,t,o,l)
        yt_train_base = dl.load_yt_train_base(dir,t,o,l)
        yt_train_base_recent = dl.load_yt_train_base_recent(dir,t,o,l)

        # Loading Labeling data
        global popular_train
        global popular_train_recent
        global viral_train
        global viral_train_recent

        labeled = dl.load_labeling(dir,t,o,l)
        popular_train = np.array(labeled[0])
        viral_train = np.array(labeled[1])

        # Create viral and popular list
        global vap_train
        vap_train=[]
        for i in range(popular_train.size):
                if ((popular_train[i]==1) and (viral_train[i]==1)):
                        vap_train.append(1)
                else:
                        vap_train.append(0)

        vap_train = np.array(vap_train)

        labeled_recent = dl.load_labeling_recent(dir,t,o,l)
        popular_train_recent = np.array(labeled_recent[0])
        viral_train_recent = np.array(labeled_recent[1])

        # Create viral and popular list
        global vap_train_recent
        vap_train_recent = []
        for i in range(popular_train_recent.size):
                if ((popular_train_recent[i]==1) and (viral_train_recent[i]==1)):
                        vap_train_recent.append(1)
                else:
                        vap_train_recent.append(0)

        vap_train_recent = np.array(vap_train_recent)
Example #18
def build_model(training_data, config):
    '''
    Build model from the config and training data
    '''
    m_type = config[c.CLSFR_TYPE]
    if m_type == c.DT_WITH_IG:
        # for decision tree
        # load thresholds
        threshs = loader.load_arrays(config[c.THRESHS])

        tree = Tree.Tree()
        tree.build(utils.split_on_ig, training_data[0],
                   training_data[1], threshs, config[c.TERM_CON], int(config[c.TERM_THRESH]))
        return tree
    elif m_type == c.REGRESSION_TREE:
        # for regression tree
        # load thresholds
        threshs = loader.load_arrays(config[c.THRESHS])

        tree = Tree.Tree()
        tree.build(utils.split_on_mse, training_data[0],
                  training_data[1], threshs, config[c.TERM_CON], float(config[c.TERM_THRESH]))
        return tree
    elif m_type == c.REGRESSION:
        # for linear regression
        reg_model = rmodel.Regression()
        reg_model.build(training_data[0], training_data[1])
        return reg_model
Example #19
 def test_update(self):
     print("This program created by Phuong Pham to test an update dataset ")
     record = DataLoader.readRecord(
         self.data, 1)  # test the first  record from the dataset
     DataLoader.updateRecord(
         record, "Value",
         100)  # update new value equal 100 for field 'Value'
     self.assertEqual(record['Value'], 10)  # test for field 'Value'
Example #20
 def __init__(self, windowSize, paramIndex, threshold):
     self.windowSize = windowSize  # 1 = 5 min
     self.maxEpoch = 0
     self.paramIndex = paramIndex
     self.learningRate = 0
     self.threshold = threshold
     self.normalData = DataLoader.NormalDataLoader(self.paramIndex, 'train')
     self.unstableData = DataLoader.UnstableDataLoader(self.paramIndex, 'test')
Example #21
def test():

    # load and preprocess training data
    # tr_data = loader.load_pickle_file(tr_data_path)
    te_data= loader.load_pickle_file(te_data_path)
    model = loader.load_pickle_file(model_path)
    # te_pred_dict = loader.load_pickle_file(te_pred_dict_path)

    test_pred_dict = {}
    for i in range(9):
        test_pred_dict[i] = {}
        for j in range(i + 1, 10):
            if i == j:
                continue
            # get training data for this class
            clf = model[i][j]
            te_pred = clf.predict(te_data[0])
            test_pred_dict[i][j] = te_pred


    te_n = len(te_data[1])
    te_pred = np.zeros((1, te_n))[0]

    for i in range(te_n):
        votes = np.zeros((10,), dtype=int)
        for j in range(9):
            for k in range(j):
                votes[j] += 1 if test_pred_dict[k][j][i] == -1 else 0
            for kk in test_pred_dict[j]:
                votes[j] += 1 if test_pred_dict[j][kk][i] == 1 else 0
        count = np.bincount(votes)
        if count[-1] == 1:
            te_pred[i] = votes.argmax()
        else:
            te_pred[i] = votes.argmax()
            tie_ind = [votes.argmax()]
            cc = 0
            for ind_v, v in enumerate(votes):
                if v == votes.max():
                    if cc == 1:
                        tie_ind.append(ind_v)
                        break
                    else:
                        cc += 1
            te_pred[i] = tie_ind[0] if test_pred_dict[tie_ind[0]][tie_ind[1]][i] == 1 else tie_ind[1]
            print('{} Tie! {} wins.'.format(count[-1], te_pred[i]))


    acc = 0
    acc_n = 0
    for ind_l, l in enumerate(te_data[1]):
        acc += 1 if l == te_pred[ind_l] else 0

    acc /= te_n
    # acc = (te_data[1] == te_pred).sum() / te_n

    print('Acc: {}'.format(acc))
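The voting logic above pairs every class against every other and breaks ties with the classifier of the tied pair. Stripped of that bookkeeping, one-vs-one majority voting for a single sample reduces to the sketch below (the +1/-1 convention is an assumption; ties here simply fall to the lowest index via `argmax`):

import numpy as np

def ovo_vote(pair_preds, n_classes):
    """Majority vote over one-vs-one decisions for one sample.
    pair_preds maps (i, j) with i < j to +1 (class i wins) or -1 (class j wins)."""
    votes = np.zeros(n_classes, dtype=int)
    for (i, j), p in pair_preds.items():
        votes[i if p == 1 else j] += 1
    return int(votes.argmax()), votes

# Hypothetical pairwise decisions for a 3-class problem.
label, votes = ovo_vote({(0, 1): -1, (0, 2): -1, (1, 2): 1}, 3)
print(label, votes)  # 1 [0 2 1]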
Example #22
    def __init__(self, Sfilename, model='Net1_FFN_v7', verbose=False):
        self.index = 0
        self.settingfuncs = [self.setting1, self.setting2, self.setting3,
                             self.setting4, self.setting5, self.setting6,
                             self.setting7, self.setting8]
        if isinstance(model, str):
            self.modelname = model

            self.model = FFN(model)
            self.model.Load(verbose=verbose)

        elif isinstance(model, FFN):
            self.model = model
            self.modelname = self.model.name

        mask1, pmask1 = self.model.apply_mask(Sfilename)

        rgb, self.TitleStr = Vis.FalseColour(Sfilename, False)

        scn = DL.scene_loader(Sfilename)
        scn.load(['bayes_in', 'probability_cloud_single_in'])
        bmask = DL.upscale_repeat(scn['bayes_in'].values).astype('int')
        bmask = 1 - ((bmask & 2) / 2)
        bpmask = DL.upscale_repeat(scn['probability_cloud_single_in'].values)

        self.im1 = plt.imshow(rgb)
        plt.title('False colour image\n' + self.TitleStr)

        self.im2 = plt.imshow(mask1, cmap='Blues')
        self.im2.set_visible(False)

        bmask = DL.extract_mask(Sfilename, 'bayes_in', 2)
        self.im3 = plt.imshow(bmask, cmap='Reds')
        self.im3.set_visible(False)

        mask1 = mask1.astype('bool')
        temp = np.copy(rgb)
        temp[~mask1, :] = 254 / 255, 253 / 255, 185 / 255
        self.im4 = plt.imshow(temp)
        self.im4.set_visible(False)

        rgb[mask1, :] = 74 / 255, 117 / 255, 50 / 255
        self.im5 = plt.imshow(rgb)
        self.im5.set_visible(False)

        self.im6 = plt.imshow(1 - pmask1, cmap='Oranges')
        self.im6.set_visible(False)

        self.im7 = plt.imshow(1 - bpmask, cmap='Reds')
        self.im7.set_visible(False)

        maskdiff = bmask - mask1
        self.im8 = plt.imshow(maskdiff, cmap='bwr')
        self.im8.set_visible(False)

        self.cbset = False
        self.cb = None
Example #23
    def get_cached_by_extent(self, fn, extent, buffer):
    
        geom = DataLoader.extent_to_transformed_geom(extent, "epsg:2794")
        geom = shapely.geometry.shape(geom)
        new_fn = None
        for i, boundary_shape in enumerate(highres_boundary_shapes):
            if boundary_shape.contains(geom):
                new_fn = highres_fns[i]
                break
        
        if new_fn is None:
            print("No intersections")
            new_fn = fn
            new_fn = new_fn.replace("esri-naip/", "full-usa-output/1_3_2019/")[:-4] + "_prob.tif"

        f = rasterio.open(new_fn, "r")
        geom = DataLoader.extent_to_transformed_geom(extent, f.crs["init"])
        pad_rad = 15 # TODO: this might need to be changed for much larger inputs
        buffed_geom = shapely.geometry.shape(geom).buffer(pad_rad)
        minx, miny, maxx, maxy = buffed_geom.bounds
        geom = shapely.geometry.mapping(shapely.geometry.box(minx, miny, maxx, maxy, ccw=True))
        out_image, out_transform = rasterio.mask.mask(f, [geom], crop=True, nodata=-1)
        src_crs = f.crs.copy()
        f.close()
        
        dst_crs = {"init": "EPSG:%s" % (extent["spatialReference"]["latestWkid"])}
        dst_transform, width, height = rasterio.warp.calculate_default_transform(
            src_crs,
            dst_crs,
            width=out_image.shape[2], height=out_image.shape[1],
            left=buffed_geom.bounds[0],
            bottom=buffed_geom.bounds[1],
            right=buffed_geom.bounds[2],
            top=buffed_geom.bounds[3],
            resolution=1
        )

        dst_image = np.zeros((out_image.shape[0], height, width), np.uint8)
        rasterio.warp.reproject(
            source=out_image,
            destination=dst_image,
            src_transform=out_transform,
            src_crs=src_crs,
            dst_transform=dst_transform,
            dst_crs=dst_crs,
            resampling=rasterio.warp.Resampling.nearest
        )
        
        # Calculate the correct padding
        w = extent["xmax"] - extent["xmin"]
        padding = int(np.round((dst_image.shape[1] - w) / 2))

        dst_image = np.rollaxis(dst_image, 0, 3)
        dst_image = dst_image[padding:-padding, padding:-padding, :]

        return dst_image / 255.0, "highres_prob_predictions_quantized_compressed_5_11_2018"
 def __init__(self, windowSize, maxEpoch, paramIndex, learningRate, threshold):
     self.windowSize = windowSize
     self.maxEpoch = maxEpoch
     self.paramIndex = paramIndex
     self.learningRate = learningRate
     self.threshold = threshold
     self.embeddingDim = 128
     self.normalData = DataLoader.NormalDataLoader(self.paramIndex, 'train')
     self.unstableData = DataLoader.UnstableDataLoader(self.paramIndex, 'test')
     self.wantToShuffle = False
     self.statistics = {}
Example #25
def main():
    parser = GooeyParser(prog="example_progress_bar_1")
    parser.add_argument('FolderChooser',
                        help="name of the file to process",
                        widget='DirChooser')
    parser.add_argument('FolderDist',
                        help="name of the file to process",
                        widget='DirChooser')
    args = parser.parse_args(sys.argv[1:])

    # Loading directory data
    dl.start(args.FolderChooser, args.FolderDist)
Example #26
 def setUp(self):
     if not hasattr(self, 'students'):
         self.students = DataLoader.load_students_from_file(
             'resources/students.json')
         self.classes = DataLoader.load_classes_from_file(
             'resources/classes.json')
         self.students2 = DataLoader.load_students_from_file(
             'resources/students2.json')
         self.classes2 = DataLoader.load_classes_from_file(
             'resources/classes2.json')
         self.students3 = DataLoader.load_students_from_file(
             'resources/students3.json')
Example #27
def get_cs(data_path, cs_path):
    # dp compute cheat sheet
    cs = None
    if os.path.isfile(cs_path):
        cs = loader.load_pickle_file(cs_path)
        print('CS loaded.')
    else:
        print('Start compute cs.')
        data = loader.load_pickle_file(data_path)
        cs = dp_compute_cs(data[0])
        loader.save(cs_path, cs)
        print('CS saved.')
    return cs
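The compute-or-load-from-pickle pattern in `get_cs` is generic; a minimal standalone version of it (the function and path names here are illustrative):

import os
import pickle

def load_or_compute(cache_path, compute_fn):
    """Return the cached object if it exists, otherwise compute and cache it."""
    if os.path.isfile(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    result = compute_fn()
    with open(cache_path, 'wb') as f:
        pickle.dump(result, f)
    return result

# e.g. cs = load_or_compute('cs.pickle', lambda: dp_compute_cs(data[0]))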
Example #28
def start(port=5000):
    ql.initialize_static_questions(TREE)
    app.run(port=port)

    request_endpoint = '{0}/me/messenger_profile'.format(bot.graph_url)
    response = requests.post(
        request_endpoint,
        params=bot.auth_args,
        data=json.dumps({"get_started": {"payload": "first"}}),
        headers={'Content-Type': "application/json"}
    )
    result = response.json()
    Bot.send_raw(response)
Example #29
def LoadData():
    Data = DL.GetData(transform=False, includeDraw=False)
    TData = []

    bits = 7

    for entry in Data:
        reshaped = TL.FENtoBits(entry[0], bits, True)
        reshaped = np.reshape(reshaped, (8, 8, bits))
        TData.append([reshaped, entry[1]])

    TData = DL.StartifiedData(TData)
    return TData
Example #30
def pred_patch():
    ''' Method called for POST `/predPatch`'''
    bottle.response.content_type = 'application/json'
    data = Dict(bottle.request.json)
    data["remote_address"] = bottle.request.client_ip

    SESSION_HANDLER.get_session(bottle.request.session.id).add_entry(
        data)  # record this interaction

    # Inputs
    extent = data.extent
    dataset = data.dataset
    name_list = [item.name for item in dataset.class_list]

    # Load the input data sources for the given tile
    if dataset.metadata.id not in DATASETS:
        raise ValueError(
            "Dataset doesn't seem to be valid, do the datasets in js/tile_layers.js correspond to those in TileLayers.py"
        )

    loaded_query = DATASETS[
        dataset.metadata.id]["data_loader"].get_data_from_extent(extent)
    SESSION_HANDLER.get_session(
        bottle.request.session.id).current_transform = (
            loaded_query["src_crs"], loaded_query["src_transform"])

    #   Run a model on the input data
    model = SESSION_HANDLER.get_session(bottle.request.session.id).model
    output = model.run(loaded_query["src_img"], extent, False)
    loaded_query["src_img"] = None  # save memory
    assert len(
        output.shape
    ) == 3, "The model function should return an image shaped as (height, width, num_classes)"
    assert (
        output.shape[2] < output.shape[0] and output.shape[2] < output.shape[1]
    ), "The model function should return an image shaped as (height, width, num_classes)"  # assume that num channels is less than img dimensions

    #   Warp output to EPSG:3857
    output, output_bounds = DL.warp_data_to_3857(output,
                                                 loaded_query["src_crs"],
                                                 loaded_query["src_transform"],
                                                 loaded_query["src_bounds"])

    # ------------------------------------------------------
    # Step 5
    #   Convert images to base64 and return
    # ------------------------------------------------------
    img_soft = np.round(utils.class_prediction_to_img(output)).astype(np.uint8)
    data["output_soft"] = DL.encode_rgb(img_soft)
    bottle.response.status = 200
    return json.dumps(data)
Example #31
def random_select_data(tr_save_path, sel_tr_save_path, percent):
    all_tr = loader.load_pickle_file(tr_save_path)

    tr_l_ind_dict = {}
    selected_tr_data = [[], []]
    for i in range(10):
        tr_l_ind_dict[i] = [l_ind for l_ind, l in enumerate(all_tr[1]) if l == i]
    for i in range(10):
        i_n = len(tr_l_ind_dict[i])
        pick_n = int(percent * i_n)
        cur_pick_ind = np.random.choice(tr_l_ind_dict[i], pick_n, replace=False).tolist()
        selected_tr_data[0].extend([x for x_ind, x in enumerate(all_tr[0]) if x_ind in cur_pick_ind])
        selected_tr_data[1].extend([y for y_ind, y in enumerate(all_tr[1]) if y_ind in cur_pick_ind])
    loader.save(sel_tr_save_path, selected_tr_data)
def load_dataset(dataset, architecture, batch_size, device, path):
    if dataset == "imdb":
        if architecture == "cnn":
            data = DataLoader.IMDB_CNN_CUSTOM(batch_size, device, path)
        elif architecture == "lstm":
            data = DataLoader.IMDB_LSTM(batch_size, device, path)

    elif dataset == "agnews":
        data = DataLoader.AGNEWS(batch_size, device, path)

    else:
        raise ValueError(dataset + " is not supported")

    return data
    def evaluate_data(self):
        for ii, batch in enumerate(self.batch_data_dict):
            print("[Batch %s]: %d / %d" % (batch, ii + 1, self.total_batches))
            data_dict = self.batch_data_dict[batch]
            for job_id, job in enumerate(data_dict):

                self.agents = {}
                path, format = data_dict[job]['path'], data_dict[job][
                    'data_format']
                if "log" in path.split("/")[-1]: continue

                data_loader = DataLoader(path, format)
                data_scale = format_dict[format]['data_type_scale']
                capture_freq = format_dict[format]['capture_freq']
                prev_time = 0
                curr_time = 0
                time_frame_done = False
                normalized_time_step = 0
                curr_agents = {}
                for _ in tqdm(range(data_loader.total_time_steps + 1)):
                    if data_loader.done:
                        for evaluation_metric in self.evaluation_metrics[
                                batch][job_id]:
                            evaluation_metric.evaluate(self.agents,
                                                       curr_agents, prev_time)
                        break
                    data = data_loader.step_data()
                    time_step, agent_id, x, y = self.parse_into_state(
                        data, format)
                    curr_time = time_step
                    if ((curr_time != prev_time and _ != 0)
                            or _ == data_loader.total_time_steps):
                        if normalized_time_step > ignored_time_steps:
                            for evaluation_metric in self.evaluation_metrics[
                                    batch][job_id]:
                                evaluation_metric.evaluate(
                                    self.agents, curr_agents, prev_time)
                        curr_agents = {}
                        normalized_time_step += 1
                    if agent_id not in self.agents:
                        self.agents[agent_id] = Agent(agent_id, [x, y],
                                                      curr_time, capture_freq)
                    else:
                        self.agents[agent_id].update_state([x, y], curr_time)
                    curr_agents[agent_id] = self.agents[agent_id]
                    prev_time = curr_time
                self.write_stats(job_id, job, batch)
                self.write_batch_stats(batch)

            self.df_dict = {}
Example #34
def main():
    is_sklearn = False
    # kernel = c.COSINE
    # kernel = c.GAUSSIAN
    kernel = c.POLY
    # training parameter
    result_path = 'results/PB2_spam.acc'
    model_name = 'digits_' + kernel
    model_path = 'data/PB1_B_digits_sk_Gaussian_1.model'

    # tr_data_path = 'data\\digits\\tr_f_l.pickle'
    # te_data_path = 'data\\digits\\te_f_l.pickle'
    tr_data_path = 'data\\digits\\tr_f_l_10.pickle'
    te_data_path = 'data\\digits\\te_f_l_10.pickle'
    # load and preprocess training data
    tr_data = loader.load_pickle_file(tr_data_path)
    te_data = loader.load_pickle_file(te_data_path)

    # transpose label
    tr_data[1] = np.transpose(tr_data[1])[0]
    te_data[1] = np.transpose(te_data[1])[0]

    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, tr_data[0])
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, te_data[0])


    # start training
    models = []
    st = time.time()

    # start training
    print('{:.2f} Start training.'.format(time.time() - st))

    for k in (1, 3, 7):
        if not is_sklearn:
            clf = kNN.kNN(kernel=kernel)
            clf.fit(tr_data[0], tr_data[1])
            tr_pred = clf.predict(tr_data[0], k=k)
            te_pred = clf.predict(te_data[0], k=k)
        else:
            clf = KNeighborsClassifier(n_neighbors=k, metric=cosine_distances)
            clf.fit(tr_data[0], tr_data[1])
            tr_pred = clf.predict(tr_data[0])
            te_pred = clf.predict(te_data[0])

        tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
        te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]
        models.append(clf)
        print('{} Final results with kernel {} and k={}. Train acc: {}, Test acc: {}'.format(time.time() - st, kernel, k, tr_acc, te_acc))
Example #35
    def __init__(self, run=True):

        if run==True:
            self.loader = DataLoader()  

            while True:
                t = datetime.datetime.now().strftime("%H:%M:%S")
                if not ((t>"09:30:00" and t<"11:30:00") or (t>"13:00:00" and t<"15:00:00")):
                    print "Not Trading Time, Resting ......"
                else:
                    self.loader.load() 
                    m = self.monitor(self.loader.option_rt, self.loader.index_future_plus_dvd(self.loader.index_future_rt),
                                     self.loader.etf_index_rt) 
                    self.display(m, "Option Complex Monitor") 
                time.sleep(10)
Example #36
def update_price_value(exchanges, symbol, freq):
    if symbol is None:
        raise PreventUpdate

    start_dt = None
    end_dt = None
    df = DataLoader.read_data(symbol, freq, start_dt, end_dt)
    df_summary = dashlib.generate_summary_table(df, symbol)
    data = df_summary.to_dict('records')

    columns = [{'name': k.capitalize(), 'id': k} for k in data[0].keys()]

    fig = go.Figure(data=[
        go.Candlestick(x=df.index,
                       open=df['open'],
                       high=df['high'],
                       low=df['low'],
                       close=df['close'],
                       increasing_line_color='red',
                       decreasing_line_color='green')
    ])

    fig.update_layout(title=symbol + ' ' + 'klines',
                      xaxis_rangeslider_visible=True,
                      height=600)
    return fig, columns, data
Example #37
def app_main():
    try:
        logger = lgr.Logger()
        print("--------application starting--------------")
        logger.info("--------application starting--------------")
        print("--------loading data----------------------")
        logger.info("--------loading data----------------------")
        dl = dataLoader.DataLoader()
        train_X, test_X = train_test_split(dl.DataFrame.copy(),
                                           test_size=0.3,
                                           random_state=42)
        # label = train_X["median_house_value"].copy()
        # housing = train_X.drop("median_house_value", axis=1)
        housing = test_X.drop("median_house_value", axis=1)
        #label_test = test_X["median_house_value"].copy()
        numeric_cols = list(housing.columns.values)
        category_cols = ["ocean_proximity"]
        numeric_cols.remove("ocean_proximity")
        dp = preproc.DataPreProcess(numeric_cols, category_cols)
        print("--------processing data-------------------")
        dataProcessed = dp.getProcessedData(housing)
        print(dataProcessed.shape)
        configMagt = cmfmagt.ConfigManager()
        engine = mlEngine.ModelEngine()
        print("------------loading model------------------")
        bestModel = engine.loadML(configMagt.config["APPSETTING"]["ml_path"])
        result = bestModel.predict(dataProcessed)
        print(result.shape)
    except Exception as e:
        print("Error : ", str(e))
Example #38
def getinputsCNN(Sreference, indices):
    row = (indices / 3000).astype(int)
    col = (indices % 3000).astype(int)
    if type(Sreference) == str:
        scn = DL.scene_loader(Sreference)
    else:
        scn = Sreference

    scn.load(['S1_an'])
    S1 = np.nan_to_num(scn['S1_an'].values)
    data = []

    for i in range(len(row)):
        coords = get_coords(row[i], col[i], 50, True)
        star = []
        for arm in coords:
            if len(arm) > 0:
                arm = np.array(arm)
                arm_row = arm[:, 0]
                arm_col = arm[:, 1]
                arm_data = S1[arm_row, arm_col]
                star.append(arm_data)
            else:
                star.append([])
        data.append(star)

    return data
def get_distance(directory):
    '''returns the distance in m'''
    file = directory + '/info.dat'
    info = dl.load(file)[0]
    distancestr = info['Distance']
    distance = 0.01*float(distancestr.replace('cm',''))
    return distance
Example #40
def predict():
    """
    An example of how to load a trained model and use it
    to predict labels.
    """
    # load the saved model
    classifier = pickle.load(open("best_model.p", "rb"))

    # compile a predictor function
    predict_model = theano.function(
        inputs=[classifier.input],
        outputs=classifier.y_pred)

    # We can test it on some examples from the test set
    dataset = 'mnist_train.csv'
    datasets = DataLoader.load_kaggle_mnist(dataset)

    test_set_x, test_set_y = datasets[2]
    print(type(test_set_x))
    print(type(test_set_y))
    test_set_x = test_set_x.get_value()
    test_set_y = test_set_y.eval()


    predicted_values = predict_model(test_set_x[20:30])
    print("Sample Neural Prediction")
    print ("Predicted values for the first 20 examples in test set:")
    print(predicted_values)
    print ("The actual values are")
    print(test_set_y[20:30])
Example #41
def update_volume_value(symbol):
    if symbol is None:
        raise PreventUpdate

    start_dt = None
    end_dt = None
    df = DataLoader.read_data(symbol, 'D', start_dt, end_dt)

    volume_fig_dict = dict({
        'data': [{
            'x': df.index,
            'y': df['volume'],
            'type': 'bar',
            'name': symbol
        }],
        'layout': {
            'title': symbol + ' volume'
        }
    })

    oi_fig_dict = dict({
        'data': [{
            'x': df.index,
            'y': df['close_oi'] - df['open_oi'],
            'type': 'bar',
            'name': symbol
        }],
        'layout': {
            'title': symbol + ' net_oi'
        }
    })

    volume_fig = go.Figure(volume_fig_dict)
    oi_fig_fig = go.Figure(oi_fig_dict)
    return volume_fig, oi_fig_fig
Example #42
def get_balanced_dataset(in_memory=False, TMP_WHOLE_UNBALANCED=False):
    from ActiveLearning.LargeDatasetHandler_AL import LargeDatasetHandler_AL
    import Settings

    # init structures
    import mock
    args = mock.Mock()
    args.name = "test"

    settings = Settings.Settings(args)
    WholeDataset = LargeDatasetHandler_AL(settings)

    # load paths of our favourite dataset!
    import DataLoader, DataPreprocesser, Debugger
    import DatasetInstance_OurAerial

    dataLoader = DataLoader.DataLoader(settings)
    debugger = Debugger.Debugger(settings)

    #h5_file = settings.large_file_folder + "datasets/OurAerial_preloadedImgs_subBAL3.0_1.0_sel2144_res256x256.h5"
    h5_file = settings.large_file_folder + "datasets/OurAerial_preloadedImgs_subBAL3.0_1.0_sel2144_res256x256_SMALLER.h5"

    datasetInstance = DatasetInstance_OurAerial.DatasetInstance_OurAerial(
        settings, dataLoader, "256_cleanManual")

    if not TMP_WHOLE_UNBALANCED:
        # ! this one automatically balances the data + deletes misfits in the resolution
        data, paths = datasetInstance.load_dataset()
        lefts_paths, rights_paths, labels_paths = paths
        print("Paths: L,R,Y ", len(lefts_paths), len(rights_paths),
              len(labels_paths))

    else:
        # ! this one loads them all (CHECK: would some be deleted?)
        paths = datasetInstance.load_dataset_ONLY_PATHS_UPDATE_FROM_THE_OTHER_ONE_IF_NEEDED(
        )
        lefts_paths, rights_paths, labels_paths = paths
        print("Paths: L,R,Y ", len(lefts_paths), len(rights_paths),
              len(labels_paths))

    WholeDataset.initialize_from_just_paths(paths)

    if in_memory:
        assert not TMP_WHOLE_UNBALANCED
        #WholeDataset.keep_it_all_in_memory()
        WholeDataset.keep_it_all_in_memory(h5_file)

    npy_path = settings.large_file_folder + "datasets/OurAerial_preloadedImgs_BALCLASS.npy"

    I_WANT_TO_RECOMPUTE_THE_LABELS = False
    if I_WANT_TO_RECOMPUTE_THE_LABELS:
        assert False  # don't want to mistakenly recompute these ...
        WholeDataset.compute_per_tile_class_in_batches()
        WholeDataset.save_per_tile_class(npy_path)

    WholeDataset.load_per_tile_class(npy_path)

    WholeDataset.report()

    return WholeDataset
Example #43
def compute_feature_mean(features, save_path):
    n, d = np.shape(features)

    means = []
    for i in range(d):

        cur_f = features[:, i]
        means.append(np.nanmean(cur_f))

        # cur_mean = 0
        # for f in features:
        #     if not np.isnan(f[i]):
        #         cur_mean += f[i]
        # means.append(cur_mean / n)
    means = np.array(means)
    loader.save(save_path, means)
    return means
Example #44
    def train(self, X, y, model, batch_generator, n_epochs=50, optim_algo='adam',
              criterion='categorical_crossentropy', save_model=True, verbose=2,
              plot=True, batch_size=64,):

        if optim_algo == 'adam':
            optim_algo = Adam()
        elif optim_algo == 'sgd':
            optim_algo = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
        elif optim_algo == 'adagrad':
            optim_algo = Adagrad()

        self.model.compile(optimizer=optim_algo, loss=criterion)

        loss_train_history = []
        loss_val_history = []
        batch_history = {'f1': [], 'recall': [], 'precision': []}

        for epoch in range(1, n_epochs + 1):
            batch_f1_history = []
            batch_precision_history = []
            batch_recall_history = []

            for X, y in batch_generator.next_batch():
                history = self.model.fit(X, y, nb_epoch=1, batch_size=batch_size,
                                         validation_split=0.2, verbose=0)

                val_loss, loss = history.history['val_loss'][0], history.history['loss'][0]

                loss_train_history.append(loss)
                loss_val_history.append(val_loss)

                truth = self.model.validation_data[3]
                truth = dl.onehot2list(truth)
                batch_prediction = self.predict_classes(self.model.validation_data[0:3])

                batch_f1 = metrics.f1_score(truth, batch_prediction)
                batch_recall = metrics.recall_score(truth, batch_prediction)
                batch_precision = metrics.precision_score(truth, batch_prediction)

                batch_f1_history.append(batch_f1)
                batch_recall_history.append(batch_recall)
                batch_precision_history.append(batch_precision)

            batch_history['f1'].append(batch_f1_history)
            batch_history['recall'].append(batch_recall_history)
            batch_history['precision'].append(batch_precision_history)

            print('Epoch: {} | Train loss: {} | Valid loss: {}'.format(epoch, loss, val_loss))
            print("Epoch Metrics | F1: {} | Recall {} | Precision: {}".format(np.mean(batch_history['f1'][epoch - 1]),
                                                                              np.mean(batch_history['recall'][epoch - 1]),
                                                                              np.mean(batch_history['precision'][epoch - 1])))
            a_max = np.argmax(batch_history['f1'][epoch - 1])
            print("Best F1 at Epoch {} Minibatch {}: {}\n".format(epoch, a_max, batch_history['f1'][epoch-1][a_max]))


        if save_model:
            self.model.save_weights(self.model_name + '.h5', overwrite=True)
Example #45
def main():
    st = time.time()
    # training parameter
    result_path = 'results/PB4_spam_polluted_missing_NB_Bern.acc'
    model_name = 'spam_'
    mean_path = 'data/spam_polluted_missing/train/f_mean.pickle'
    train_data_path = 'data/spam_polluted_missing/train/data.pickle'
    test_data_path = 'data/spam_polluted_missing/test/data.pickle'

    # load and preprocess training data
    tr_data = loader.load_pickle_file(train_data_path)
    te_data = loader.load_pickle_file(test_data_path)
    print('{:.2f} Data loaded!'.format(time.time() - st))

    # load means
    means = loader.load_pickle_file(mean_path)
    print('{:.2f} Means loaded!'.format(time.time() - st))

    # start training
    roc = []
    auc = 0.0

    tr_n, f_d = np.shape(tr_data[0])
    te_n, = np.shape(te_data[1])
    te_auc = 2.
    round = 0
    model = m.NBBernoulli(means)
    model.build(tr_data[0], tr_data[1])

    training_acc = model.test(tr_data[0], tr_data[1], util.acc)
    # training_cms.append(training_test_res[1])
    testing_acc = model.test(te_data[0], te_data[1], util.acc)
    # testing_cms.append(testing_test_res[1])


    print('Final results. Train acc: {}, Test acc: {}'.format(training_acc, testing_acc))

    result = {}
    result['TrainingAcc'] = training_acc
    result['TestingAcc'] = testing_acc

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result, True)
Example #46
def ecoc_test():
    svms = loader.load_pickle_file(model_path)
    te_data= loader.load_pickle_file(te_data_path)
    pred = []

    for f in te_data[0]:
        min_hamming_dist = 1.
        match_label = 0
        code = []
        for s in svms:
            c_pred = s.predict([f])[0]
            code.append(1 if c_pred == 1 else 0)  # replace -1 with 0
        for ind, c in enumerate(ecoc):
            cur_hd = hamming(c, code)
            if cur_hd < min_hamming_dist:
                min_hamming_dist = cur_hd
                match_label = ind
        pred.append(match_label)

    return (pred == te_data[1]).sum() / len(te_data[1])
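Decoding here is nearest-codeword in Hamming distance: each SVM contributes one bit, and the predicted class is the row of the ECOC table closest to the predicted bit string. A minimal NumPy sketch of that decoding step (the codebook below is made up):

import numpy as np

def ecoc_decode(bit_preds, codebook):
    """Index of the codeword closest to the predicted bits (Hamming distance)."""
    bits = np.asarray(bit_preds)
    dists = [(np.asarray(code) != bits).mean() for code in codebook]
    return int(np.argmin(dists))

codebook = [[0, 0, 0, 0, 0],   # class 0
            [1, 1, 1, 0, 0],   # class 1
            [0, 0, 1, 1, 1]]   # class 2
print(ecoc_decode([1, 1, 1, 1, 0], codebook))  # 1 (closest to the class-1 codeword)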
Example #47
def abstract_features(data_path, cs_path, rects_path, res_path):
    # get cs
    cs = get_cs(data_path, cs_path)
    rects = loader.load_pickle_file(rects_path)

    # 2 features for each rectangle
    features = []
    for i, ccs in enumerate(cs):
        f = []
        for rect in rects:
            f.extend(compute_feature_with_cs(rect, ccs))
        features.append(f)
        print('{} rects finished.'.format(i))

    # combine with labels
    label = loader.load_pickle_file(data_path)[1]
    f_l = [np.array(features), label]
    loader.save(res_path, f_l)

    return f_l
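Neither `dp_compute_cs` nor `compute_feature_with_cs` is shown; the "cheat sheet" here is presumably a summed-area table (integral image), which makes each rectangle sum a constant-time lookup. A minimal sketch under that assumption:

import numpy as np

def integral_image(img):
    """Summed-area table with a zero row/column prepended so lookups need no bounds checks."""
    ii = np.cumsum(np.cumsum(np.asarray(img, dtype=float), axis=0), axis=1)
    return np.pad(ii, ((1, 0), (1, 0)))

def rect_sum(ii, top_left, bottom_right):
    """Sum of pixels inside the inclusive rectangle [top_left, bottom_right]."""
    (r1, c1), (r2, c2) = top_left, bottom_right
    return ii[r2 + 1, c2 + 1] - ii[r1, c2 + 1] - ii[r2 + 1, c1] + ii[r1, c1]

img = np.arange(16).reshape(4, 4)
ii = integral_image(img)
print(rect_sum(ii, (1, 1), (2, 2)))  # 5 + 6 + 9 + 10 = 30.0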
def create_filename(directory, num=1):
    """
    creates basic name for a file containing condition and distance information.
    Use num to assign numbers if more than one file per distance will be created
    """
    file = directory + '/info.dat'
    info = dl.load(file)[0]
    distancestr = info['Distance']
    conditionstr = info['Condition']
    speciesName = 'Pholidoptera_littoralis_'
    return speciesName+conditionstr+'_'+distancestr+'_'+str(num)
Example #49
File: Utilities.py  Project: Juncai/CS6140
def get_ecoc(ecoc_path, num_ecoc, class_num):
    if path.isfile(ecoc_path):
        print('Loading the ecoc...')
        best_ecoc = loader.load_pickle_file(ecoc_path)
    else:
        print('Creating the ecoc...')
        best_ecoc = [0, [], []]     # distance, ecoc for training, ecoc for predicting
        for i in range(100):
            n = int(math.pow(2, num_ecoc))
            codes = choice(n, class_num)
            ecoc_func_codes = []
            for i in range(num_ecoc):
                ecoc_func_codes.append([])
            c_ecoc = []
            for c in codes:
                bin_s = ('{0:0' + str(num_ecoc) + 'b}').format(c)
                bin_s = [int(ss) for ss in bin_s]
                c_ecoc.append(bin_s)
                for i in range(num_ecoc):
                    ecoc_func_codes[i].append(bin_s[i])
            c_hamming_dist = 0
            has_same_code = False
            for j in range(len(c_ecoc)):
                for k in range(len(c_ecoc)):
                    if j != k:
                        c_hd = hamming(c_ecoc[j], c_ecoc[k])
                        if c_hd == 0:
                            has_same_code = True
                        c_hamming_dist += c_hd
            if has_same_code:
                continue
            if c_hamming_dist > best_ecoc[0]:
                best_ecoc[0] = c_hamming_dist
                best_ecoc[1] = ecoc_func_codes
                best_ecoc[2] = c_ecoc

        # serialize the best ecoc
        loader.save(ecoc_path, best_ecoc)
    return best_ecoc
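With the formatting fixed, the loop above amounts to a random search for a codebook with distinct rows and a large total pairwise Hamming distance. A compact self-contained version of that idea (a sketch, not the original helper):

import numpy as np

def random_ecoc(num_bits, n_classes, n_trials=100, seed=0):
    """Randomly search for an n_classes x num_bits ECOC codebook with no
    duplicate codewords and a large total pairwise Hamming distance."""
    rng = np.random.default_rng(seed)
    best_dist, best_book = -1, None
    for _ in range(n_trials):
        codes = rng.integers(0, 2 ** num_bits, size=n_classes)
        book = np.array([[int(b) for b in np.binary_repr(c, width=num_bits)]
                         for c in codes])
        if len(np.unique(book, axis=0)) < n_classes:  # duplicate codewords, retry
            continue
        dist = sum((book[i] != book[j]).sum()
                   for i in range(n_classes) for j in range(i + 1, n_classes))
        if dist > best_dist:
            best_dist, best_book = dist, book
    return best_book

print(random_ecoc(num_bits=10, n_classes=10))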
Example #50
def random_select_rectangle(h, w, n, pl, ph, save_path=None):
    '''

    :param h: height of the image in pixel
    :param w: width of the image in pixel
    :param n: number of rectangle
    :param pl: min pixels of each rectangle
    :param ph: max pixels of each rectangle
    :return:
    '''
    sel_rects = []
    for i in range(n):
        a = -1
        while a < pl or a > ph:
            p1 = (random.randint(0, h - 1), random.randint(0, w - 1))
            p2 = (random.randint(0, h - 1), random.randint(0, w - 1))
            a = rect_area(p1, p2)
        sel_rects.append(((min(p1[0], p2[0]), min(p1[1], p2[1])), (max(p1[0], p2[0]), max(p1[1], p2[1]))))

    if save_path is not None:
        loader.save(save_path, sel_rects)

    return sel_rects
Example #51
File: svm.py  Project: hariravi/KaggleMLYH
def predict_main(classifier_pickle):
    data = DataLoader.load_kaggle_mnist("mnist_train.csv", neural=False)
    X = numpy.array(data[2][0])
    X = X/255.0*2 - 1
    Y = numpy.array(data[2][1])
    predictor = MLutil.Predictor(classifier_pickle, 'SVM')
    predicted_values = predictor.make_prediction(X)

    predAnalysis = MLutil.PredictionAccuracies(predicted_values, Y)
    print(predAnalysis.get_misclass_rate())
    print(predAnalysis.get_indicies_misclassifications())

    pickle.dump(predAnalysis.get_indicies_misclassifications(), open("svm_indicies.p", "wb"))
    return predAnalysis.get_indicies_misclassifications()
Example #52
def ecoc():

    # training parameter
    c = 0.001
    tol = 0.01
    epsilon = 0.001
    # kernel = 'rbf'
    kernel = 'linear'

    # load and preprocess training data
    print('Loading data...')
    tr_data = loader.load_pickle_file(tr_data_path)
    te_data= loader.load_pickle_file(te_data_path)

    # randomly generate an ECOC codebook of 10 functions
    num_ecoc = 10
    class_num = 10
    best_ecoc = util.get_ecoc(ecoc_path, num_ecoc, class_num)

    # train 10 svm
    print('Begin training...')
    svms = []  # list of svm classifiers
    function_tr_err = []
    sst = time.time()
    for ind, c_ecoc in enumerate(best_ecoc[1]):
        st = time.time()
        # prepare label
        c_label = [-1 if c_ecoc[l] == 0 else 1 for l in tr_data[1]]
        clf = svm.SVM(C=c, tol=tol, epsilon=epsilon, kernel=kernel)
        clf.fit(tr_data[0], c_label)
        tr_pred = clf.predict(tr_data)
        tr_acc = (c_label == tr_pred).sum() / tr_data[0].shape[0]
        print('{} Function {} done. Final results. Train acc: {}'.format(time.time() - st, ind, tr_acc))
        svms.append(clf)

    print('{} Training finished.'.format(time.time() - sst))
    loader.save(model_path, svms)
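Prediction with the trained ECOC ensemble is not shown in this example. A minimal decoding sketch, assuming each classifier in svms outputs -1/+1 bits and best_ecoc[2] holds one code word per class; ecoc_predict is a hypothetical name:

# Hypothetical decoding sketch: each SVM votes one bit, and the predicted class is
# the one whose ECOC code word is closest (in Hamming distance) to the vote vector.
def ecoc_predict(samples, svms, class_codes):
    bit_preds = [[0 if p == -1 else 1 for p in clf.predict(samples)] for clf in svms]
    predictions = []
    for j in range(len(samples)):
        word = [bits[j] for bits in bit_preds]
        dists = [sum(1 for a, b in zip(word, code) if a != b) for code in class_codes]
        predictions.append(dists.index(min(dists)))
    return predictions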
Example #53
0
def main():
    # training parameter
    result_path = 'results/PB1_B_digits.acc'
    model_name = 'digits_'
    threshes_path = 'data/spambase.threshes'
    tr_data_path = 'data\\digits\\tr_f_l_10r.pickle'
    te_data_path = 'data\\digits\\te_f_l_10r.pickle'
    # load and preprocess training data
    tr_data = loader.load_pickle_file(tr_data_path)
    te_data = loader.load_pickle_file(te_data_path)

    # transpose label
    tr_data[1] = np.transpose(tr_data[1])[0]
    te_data[1] = np.transpose(te_data[1])[0]

    # start training
    # kernel = 'poly'
    kernel = 'linear'
    tol = 0.01
    c = 0.01

    st = time.time()

    # start training
    print('{} Start training. Kernel: {}'.format(time.time() - st, kernel))
    # clf = svm.SVC(kernel='poly')
    clf = svm.SVC(C=c, kernel=kernel, tol=tol)
    # clf = svm.NuSVC(kernel=kernel)
    clf.fit(tr_data[0], tr_data[1])
    tr_pred = clf.predict(tr_data[0])
    te_pred = clf.predict(te_data[0])

    tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
    te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]

    print('{} Final results. Train acc: {}, Test acc: {}'.format(time.time() - st, tr_acc, te_acc))
Example #54
0
def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], '', ['max_words_abstract=', 'max_words_title=', 'max_words_mesh=',
                                                      'path=', 'w2v_path=', 'w2v_length=', 'filter_small_data='])

    except getopt.GetoptError as error:
        print(error)
        sys.exit(2)

    max_words = {'text': 270, 'mesh': 50, 'title': 17}
    path = 'Data/'
    w2v_path = '/Users/ericrincon/PycharmProjects/Deep-PICO/wikipedia-pubmed-and-PMC-w2v.bin'
    word_vector_size = 200
    filter_small_data = False

    for opt, arg in opts:
        if opt == '--max_words_abstract':
            max_words['text'] = int(arg)
        elif opt == '--max_words_title':
            max_words['title'] = int(arg)
        elif opt == '--max_words_mesh':
            max_words['mesh'] = int(arg)
        elif opt == '--path':
            path = arg
        elif opt == '--w2v_path':
            w2v_path = arg
        elif opt == '--w2v_length':
            word_vector_size = int(arg)
        elif opt == '--filter_small_data':
            filter_small_data = int(arg) == 1


    print('Loading word2vec...')
    w2v = Word2Vec.load_word2vec_format(w2v_path, binary=True)
    print('Loaded word2vec...')

    X_list, y_list, data_names = DataLoader.get_data_separately(max_words, word_vector_size, w2v, use_abstract_cnn=True,
                                                        preprocess_text=False, filter_small_data=filter_small_data)

    for X, y, name in zip(X_list, y_list, data_names):
        X_abstract, X_title, X_mesh = X

        with h5py.File("DataProcessed/" + name + ".hdf5", "w") as f:
            f.create_dataset('X_abstract', data=X_abstract, shape=X_abstract.shape)
            f.create_dataset('X_title', data=X_title, shape=X_title.shape)
            f.create_dataset('X_mesh', data=X_mesh, shape=X_mesh.shape)
            f.create_dataset('y', data=y, shape=y.shape)
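A short read-back sketch for the HDF5 files written above, assuming the same dataset names; the file path here is a placeholder:

# Hypothetical read-back sketch for the files produced above; the path is a placeholder.
import h5py

with h5py.File('DataProcessed/some_dataset.hdf5', 'r') as f:
    X_abstract = f['X_abstract'][:]
    X_title = f['X_title'][:]
    X_mesh = f['X_mesh'][:]
    y = f['y'][:]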
Example #55
0
def predict_main(classifier_pickle):
    print("This functions is being called")
    datasets = DataLoader.load_kaggle_mnist("mnist_train.csv")
    test_set_x, test_set_y = datasets[2]
    test_set_x = test_set_x.get_value()
    test_set_y = test_set_y.eval()

    predictor = MLutil.Predictor(classifier_pickle, 'DNN')
    predicted_values = predictor.make_prediction(test_set_x)

    predAnalysis = MLutil.PredictionAccuracies(predicted_values, test_set_y)
    print(predAnalysis.get_misclass_rate())
    print(predAnalysis.get_indicies_misclassifications())

    pickle.dump(predAnalysis.get_indicies_misclassifications(), open("neural_indicies.p", "wb"))
    return predAnalysis.get_indicies_misclassifications()
Example #56
0
File: svm.py Project: hariravi/KaggleMLYH
def svm_main(dataset, pickle_model):
    data = DataLoader.load_kaggle_mnist(dataset, neural=False)
    classifier = SVM()
    start = time.time()
    print("Fitting the svm")
    X = numpy.array(data[0][0])
    X = X/255.0*2 - 1
    print(X)
    Y = numpy.array(data[0][1])
    print(len(X))
    print(len(Y))
    del data
    classifier.fit_multi(X, Y)
    fin = time.time() - start
    print("Awesome, the SVM has been fit, only took {0} seconds".format(fin))
    pickle.dump(classifier, open(pickle_model, "wb"))
Example #57
0
    def train(self, X, y, n_epochs, optim_algo='adam', criterion='categorical_crossentropy', save_model=True,
              verbose=2, plot=True, batch_size=64, fold_idxs=None):

        if optim_algo == 'adam':
            optim_algo = Adam()
        elif optim_algo == 'sgd':
            optim_algo = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
        elif optim_algo == 'adagrad':
            optim_algo = Adagrad()

        self.model.compile(optimizer=optim_algo, loss=criterion)

        if fold_idxs is not None:
            batch_generator = StanderedBG(X, y, batch_size=batch_size, fold_indices=fold_idxs)
        else:
            batch_generator = StanderedBG(X, y, batch_size=batch_size)


        loss_train_history = []
        loss_val_history = []
        batch_history = {'accuracy': []}

        for epoch in range(1, n_epochs + 1):
            batch_accuracy_history = []

            for X, y in batch_generator.next_batch():
                history = self.model.fit(X, y, nb_epoch=1, batch_size=batch_size,
                                         validation_split=0.2, verbose=0)

                val_loss, loss = history.history['val_loss'][0], history.history['loss'][0]

                loss_train_history.append(loss)
                loss_val_history.append(val_loss)

                truth = self.model.validation_data[1]
                truth = dl.onehot2list(truth)
                batch_prediction = self.predict_classes(self.model.validation_data[0])
                accuracy = metrics.accuracy_score(truth, batch_prediction)
                batch_accuracy_history.append(accuracy)

            batch_history['accuracy'].append(batch_accuracy_history)

            print('Epoch: {} | Train loss: {} | Valid loss: {}'.format(epoch, loss, val_loss))
            print("Epoch Metrics | Accuracy: {}".format(np.mean(batch_history['accuracy'][epoch-1])))

        if save_model:
            self.model.save_weights(self.model_name + '.h5', overwrite=True)
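dl.onehot2list is not shown in this snippet. A minimal sketch, assuming it converts a one-hot label matrix back to a flat list of class indices:

# Hypothetical helper sketch: map each one-hot encoded row back to its class index.
import numpy as np

def onehot2list(onehot_labels):
    return np.argmax(onehot_labels, axis=1).tolist()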
Example #58
0
File: PB5_RELIEF.py Project: Juncai/CS6140
def main():
    # training parameter
    is_sklearn = True
    k = 10  # fold
    result_path = 'results/PB2_spam.acc'
    model_name = 'spam_' + str(k) + 'fold'
    data_path = 'data/spam/data.pickle'

    # load and preprocess training data
    training_data = loader.load_pickle_file(data_path)
    # TODO convert labels from {0, 1} to {-1, 1}
    # util.replace_zero_label_with_neg_one(training_data)

    # Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, training_data[0])
    # training_data[0] = preprocessing.scale(training_data[0])


    # start training
    training_errs = []
    testing_errs = []
    print('Preparing k fold data.')
    k_folds = Preprocess.prepare_k_folds(training_data, k)

    for i in (0,):
        st = time.time()
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)

        # start training
        print('{:.2f} Start training.'.format(time.time() - st))
        kernel = c.EUCLIDEAN
        # kernel = c.GAUSSIAN
        f_select = True
        best_features_num = 5
        clf = kNN.kNN(kernel=kernel)
        clf.fit(tr_data[0], tr_data[1], f_select=f_select, best_f=best_features_num)
        print("Best features: {}".format(clf.best_f_indices))
        for kk in (1, 2, 3, 7):
            tr_pred = clf.predict(tr_data[0], k=kk)
            te_pred = clf.predict(te_data[0], k=kk)

            tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
            te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]

            print('{} Final results with kernel {}, k={}. Train acc: {}, Test acc: {}'.format(time.time() - st, kernel, kk, tr_acc, te_acc))
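The RELIEF-style feature selection happens inside kNN.fit and its implementation is not shown here. A minimal sketch of the classic RELIEF weight update, purely illustrative and not the project's actual code:

# Hypothetical RELIEF sketch: reward features that separate a sample from its
# nearest miss (other class) and penalize features that differ from its nearest hit.
import numpy as np

def relief_weights(X, y, n_iters=100, seed=0):
    rng = np.random.RandomState(seed)
    X, y = np.asarray(X, dtype=float), np.asarray(y)
    n_samples, n_features = X.shape
    w = np.zeros(n_features)
    for _ in range(n_iters):
        i = rng.randint(n_samples)
        dists = np.abs(X - X[i]).sum(axis=1)      # L1 distance to every sample
        same = (y == y[i])
        same[i] = False                           # exclude the sample itself
        dists_self = dists.copy()
        dists_self[i] = np.inf
        hit = np.argmin(np.where(same, dists_self, np.inf))
        miss = np.argmin(np.where(~same, dists_self, np.inf))
        w += np.abs(X[i] - X[miss]) - np.abs(X[i] - X[hit])
    return w  # the highest-weighted features would be the selected subset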
Example #59
0
File: PB2_A_spam.py Project: Juncai/CS6140
def main():
    # training parameter
    k = 8  # fold
    result_path = 'results/PB2_spam.acc'
    model_name = 'spam_' + str(k) + 'fold'
    data_path = 'data/spam/data.pickle'

    # load and preprocess training data
    training_data = loader.load_pickle_file(data_path)
    # TODO convert labels from {0, 1} to {-1, 1}
    # util.replace_zero_label_with_neg_one(training_data)

    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, training_data[0])
    # Preprocess.normalize_features_all(Preprocess.shift_and_scale, training_data[0])


    # start training
    training_accs = []
    testing_accs = []
    print('Preparing k fold data.')
    k_folds = Preprocess.prepare_k_folds(training_data, k)
    kernel = c.EUCLIDEAN
    sst = time.time()
    for i in (1,):
        st = time.time()
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)

        # start training
        print('{:.2f} Start training.'.format(time.time() - st))
        for r in (2.5, 2.7):
            clf = kNN.kNN(kernel=kernel)
            # clf.fit(training_data[0], training_data[1])
            clf.fit(tr_data[0], tr_data[1])
            # tr_pred = clf.predict(training_data[0], r=r)
            tr_pred = clf.predict(tr_data[0], r=r)
            te_pred = clf.predict(te_data[0], r=r)

            # tr_acc = (training_data[1] == tr_pred).sum() / training_data[0].shape[0]
            tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
            te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]

            testing_accs.append(te_acc)
            print('{} {}-fold results with kernel {}, r={}. Train acc: {}, Test acc: {}'.format(time.time() - st, i, kernel, r, tr_acc, te_acc))
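The r argument switches the classifier to window (radius) based prediction instead of a fixed k. A minimal sketch of that idea, hypothetical and not the project's kNN.predict: vote over all training points within Euclidean distance r, falling back to the single nearest neighbour when the window is empty.

# Hypothetical radius-based nearest-neighbour sketch (Euclidean window of size r).
import numpy as np

def predict_within_radius(X_train, y_train, X_test, r):
    X_train, y_train = np.asarray(X_train), np.asarray(y_train)
    preds = []
    for x in np.asarray(X_test):
        dists = np.sqrt(((X_train - x) ** 2).sum(axis=1))
        votes = y_train[dists <= r]
        if votes.size == 0:
            votes = y_train[[np.argmin(dists)]]   # empty window: fall back to 1-NN
        labels, counts = np.unique(votes, return_counts=True)
        preds.append(labels[np.argmax(counts)])
    return np.array(preds)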
Example #60
0
def main():
    # training parameter
    k = 10  # fold
    result_path = "results/PB1_A_spam.acc"
    model_name = "spam_" + str(k) + "fold"
    threshes_path = "data/spambase.threshes"
    data_path = "data/spam/data.pickle"
    # kernel = 'poly'
    kernel = "linear"
    # kernel = 'rbf'
    verbose = False
    tol = 0.01
    c = 0.1

    # load and preprocess training data
    training_data = loader.load_pickle_file(data_path)
    # TODO convert labels from {0, 1} to {-1, 1}
    util.replace_zero_label_with_neg_one(training_data)

    # normalize
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, training_data[0])

    print("Preparing k fold data.")
    k_folds = Preprocess.prepare_k_folds(training_data, k)

    for i in range(1):
        st = time.time()
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)

        # start training
        print("{:3f} Start training. Kernel: {}".format(time.time() - st, kernel))

        clf = svm.SVC(C=c, kernel=kernel, tol=tol, verbose=verbose)
        # clf = svm.NuSVC(kernel=kernel, tol=tol, verbose=verbose)
        clf.fit(tr_data[0], tr_data[1])
        tr_pred = clf.predict(tr_data[0])
        te_pred = clf.predict(te_data[0])

        tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
        te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]

        print("{:3f} Final results. Train acc: {}, Test acc: {}".format(time.time() - st, tr_acc, te_acc))