def setUp(self):
     num_cols = ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
     cate_cols = ['workclass', 'education', 'marital_status', 'relationship']
     label_col = 'income_bracket'
     train_arr, train_label_arr = util.load_data(PreprocessingTest.data_file, num_cols, cate_cols, label_col=label_col)
     Preprocessing.X_train, Preprocessing.X_test, Preprocessing.y_train, Preprocessing.y_test = train_test_split(train_arr, train_label_arr, train_size=0.7)
Example #2
def eval_svm():
    data, rawData = load_data()

    # extract the first four feature columns
    col0Data, col0Name = extract_column(rawData, 0)
    col1Data, col1Name = extract_column(rawData, 1)
    col2Data, col2Name = extract_column(rawData, 2)
    col3Data, col3Name = extract_column(rawData, 3)

    # split dataset
    # X, y = col1Data, data[:, -1]

    X = np.column_stack((col0Data, col1Data, col2Data, col3Data))
    y = data[:, -1]

    trainX, testX, trainy, testy = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    shuffle=True,
                                                    random_state=1)

    svclassifier = SVC(kernel='rbf', C=2)
    svclassifier.fit(trainX, trainy)

    y_pred = svclassifier.predict(testX)

    print(confusion_matrix(testy, y_pred))
    print(classification_report(testy, y_pred))
Example #3
def wavelet_trans():
    data, rawData = load_data()

    col0Data, col0Name = extract_column(rawData, 0)
    col1Data, col1Name = extract_column(rawData, 1)
    col2Data, col2Name = extract_column(rawData, 2)
    col3Data, col3Name = extract_column(rawData, 3)
    col4Data, col4Name = extract_column(rawData, 4)
    col5Data, col5Name = extract_column(rawData, 5)
    col6Data, col6Name = extract_column(rawData, 6)
    col7Data, col7Name = extract_column(rawData, 7)
    col8Data, col8Name = extract_column(rawData, 8)
    col9Data, col9Name = extract_column(rawData, 9)
    col10Data, col10Name = extract_column(rawData, 10)
    col11Data, col11Name = extract_column(rawData, 11)
    col12Data, col12Name = extract_column(rawData, 12)
    col13Data, col13Name = extract_column(rawData, 13)

    X = np.column_stack((col0Data, col1Data, col2Data, col3Data, col4Data,
                         col5Data, col6Data, col7Data, col8Data, col9Data,
                         col10Data, col11Data, col12Data, col13Data))
    y = data[:, -1]

    X_window = np.reshape(X[:-4], (1872, 8, X.shape[1]))

    # random_state only has an effect when shuffle=True, so it is omitted here
    kfold = KFold(n_splits=5, shuffle=False)

    result = np.array([])
    for train_index, test_index in kfold.split(X_window):
        print('TRAIN: ' + str(train_index) + ' TEST: ' + str(test_index))
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        X_train_ucihar, Y_train_ucihar = get_transformed_features(
            X_train, y_train, 'rbio3.1')
        X_test_ucihar, Y_test_ucihar = get_transformed_features(
            X_test, y_test, 'rbio3.1')

        cls = GradientBoostingClassifier(n_estimators=500,
                                         learning_rate=0.125,
                                         min_samples_split=1000,
                                         min_samples_leaf=1,
                                         max_depth=5)
        cls.fit(X_train_ucihar, Y_train_ucihar)

        train_score = cls.score(X_train_ucihar, Y_train_ucihar)
        test_score = cls.score(X_test_ucihar, Y_test_ucihar)
        print("Train score for the ECG dataset: {}".format(train_score))
        print("Test score for the ECG dataset: {}".format(test_score))
        result = np.append(result, test_score)

    print("Overall results")
    print("Mean:" + str(np.mean(result)))
    print("Median:" + str(np.median(result)))
    print("Min:" + str(np.min(result)) + " , max:" + str(np.max(result)))
Example #4
def create_calls_df(test=False):
    """
    Creates a dataframe containing all the information from the calls input file required for the
    final report, ordered by call date.

    :param test: Whether to run with test data

    :return: Pandas dataframe containing id, short date, number, operator-prefix and processed risk score
    """
    calls_df = pd.DataFrame([
        Call(**call_dict).get_report_dict()
        for call_dict in load_data('calls.json', test=test)
    ])
    return calls_df.sort_values(by='date')
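
A minimal usage sketch for the function above, assuming create_calls_df and its calls.json test fixture are importable from this project (illustrative, not part of the original source):

# hypothetical usage: build the report dataframe from the test fixture and preview it
calls_df = create_calls_df(test=True)
print(calls_df.head())  # rows arrive already sorted by call date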
Example #5
def main():
    from sys import argv
    argv = argv[1:]
    key = argv[0]
    #print(argv)
    assert key[0] == "-"
    if key == "-r":
        print("running regularization experiment")
        # regularization experiment
        mus_x_train, rec_x_train, core_train_features, y_train = util.load_data()
        run_reg_experiment(mus_x_train, rec_x_train, core_train_features, y_train)
    elif key == "-l":
        print("running linear model experiment")
        # linear model experiment
        n_features = int(argv[1])
        mus_x_train, rec_x_train, core_train_features, y_train = util.load_data(core_input_shape=n_features)
        run_linear(core_train_features, y_train, input_shape=n_features)
    elif key == "-n":
        print("running neural network experiment")
        # neural network
        mus_x_train, rec_x_train, core_train_features, y_train = util.load_data()
        run(mus_x_train, rec_x_train, core_train_features, y_train)
    else:
        print("Error, unrecognized case")
Example #6
 def test_load_data(self):
     num_cols = [
         'age', 'education_num', 'capital_gain', 'capital_loss',
         'hours_per_week'
     ]
     cate_cols = [
         'workclass', 'education', 'marital_status', 'relationship'
     ]
     label_col = 'income_bracket'
     data_arr, labels_arr = util.load_data(UtilTest.data_file,
                                           num_cols,
                                           cate_cols,
                                           label_col=label_col)
     print("test load_data =========================")
     print('data_arr => %s' % str(data_arr.shape))
     print('labels_arr => %s' % str(labels_arr.shape))
Example #7
def create_operator_lookup(test=False):
    """
    Generates a lookup for operators based on prefix by reading from the supplied operators file.

    :param test: Whether to run with test data

    :return: A dictionary of operator code to operator
    """
    operator_lookup = {'Unknown': 'Unknown'}
    operators = [
        Operator(**op_dict)
        for op_dict in load_data('operators.json', test=test)
    ]
    for op in operators:
        operator_lookup[op.attributes.prefix] = op.attributes.operator
    return operator_lookup
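
A minimal usage sketch for the lookup above, assuming the operators.json test fixture is available (the '44' prefix below is made up for illustration):

# hypothetical usage: resolve a call's prefix to an operator name, with the
# seeded 'Unknown' entry as the fallback
operator_lookup = create_operator_lookup(test=True)
print(operator_lookup.get('44', 'Unknown'))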
Example #8
def run_dfs(start_city, finish_city, n_paths):
    network, index_to_name = load_data()
    start = next(key for key, value in index_to_name.items()
                 if value == start_city)
    finish = next(key for key, value in index_to_name.items()
                  if value == finish_city)
    best_distances, best_paths = network.dfs_solve(start,
                                                   finish,
                                                   n_routes=n_paths)

    for i, path in enumerate(best_paths):
        print(f"{i + 1}. PATH DFS: ")
        for city in path:
            print("\t" + str(index_to_name[city]))
        print(f"\t\tLength = {best_distances[i]}")
    return best_distances
Example #9
def test_kmeans():
    plt.figure('iris clustering')
    X = load_data('data/iris.data')
    iris = PCA(n_components=2).fit_transform(X)

    y_pred = kmeans(X, 3)
    acc = accuracy(y_pred)
    plt.subplot(2, 1, 1)
    plt.scatter(iris[:, 0], iris[:, 1], c=y_pred)
    plt.title('My Kmeans:' + str(acc)[:5])

    y_pred = KMeans(n_clusters=3).fit_predict(X)
    acc = accuracy(y_pred)
    plt.subplot(2, 1, 2)
    plt.scatter(iris[:, 0], iris[:, 1], c=y_pred)
    plt.title('Sklearn Kmeans:' + str(acc)[:5])

    plt.tight_layout()
    plt.show()
Example #10
def speed_test():
    arguments = parse_args()
    pairs = []
    problem, index_to_name = load_data()
    for i in range(100):
        a = random.randint(0, len(index_to_name) - 1)
        b = random.randint(0, len(index_to_name) - 1)
        while b == a:
            b = random.randint(0, len(index_to_name) - 1)
        pairs.append((a, b))
    bench_results_file = open("results.txt", 'w')
    for ants in range(1, 30):
        for iters in range(1, 30):
            dfs_faster = 0
            aco_faster = 0
            aco_worse_best = 0
            aco_worse_nth = 0
            for pair in pairs:
                arguments['starting'] = index_to_name[pair[0]]
                arguments['finishing'] = index_to_name[pair[1]]
                arguments['ants'] = ants
                arguments['iterations'] = iters
                a = datetime.datetime.now()
                dfs_dis = run_dfs(arguments['starting'], arguments['finishing'], arguments['npaths'])
                b = datetime.datetime.now()
                delta = b - a
                time, aco_dist = run_aco(arguments)
                if not aco_dist or int(aco_dist[0] - dfs_dis[0]) > 0:
                    aco_worse_best += 1
                if not aco_dist or int(aco_dist[-1] - dfs_dis[len(aco_dist) - 1]) > 0:
                    aco_worse_nth += 1
                if int(delta.total_seconds() * 1000) > int(time):
                    aco_faster += 1
                else:
                    dfs_faster += 1

            bench_results_file.write(f"ants: {ants} iters: {iters} \n")
            bench_results_file.write(f" acobestworse: {aco_worse_best}")
            bench_results_file.write(f" aconthworse: {aco_worse_nth}")
            bench_results_file.write(f" acofaster: {aco_faster}")
            bench_results_file.write(f" dfsfaster: {dfs_faster} \n")
    bench_results_file.close()
Example #11
def test_gaussian():
    plt.figure('iris clustering')
    X = load_data('data/iris.data')
    iris = PCA(n_components=2).fit_transform(X)

    acc1 = []
    acc2 = []
    sig2 = 0.5
    for i in range(39):
        A = affinity_matrix(X, 'gaussian', 1, sig2)
        L = laplacian(A, std=True)
        spectral_ft = spectral_data(L, 3)
        #y_pred = kmeans(spectral_ft, 3)
        y_pred = KMeans(n_clusters=3).fit_predict(spectral_ft)
        acc = accuracy(y_pred)
        acc1.append(acc)

        L = laplacian(A, std=False)
        spectral_ft = spectral_data(L, 3)
        #y_pred = kmeans(spectral_ft, 3)
        y_pred = KMeans(n_clusters=3).fit_predict(spectral_ft)
        acc = accuracy(y_pred)
        acc2.append(acc)

        sig2 += 0.5

    plt.subplot(2, 1, 1)
    plt.plot([x / 10 for x in range(5, 200, 5)], acc1)
    plt.xlabel('sqrt(sig)')
    plt.ylabel('accuracy rate')
    plt.title('Gaussian with std Laplacian')

    plt.subplot(2, 1, 2)
    plt.plot([x / 10 for x in range(5, 200, 5)], acc2)
    plt.xlabel('sqrt(sig)')
    plt.ylabel('accuracy rate')
    plt.title('Gaussian with Laplacian')
    plt.tight_layout()
    plt.show()
Example #12
def run_aco(arguments: vars):
    problem, index_to_name = load_data()
    optimizer = AntColonyOptimizer(n_ants=arguments['ants'],
                                   rho=arguments['rho'],
                                   pheromone_unit=arguments['pheromone'],
                                   elitist_weight=arguments['elitist'],
                                   distance_preference_factor=100)
    start = next(key for key, value in index_to_name.items()
                 if value == arguments['starting'])
    finish = next(key for key, value in index_to_name.items()
                  if value == arguments['finishing'])
    best_distances, best_paths, time = optimizer.fit(
        problem,
        start,
        finish,
        iterations=arguments['iterations'],
        n_paths=arguments['npaths'])

    for i, path in enumerate(best_paths):
        print(f"{i + 1}. PATH ACO: ")
        for city in path:
            print("\t" + str(index_to_name[city]))
        print(f"\t\tLength = {best_distances[i]}")
    return time, best_distances
Example #13
    warnings.filterwarnings("ignore", category=matplotlib.cbook.mplDeprecation)

    # Load real data or make synthetic data (long way so that order is preserved)
    imei = "861508033133471"
    dates = OrderedDict()
    dates["3A"] = ("2017-03-25", "2017-03-25")
    dates["2.5A"] = ("2017-03-26", "2017-03-26")
    dates["2A"] = ("2017-03-27", "2017-03-27")
    dates["1.5A"] = ("2017-03-28", "2017-03-29")
    # dates['1A (0)'] = ('2017-03-30','2017-03-31')
    dates["1A"] = ("2017-03-31", "2017-04-01")
    dates["0.5A"] = ("2017-04-02", "2017-04-04")

    datas = load_data("March17",
                      imei=imei,
                      dates=dates,
                      wt=10,
                      ignore_start=True)
    # imei = 'batt1_run3_C01'
    # imei = 'batt2_run2_C01'
    # datas = load_data('Nov17', imei=imei)

    # Choose least squares solver and model solver
    lsq_solver = ["dfogn", "dfols", "scipy", None][int(
        input("Choose lsq solver (0: dfogn, 1: dfols, 2: scipy.opt, 3: None): "
              ))]
    model_solver = [
        Numerical,
        LeadingOrderQuasiStatic,
        FirstOrderQuasiStatic,
        Composite,
Example #14
    logger.info("{} runs detected".format(len(runs)))

    for combination in runs:

        config = DefaultConfig()
        config.dataset = args.dataset
        config.model_name = ''.join(
            random.choice(string.ascii_uppercase + string.digits)
            for _ in range(12)) + '.model'
        for attr, value in combination:
            setattr(config, attr, value)

        if config.dataset == 'softmax':
            data = util.load_data('',
                                  time_step,
                                  config.time_batch_len,
                                  config.max_time_batches,
                                  nottingham=pickle)
            config.input_dim = data["input_dim"]
        else:
            raise Exception("Other datasets not yet implemented")

        logger.info(config)
        config_file_path = os.path.join(run_folder,
                                        get_config_name(config) + '.config')
        with open(config_file_path, 'wb') as f:
            cPickle.dump(config, f)

        with tf.Graph().as_default(), tf.Session() as session:
            with tf.variable_scope("model", reuse=None):
                train_model = model_class(config, training=True)
Example #15
 def setUpClass(cls):
     test_data = load_data('calls.json', test=True)
     cls.calls = [Call(**call_dict) for call_dict in test_data]
Example #16
def benchmark(arguments: vars):
    # pick 20 distinct random city connections
    problem, index_to_name = load_data()
    pairs = set()
    while len(pairs) < 20:
        a = random.randint(0, len(index_to_name) - 1)
        b = random.randint(0, len(index_to_name) - 1)
        while b == a:
            b = random.randint(0, len(index_to_name) - 1)
        pairs.add((a, b))

    best_routes = []
    # run DFS for every pair to get a baseline for comparison
    for a, b in pairs:
        dis = run_dfs(index_to_name[a], index_to_name[b], 5)
        best_routes.append(dis)

    # the best routes found:
    for i, (a, b) in enumerate(pairs):
        print(index_to_name[a] + " -> " + index_to_name[b] + " dlugosc: " + str(best_routes[i]))

    bench_rho_value = np.arange(0.05, 0.20, 0.02)
    bench_pherom_count = np.arange(50, 300, 50)
    bench_elitist = np.arange(1, 5, 0.3)

    print("TAKIE SA MIASTA:\n")
    for a, b in pairs:
        print(str(index_to_name[a]) + " -> " + str(index_to_name[b]))

    bench_results_file = open("results.txt", 'a')
    bench_results_file.write("Mrowki\tIteracje\tRHO\tFeromony\tpolepszanie najlepszej\troznica_srednia\n")
    bench_results_file.close()
    curr_best = float("inf")
    best_settings = ""
    for rho_v in bench_rho_value:  # 8 rho values
        settings = ""
        for pherom_c in bench_pherom_count:  # 5 pheromone counts
            for elits in bench_elitist:  # 14 elitist weights
                arguments['rho'] = rho_v
                arguments['pheromone'] = pherom_c
                arguments['elitist'] = elits
                differences = dict()
                for i, (a, b) in enumerate(pairs):  # 20
                    tmp = 0.0
                    for repeat in range(10):  # 10 repeats
                        time, dis = run_aco(arguments)
                        tmp += dis[min(4, len(dis) - 1)] - best_routes[i][min(4, len(dis) - 1)]
                    differences[i] = tmp / 10  # average difference over the repeats

                average = sum(differences.values()) / len(differences.values())
                # print(str(average))
                settings = f'15\t15\t{rho_v}\t{pherom_c}\t{elits}\t{average}\n'
                if average < curr_best:
                    curr_best = average
                    best_settings = settings
                bench_results_file = open("results.txt", 'a')
                bench_results_file.write(settings)
                bench_results_file.close()
    print(f'\n\nBest settings = {best_settings}')
Example #17
# this is mainly to reproduce the findings of the original authors (Rösler et al.) of the dataset
# as well as the findings of: https://machinelearningmastery.com/how-to-predict-whether-eyes-are-open-or-closed-using-brain-waves/
# who showed that this result is invalid due to the test methodology (see the chronological split sketched below)

import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import autocorrelation_plot

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

from src.util import extract_column, load_data

data, rawData = load_data()

col1Data, col1Name = extract_column(rawData, 2)
col2Data, col2Name = extract_column(rawData, 3)

# plot autocorrelation
autocorrelation_plot(col1Data)

# split dataset
# X, y = col1Data, data[:, -1]
# X = np.column_stack((col1Data,col2Data))
X, y = data[:, :-1], data[:, -1]
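# Sketch of an alternative split (not in the original script): keeping the samples
# in time order avoids the temporal leakage that, per the comment at the top of
# this file, makes the shuffled split below overly optimistic.
# split = int(len(X) * 0.9)
# trainX, testX = X[:split], X[split:]
# trainy, testy = y[:split], y[split:]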
trainX, testX, trainy, testy = train_test_split(X,
                                                y,
                                                test_size=0.1,
                                                shuffle=True,
Example #18
            for i, c in enumerate(self.clustering) if relevance[i]
        ])
        self.irrelevant.extend([
            ClusterTreeIrrNode(c, rel=False, k=self.k)
            for i, c in enumerate(self.clustering) if not relevance[i]
        ])


class ClusterTreeFNNode(ClusterTreeNode):
    def __init__(self, data, rel=True, k=4):
        super(ClusterTreeFNNode, self).__init__(data, rel, k)

    def sample(self):  # sample around centroid and expand the boundaries
        pass


def query(data):
    return ((data[:, 0] > 39) & (data[:, 0] < 77) &
            (data[:, 1] > 25) & (data[:, 1] < 56))


if __name__ == '__main__':
    data_path = "../data/sdss_100k.csv.gz"
    columns = ['rowc', 'colc', 'ra', 'field', 'fieldid', 'dec']
    data = np.array(load_data(data_path, columns))
    data = (data - data.min(axis=0)) / (data.max(axis=0) -
                                        data.min(axis=0)) * 100

    ground_truth = data[query(data)]
    root = ClusterTreeNode(data, k=4)
Example #19
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from src import util
from src import eval

# ts, data = util.load_data("../data/NSW2013.csv", columnName="TOTALDEMAND")
# ts, data = util.load_data("../data/bike_hour.csv", columnName="cnt")
# ts, data = util.load_data("../data/TAS2016.csv", columnName="TOTALDEMAND")
# ts, data = util.load_data("../data/traffic_data_in_bits.csv", columnName="value")
# ts, data = util.load_data("../data/beijing_pm25.csv", columnName="pm2.5")
ts, data = util.load_data("../data/pollution.csv", columnName="Ozone")

train, test = util.divideTrainTest(data)
print("train shape is", train.shape)
print("test shape is", test.shape)

flag = False
lag = 48
h_test = 6
trainX, trainY = util.create_multi_ahead_samples(train, lag, h_test, RNN=flag)
testX, testY = util.create_multi_ahead_samples(test, lag, h_test, RNN=flag)
print("testX shape:", testX.shape)
print("testy shape:", testY.shape)
print("trainX shape:", trainX.shape)
print("trainy shape:", trainY.shape)

ground_truth = []
prediction = []
for i in range(len(testX)):
Example #20
 def setUpClass(cls):
     test_data = load_data('operators.json', test=True)
     cls.operators = [Operator(**op_dict) for op_dict in test_data]
Example #21
                                                mlp_dropout
                                                )

model_dir = os.path.join(arg.model_dir, dt_str)
if not os.path.isdir(model_dir):
    os.mkdir(model_dir)
model_name = os.path.join(model_dir, exp_stamp + ".model.json")
model_weights_name = os.path.join(model_dir, exp_stamp + ".weight.h5")
model_metrics_name = os.path.join(model_dir, exp_stamp + ".metrics.json")

tf_log_dir = os.path.join(tensorboard_log_dir, exp_stamp)
if not os.path.isdir(tf_log_dir):
    os.mkdir(tf_log_dir)

# =====data preprocess=====
X_train, y_train, X_dev, y_dev, X_test, y_test, tokenizer = load_data(train_sampling=arg.train_sampling)

# =====prepare embedding matrix=====
word_index = tokenizer.word_index
num_words = len(word_index)

embeddings_index = load_embedding_index(arg.embedding_dir, arg.embedding_file)

embedding_matrix = np.zeros((len(word_index) + 1, 200))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    # words not found in the embedding index are left as all-zeros
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Example #22
def parameter_fit():
    imei = "861508033133471"
    dates = OrderedDict()
    dates["3A"] = ("2017-03-25", "2017-03-25")
    dates["2.5A"] = ("2017-03-26", "2017-03-26")
    dates["2A"] = ("2017-03-27", "2017-03-27")
    dates["1.5A"] = ("2017-03-28", "2017-03-29")
    # dates['1A (0)'] = ('2017-03-30','2017-03-31')
    dates["1A"] = ("2017-03-31", "2017-04-01")
    dates["0.5A"] = ("2017-04-02", "2017-04-04")

    datas = load_data("March17",
                      imei=imei,
                      dates=dates,
                      wt=10,
                      ignore_start=True)

    # Load existing fits or initiate
    try:
        fits = np.load("out/data/fits/dict_all.npy", allow_pickle=True)[np.newaxis][0]
    except FileNotFoundError:
        fits = defaultdict(dict)
    # Choose least squares solver and model solver
    for lsq_solver in ["scipy", "dfogn"]:
        for model_solver in [
                LeadingOrderQuasiStatic,
                FirstOrderQuasiStatic,
                Composite,
                Numerical,
        ]:
            print("-" * 60)
            # Define the starting point
            x0 = np.concatenate(
                [np.array([0.6, 0.9, 0.08, 1 / 6]),
                 np.ones(len(datas) - 1)])

            fits[lsq_solver][model_solver] = do_parameter_fit(
                x0, datas, lsq_solver, model_solver)  # (x, f, soln_time)
            # all fits are stored in one dict file, so no per-solver formatting is needed
            np.save("out/data/fits/dict_all.npy", fits)

            # fig is saved inside parameter_fitting.py

    # Fill tables
    for lsq_solver in ["scipy", "dfogn"]:
        with open("out/tables/fits/{}_performance.txt".format(lsq_solver),
                  "w") as perf_table_row:
            for model_solver in [
                    LeadingOrderQuasiStatic,
                    FirstOrderQuasiStatic,
                    Composite,
                    Numerical,
            ]:
                # Make entries for performance table
                perf_table_row.write("& {1:.2f} & {2:.0f} ".format(
                    *fits[lsq_solver][model_solver]))  # cost and time taken
                # Make entries for parameters table
                with open(
                        "out/tables/fits/{}_{}_params.txt".format(
                            lsq_solver, model_solver.__name__),
                        "w",
                ) as par_table_row:
                    for par in fits[lsq_solver][model_solver][0]:
                        par_table_row.write("& {:.2f}".format(par))
Example #23
import pickle
import sys
import numpy as np
import pandas as pd
from src.util import load_data, impute_missing_values, encode_features

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

if __name__ == "__main__":
    print("test starting...")
    test = load_data('../input/carInsurance_test.csv')

    # check for null values
    print(test.apply(lambda x: sum(x.isnull()), axis=0))

    test = impute_missing_values(test)
    print(test.apply(lambda x: sum(x.isnull()), axis=0))

    test = encode_features(test)

    result_df: pd.DataFrame = test[['Id']]
    test.drop(['Id', 'CarInsurance'], axis=1, inplace=True)

    model = pickle.load(open('model.pkl', 'rb'))
    predictions = model.predict(test)
    result_df['CarInsurance'] = np.array(predictions)
    result_df.to_csv("../test_dataset_prediction_output/result.csv",
                     index=False)
Example #24
from src import util
from src import eval
import numpy as np
from hmmlearn.hmm import GaussianHMM, GMMHMM

if __name__ == '__main__':

    ts, data = util.load_data("../data/NSW2013.csv", columnName="TOTALDEMAND")
    # ts, data = util.load_data("../data/bike_hour.csv", columnName="cnt")
    # ts, data = util.load_data("../data/TAS2016.csv", columnName="TOTALDEMAND")
    # ts, data = util.load_data("../data/traffic_data_in_bits.csv", columnName="value")
    # ts, data = util.load_data("../data/beijing_pm25.csv", columnName="pm2.5")
    # ts, data = util.load_data("../data/pollution.csv", columnName="Ozone")

    train, test = util.divideTrainTest(data)
    print("train shape is", train.shape)
    print("test shape is", test.shape)
    history = [x[0] for x in train]
    predictions = []
    realTestY = []

    for t in range(len(test)):

        model = GaussianHMM(n_components=2)
        model.fit(train)

        output = model.sample(1)

        yhat = output[0][0]

        predictions.append(yhat)
Example #25
def test_methods():
    plt.figure('iris clustering')
    X = load_data('data/iris.data')
    iris = PCA(n_components=2).fit_transform(X)

    A = affinity_matrix(X, 'gaussian', 1, 8)
    L = laplacian(A, std=False)
    spectral_ft = spectral_data(L, 3)
    #y_pred = kmeans(spectral_ft, 3)
    y_pred = KMeans(n_clusters=3).fit_predict(spectral_ft)
    acc = accuracy(y_pred)
    plt.subplot(3, 2, 1)
    plt.scatter(iris[:, 0], iris[:, 1], c=y_pred)
    plt.title('Gaussian with Laplacian: ' + str(acc)[:5])

    A = affinity_matrix(X, 'gaussian', 1, 0.5)
    L = laplacian(A, std=True)
    spectral_ft = spectral_data(L, 3)
    #y_pred = kmeans(spectral_ft, 3)
    y_pred = KMeans(n_clusters=3).fit_predict(spectral_ft)
    acc = accuracy(y_pred)
    plt.subplot(3, 2, 2)
    plt.scatter(iris[:, 0], iris[:, 1], c=y_pred)
    plt.title('Gaussian with std Laplacian: ' + str(acc)[:5])

    A = affinity_matrix(X, 'eculid')
    L = laplacian(A, std=False)
    spectral_ft = spectral_data(L, 3, min=False)
    #y_pred = kmeans(spectral_ft, 3)
    y_pred = KMeans(n_clusters=3).fit_predict(spectral_ft)
    acc = accuracy(y_pred)
    plt.subplot(3, 2, 3)
    plt.scatter(iris[:, 0], iris[:, 1], c=y_pred)
    plt.title('Euclid with Laplacian: ' + str(acc)[:5])

    A = affinity_matrix(X, 'eculid')
    L = laplacian(A, std=True)
    spectral_ft = spectral_data(L, 3, min=False)
    #y_pred = kmeans(spectral_ft, 3)
    y_pred = KMeans(n_clusters=3).fit_predict(spectral_ft)
    acc = accuracy(y_pred)
    plt.subplot(3, 2, 4)
    plt.scatter(iris[:, 0], iris[:, 1], c=y_pred)
    plt.title('Euclid with std Laplacian: ' + str(acc)[:5])

    A = affinity_matrix(X, 'cosine')
    L = laplacian(A, std=False)
    spectral_ft = spectral_data(L, 3)
    #y_pred = kmeans(spectral_ft, 3)
    y_pred = KMeans(n_clusters=3).fit_predict(spectral_ft)
    acc = accuracy(y_pred)
    plt.subplot(3, 2, 5)
    plt.scatter(iris[:, 0], iris[:, 1], c=y_pred)
    plt.title('Cosine with Laplacian: ' + str(acc)[:5])

    A = affinity_matrix(X, 'cosine')
    L = laplacian(A, std=True)
    spectral_ft = spectral_data(L, 3)
    y_pred = KMeans(n_clusters=3).fit_predict(spectral_ft)
    acc = accuracy(y_pred)
    plt.subplot(3, 2, 6)
    plt.scatter(iris[:, 0], iris[:, 1], c=y_pred)
    plt.title('Cosine with std Laplacian: ' + str(acc)[:5])

    plt.tight_layout()
    plt.show()
Example #26
                        action='store_true',
                        help='Store a plot of the resulting prediction')
    parser.add_argument('--to_csv',
                        action='store_true',
                        help='Store a CSV file of the predictions.')
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    set_logging()
    args = parse_args()

    # Load in data
    train_features, train_targets, test_targets = (load_data(
        args.date,
        args.data_dir).pipe(lambda df: split_data(df, args.test_periods)))
    logging.info("Train periods: %s, test periods: %s",
                 train_features.shape[0], test_targets.shape[0])

    # Normalize data
    normalizer = Normalizer()
    train_features = normalizer.fit_transform(train_features)
    train_targets = normalizer.transform(train_targets)
    test_targets = normalizer.transform(test_targets)

    # Format model training input
    columns = train_features.columns.tolist()
    features = dict()
    targets = dict()
    for column in columns:
Example #27
                  ' elbo {:.2f} | {:.0f} sents/sec |'.format(
                      step, data.shape[0] // BATCH_SIZE,
                      np.mean(log['acc'][-PRINT_EVERY:]),
                      np.mean(log['loss'][-PRINT_EVERY:]),
                      np.mean(log['kl'][-PRINT_EVERY:]),
                      np.mean(log['elbo'][-PRINT_EVERY:]),
                      BATCH_SIZE * PRINT_EVERY / timer.elapsed()))
            write_csv(log, 'log/log.csv')


if __name__ == '__main__':
    torch.manual_seed(42)

    # Load data
    data_path = '../data/eq2_grammar_dataset.h5'
    data = load_data(data_path)
    # Turn it into a float32 PyTorch Tensor
    data = torch.from_numpy(data).float()

    # Create model
    model = GrammarVAE(ENCODER_HIDDEN, Z_SIZE, DECODER_HIDDEN, OUTPUT_SIZE,
                       RNN_TYPE)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)

    timer = Timer()
    log = {'loss': [], 'kl': [], 'elbo': [], 'acc': []}
    anneal = AnnealKL(step=1e-3, rate=500)

    try:
        for epoch in range(1, EPOCHS + 1):