Example #1
0
def train_and_get_result(_df, _dft, store_item_nbrs, model, total_features):
    """Fit one regressor per (store, item) pair and predict its test rows.

    For every pair in ``store_item_nbrs`` (pairs with store number 35 are
    skipped) the matching rows of ``_df`` are used for training and the
    matching rows of ``_dft`` for prediction.

    :param _df: training data frame with ``store_nbr`` and ``item_nbr`` columns
    :param _dft: test data frame with ``store_nbr`` and ``item_nbr`` columns
    :param store_item_nbrs: iterable of ``(store_nbr, item_nbr)`` pairs
    :param model: model selector forwarded to ``ut.get_regression_model``
    :param total_features: unused; kept for interface compatibility
    :return: data frame with columns ``date``, ``store_nbr``, ``item_nbr`` and
        ``log1p`` (predictions clipped at 0). Empty, with those columns, when
        no pair was processed.
    """
    df = _df.copy()
    df_t = _dft.copy()
    results = []
    total = 0
    for sno, ino in store_item_nbrs:
        # Store 35 is excluded; the reason is not visible in this function.
        if sno == 35:
            continue
        df1 = df[(df.store_nbr == sno) & (df.item_nbr == ino)]
        X_train, y_train = ut.get_train_data(df1)
        X_train = X_train.drop(['store_nbr', 'item_nbr'], axis=1)
        # Keep only the targets whose rows survived into X_train.
        y_train = y_train[X_train.index.values]

        df2 = df_t[(df_t.store_nbr == sno) & (df_t.item_nbr == ino)]
        X_predict = ut.get_test_data(df2)
        res = pd.DataFrame()
        res['date'] = X_predict['date']
        res['store_nbr'] = X_predict['store_nbr']
        res['item_nbr'] = X_predict['item_nbr']
        X_predict = X_predict.drop(['date', 'store_nbr', 'item_nbr'], axis=1)

        # Same feature list for both frames; look it up once per pair.
        features = ut.get_features()
        X_train = X_train[features]
        X_predict = X_predict[features]
        regr = ut.get_regression_model(model, len(X_train.values))
        regr.fit(X_train.values.astype(float), y_train.values)
        # Negative predictions are clipped to zero.
        res['log1p'] = np.maximum(
            regr.predict(X_predict.values.astype(float)), 0.)
        results.append(res)
        total += 1
        print('done', total)

    if not results:
        # pd.concat raises on an empty list; return an empty result instead.
        return pd.DataFrame(columns=['date', 'store_nbr', 'item_nbr', 'log1p'])
    return pd.concat(results)
Example #2
0
def execute(data, training_data_ratio=2.0 / 3.0, k=1):
    """
    Execute the "Locally-Weighted" Linear Regression (using Closed-Form Linear Regression)
    :param data: Raw Data frame parsed from CSV
    :param training_data_ratio: The percent (0.0 to 1.0) of input data to use in training.
    :param k: Smoothing parameter for local weight computation
    :return: Root mean squared error over the test samples
    :raises ZeroDivisionError: if the test split contains no samples
    """
    # 2. Randomize the data
    randomized_data = util.randomize_data(data)

    # 3. Split into training and test data according to training_data_ratio
    training_data, test_data = util.split_data(randomized_data,
                                               training_data_ratio)
    training_outputs = util.get_output(training_data)

    # 4. Standardize the features using the training data
    standardized_training_data, mean, std = util.standardize_data(
        util.get_features(training_data))

    # Add offset column at the front
    standardized_training_data.insert(0, "Bias", 1)

    # The test features are standardized with the training mean/std.
    std_test_data, _, _ = util.standardize_data(util.get_features(test_data),
                                                mean, std)
    std_test_data.insert(0, "Bias", 1)

    squared_errors = []
    # 5. Then for each testing sample
    for i in range(len(std_test_data)):
        testing_sample = std_test_data.iloc[i]
        # The expected output is the last column of the matching raw test row;
        # .iloc makes the lookup positional regardless of the column labels.
        expected_output = test_data.loc[testing_sample.name].iloc[-1]

        # (a) Compute a local model for this query sample.
        theta_query = compute_theta_query(testing_sample,
                                          standardized_training_data,
                                          training_outputs, k)

        # (b) Evaluate the testing sample using the local model.
        actual_output = np.dot(testing_sample, theta_query)

        # (c) Compute the squared error of the testing sample.
        squared_errors.append(util.compute_se(expected_output, actual_output))

    # 6. Compute the root mean squared error (RMSE)
    mean_squared_error = sum(squared_errors) / len(squared_errors)

    return math.sqrt(mean_squared_error)
Example #3
0
def _evaluate (indata):
	if 'clustering_k' in indata:
		first_arg=indata['clustering_k']
	elif 'clustering_merges' in indata:
		first_arg=indata['clustering_merges']
	else:
		return False

	feats=util.get_features(indata, 'distance_')
	dfun=eval(indata['distance_name'])
	distance=dfun(feats['train'], feats['train'])

	cfun=eval(indata['clustering_name'])
	clustering=cfun(first_arg, distance)
	clustering.train()

	if 'clustering_radi' in indata:
		radi=max(abs(clustering.get_radiuses()-indata['clustering_radi']))
		centers=max(abs(clustering.get_cluster_centers().flatten() - \
			indata['clustering_centers'].flat))
		return util.check_accuracy(indata['clustering_accuracy'],
			radi=radi, centers=centers)
	elif 'clustering_merge_distance' in indata:
		merge_distance=max(abs(clustering.get_merge_distances()- \
			indata['clustering_merge_distance']))
		pairs=max(abs(clustering.get_cluster_pairs()- \
			indata['clustering_pairs']).flat)
		return util.check_accuracy(indata['clustering_accuracy'],
			merge_distance=merge_distance, pairs=pairs)
	else:
		return util.check_accuracy(indata['clustering_accuracy'])
Example #4
0
def train_and_get_test(_df, store_item_nbrs, model, total_features):
    """Cross-validate one regressor per (store, item) pair.

    For each pair (store number 35 is skipped) the matching rows are split by
    ``ut.get_random_test_and_train``, a model is created with
    ``ut.get_regression_model`` and scored with 10-fold ``cross_val_score``.
    The negated mean score of each pair is printed, followed by the mean over
    all pairs.

    :param _df: data frame with ``store_nbr`` and ``item_nbr`` columns
    :param store_item_nbrs: iterable of ``(store_nbr, item_nbr)`` pairs
    :param model: model selector forwarded to ``ut.get_regression_model``
    :param total_features: unused
    :return: ``(regrs, tests)`` -- the model objects and the held-out test
        frames, one per processed pair. No explicit ``fit`` call is made on
        the returned models here (the direct fit is commented out).
    """
    frame = _df.copy()
    models = []
    held_out = []
    pair_scores = []
    processed = 0
    for sno, ino in store_item_nbrs:
        if sno == 35:
            continue
        pair_rows = frame[(frame.store_nbr == sno) & (frame.item_nbr == ino)]
        df_test, df_train = ut.get_random_test_and_train(pair_rows)
        X_train, y_train = ut.get_train_data(df_train)
        X_train = X_train.drop(['store_nbr', 'item_nbr'], axis=1)
        # Align the targets with the rows that remain in X_train.
        y_train = y_train[X_train.index.values]
        X_train = X_train[ut.get_features()]

        regr = ut.get_regression_model(model, len(X_train))

        # regr.fit(ut.get_processed_X(X_train.values), y_train.values)
        scores = cross_val_score(regr, X_train.values, y_train.values,
                                 scoring="mean_squared_error", cv=10)
        print('done, ', processed)
        pair_score = -np.mean(scores)
        print(pair_score)
        pair_scores.append(pair_score)
        models.append(regr)
        held_out.append(df_test)
        processed += 1

    print('total_score: {}'.format(np.mean(pair_scores)))
    return models, held_out
Example #5
0
def _evaluate(indata):
    if indata.has_key('clustering_k'):
        first_arg = indata['clustering_k']
    elif indata.has_key('clustering_merges'):
        first_arg = indata['clustering_merges']
    else:
        return False

    feats = util.get_features(indata, 'distance_')
    dfun = eval(indata['distance_name'])
    distance = dfun(feats['train'], feats['train'])

    cfun = eval(indata['clustering_name'])
    clustering = cfun(first_arg, distance)
    clustering.train()

    if indata.has_key('clustering_radi'):
        radi = max(abs(clustering.get_radiuses() - indata['clustering_radi']))
        centers=max(abs(clustering.get_cluster_centers().flatten() - \
         indata['clustering_centers'].flat))
        return util.check_accuracy(indata['clustering_accuracy'],
                                   radi=radi,
                                   centers=centers)
    elif indata.has_key('clustering_merge_distance'):
        merge_distance=max(abs(clustering.get_merge_distances()- \
         indata['clustering_merge_distance']))
        pairs=max(abs(clustering.get_cluster_pairs()- \
         indata['clustering_pairs']).flat)
        return util.check_accuracy(indata['clustering_accuracy'],
                                   merge_distance=merge_distance,
                                   pairs=pairs)
    else:
        return util.check_accuracy(indata['clustering_accuracy'])
def execute(data, num_folds=5):
    """
    Compute the Root Mean Squared Error using num_folds for cross validation
    :param data: Raw Data frame parsed from CSV
    :param num_folds: The number of folds to use (must be greater than one)
    :return: Root Mean Squared Error
    :raises AssertionError: if data is None or num_folds is not greater than one
    """
    assert data is not None, "data must be a valid DataFrame"
    assert num_folds > 1, "num_folds must be greater than one."

    # 2. Randomizes the data
    randomized_data = util.randomize_data(data)

    # 3. Creates S folds (S = num_folds)
    folds = divide_data(randomized_data, num_folds)

    squared_errors = []
    # 4. For i = 1 to S (range() works on both Python 2 and 3, xrange does not)
    for i in range(num_folds):
        #   (a) Select fold i as the testing data and the remaining (S - 1) folds as training data
        test_data = folds[i]
        training_data = select_training_data(folds, i)

        #   (b) Standardizes the features based on the training data
        standardized_train_data, mean, std = util.standardize_data(
            util.get_features(training_data))

        # Add offset column at the front
        standardized_train_data.insert(0, "Bias", 1)

        #   (c) Train a closed-form linear regression model
        training_outputs = util.get_output(training_data)
        weights = cflr.find_weights(standardized_train_data, training_outputs)

        #   (d) Compute the squared error for each sample in the current testing fold;
        #       the training mean/std are passed along with the raw test features
        expected = util.get_output(test_data)
        actual = cflr.apply_solution(util.get_features(test_data), mean, std,
                                     weights)

        squared_errors.append((expected - actual)**2)

    # 5. Compute the RMSE using all the errors.
    return compute_rmse(len(data), squared_errors)
Example #7
0
def _evaluate(indata):
    prefix = 'classifier_'
    ctype = indata[prefix + 'type']
    if indata[prefix + 'name'] == 'KNN':
        feats = util.get_features(indata, 'distance_')
    elif ctype == 'kernel':
        feats = util.get_features(indata, 'kernel_')
    else:
        feats = util.get_features(indata, prefix)

    machine = _get_machine(indata, prefix, feats)

    try:
        fun = eval(indata[prefix + 'name'])
    except NameError, e:
        print "%s is disabled/unavailable!" % indata[prefix + 'name']
        return False
Example #8
0
def _evaluate (indata):
	prefix='classifier_'
	ctype=indata[prefix+'type']
	if indata[prefix+'name']=='KNN':
		feats=util.get_features(indata, 'distance_')
	elif ctype=='kernel':
		feats=util.get_features(indata, 'kernel_')
	else:
		feats=util.get_features(indata, prefix)

	machine=_get_machine(indata, prefix, feats)

	try:
		fun=eval(indata[prefix+'name'])
	except NameError, e:
		print "%s is disabled/unavailable!"%indata[prefix+'name']
		return False
Example #9
0
def _evaluate(indata):
    prefix = "classifier_"
    ctype = indata[prefix + "type"]
    if indata[prefix + "name"] == "KNN":
        feats = util.get_features(indata, "distance_")
    elif ctype == "kernel":
        feats = util.get_features(indata, "kernel_")
    else:
        feats = util.get_features(indata, prefix)

    machine = _get_machine(indata, prefix, feats)

    try:
        fun = eval(indata[prefix + "name"])
    except NameError, e:
        print "%s is disabled/unavailable!" % indata[prefix + "name"]
        return False
Example #10
0
def _evaluate(indata):
    """Train a kernel regression described by ``indata`` and compare it with
    the reference values stored in ``indata``.

    :param indata: mapping holding the 'kernel_*' and 'regression_*' test
        description and reference results
    :return: False when the regression name cannot be resolved or
        'regression_type' is neither 'svm' nor 'kernelmachine'; otherwise the
        result of ``util.check_accuracy``
    """
    prefix = 'kernel_'
    feats = util.get_features(indata, prefix)
    kargs = util.get_args(indata, prefix)
    # NOTE(review): eval() on names taken from indata; only safe for trusted
    # input.
    fun = eval(indata[prefix + 'name'] + 'Kernel')
    kernel = fun(feats['train'], feats['train'], *kargs)

    prefix = 'regression_'
    kernel.parallel.set_num_threads(indata[prefix + 'num_threads'])

    try:
        name = indata[prefix + 'name']
        # The all-caps spelling is mapped to the actual class name.
        if name == 'KERNELRIDGEREGRESSION':
            name = 'KernelRidgeRegression'

        rfun = eval(name)
    except NameError:
        print("%s is disabled/unavailable!" % indata[prefix + 'name'])
        return False

    labels = RegressionLabels(double(indata[prefix + 'labels']))
    if indata[prefix + 'type'] == 'svm':
        regression = rfun(indata[prefix + 'C'], indata[prefix + 'epsilon'],
                          kernel, labels)
    elif indata[prefix + 'type'] == 'kernelmachine':
        regression = rfun(indata[prefix + 'tau'], kernel, labels)
    else:
        return False

    regression.parallel.set_num_threads(indata[prefix + 'num_threads'])
    if prefix + 'tube_epsilon' in indata:
        regression.set_tube_epsilon(indata[prefix + 'tube_epsilon'])

    regression.train()

    # Each deviation stays 0 when indata carries no reference for it.
    alphas = 0
    bias = 0
    sv = 0
    if prefix + 'bias' in indata:
        bias = abs(regression.get_bias() - indata[prefix + 'bias'])
    if prefix + 'alphas' in indata:
        alphas = sum(regression.get_alphas().tolist())
        alphas = abs(alphas - indata[prefix + 'alphas'])
    if prefix + 'support_vectors' in indata:
        # Was ``inregression`` (undefined name), which raised NameError.
        sv = sum(regression.get_support_vectors().tolist())
        sv = abs(sv - indata[prefix + 'support_vectors'])

    # Re-initialise the kernel on train/test before applying the regression.
    kernel.init(feats['train'], feats['test'])
    classified = max(
        abs(regression.apply().get_labels() - indata[prefix + 'classified']))

    return util.check_accuracy(indata[prefix + 'accuracy'],
                               alphas=alphas,
                               bias=bias,
                               support_vectors=sv,
                               classified=classified)
Example #11
0
def _evaluate(indata):
    """Train a kernel regression described by ``indata`` and compare it with
    the reference values stored in ``indata``.

    :param indata: mapping holding the 'kernel_*' and 'regression_*' test
        description and reference results
    :return: False when the regression name cannot be resolved or
        'regression_type' is neither 'svm' nor 'kernelmachine'; otherwise the
        result of ``util.check_accuracy``
    """
    prefix = 'kernel_'
    feats = util.get_features(indata, prefix)
    kargs = util.get_args(indata, prefix)
    # NOTE(review): eval() on names taken from indata; only safe for trusted
    # input.
    fun = eval(indata[prefix+'name']+'Kernel')
    kernel = fun(feats['train'], feats['train'], *kargs)

    prefix = 'regression_'
    kernel.parallel.set_num_threads(indata[prefix+'num_threads'])

    try:
        name = indata[prefix+'name']
        # The all-caps spelling is mapped to the actual class name.
        if name == 'KERNELRIDGEREGRESSION':
            name = 'KernelRidgeRegression'

        rfun = eval(name)
    except NameError:
        print("%s is disabled/unavailable!" % indata[prefix+'name'])
        return False

    labels = RegressionLabels(double(indata[prefix+'labels']))
    if indata[prefix+'type'] == 'svm':
        regression = rfun(
            indata[prefix+'C'], indata[prefix+'epsilon'], kernel, labels)
    elif indata[prefix+'type'] == 'kernelmachine':
        regression = rfun(indata[prefix+'tau'], kernel, labels)
    else:
        return False

    regression.parallel.set_num_threads(indata[prefix+'num_threads'])
    if prefix+'tube_epsilon' in indata:
        regression.set_tube_epsilon(indata[prefix+'tube_epsilon'])

    regression.train()

    # Each deviation stays 0 when indata carries no reference for it.
    alphas = 0
    bias = 0
    sv = 0
    if prefix+'bias' in indata:
        bias = abs(regression.get_bias()-indata[prefix+'bias'])
    if prefix+'alphas' in indata:
        alphas = sum(regression.get_alphas().tolist())
        alphas = abs(alphas-indata[prefix+'alphas'])
    if prefix+'support_vectors' in indata:
        # Was ``inregression`` (undefined name), which raised NameError.
        sv = sum(regression.get_support_vectors().tolist())
        sv = abs(sv-indata[prefix+'support_vectors'])

    # Re-initialise the kernel on train/test before applying the regression.
    kernel.init(feats['train'], feats['test'])
    classified = max(abs(
        regression.apply().get_labels()-indata[prefix+'classified']))

    return util.check_accuracy(indata[prefix+'accuracy'], alphas=alphas,
                               bias=bias, support_vectors=sv,
                               classified=classified)
Example #12
0
def execute(data):
    """
    Run closed-form linear regression on a random train/test split.

    :param data: Raw Data frame parsed from CSV
    :return: Tuple (weights, rmse): the weights returned by find_weights and
        the value returned by util.compute_rmse for the test split
    """
    # Shuffle the rows, then split them using a 2/3 training fraction.
    train_fraction = 2.0 / 3.0
    shuffled = util.randomize_data(data)
    train_set, test_set = util.split_data(shuffled, train_fraction)

    # The training targets are taken from the last column.
    target_column = train_set.columns[-1]
    train_targets = train_set[target_column]

    # Standardize the training features; the mean/std are reused for the
    # test features below.
    train_features = util.get_features(train_set)
    train_inputs, train_mean, train_std = util.standardize_data(train_features)

    # Offset ("Bias") column goes first.
    train_inputs.insert(0, "Bias", 1)

    # Closed-form solution on the training data.
    weights = find_weights(train_inputs, train_targets)

    # Apply the solution to the test samples and score it.
    test_features = util.get_features(test_set)
    test_targets = util.get_output(test_set)
    predictions = apply_solution(test_features, train_mean, train_std, weights)

    return weights, util.compute_rmse(test_targets, predictions)
Example #13
0
def _evaluate(indata):
    """Build the kernel for a regression test and resolve the regression name.

    :param indata: mapping holding the 'kernel_*' and 'regression_*' test
        description
    :return: False when the regression name cannot be resolved; no explicit
        value otherwise in the visible part of this function
    """
    prefix = 'kernel_'
    feats = util.get_features(indata, prefix)
    kargs = util.get_args(indata, prefix)
    # NOTE(review): eval() on names taken from indata; only safe for trusted
    # input.
    fun = eval(indata[prefix+'name']+'Kernel')
    kernel = fun(feats['train'], feats['train'], *kargs)

    prefix = 'regression_'
    kernel.parallel.set_num_threads(indata[prefix+'num_threads'])

    try:
        rfun = eval(indata[prefix+'name'])
    except NameError:
        # Syntax valid on both Python 2.6+ and Python 3.
        print("%s is disabled/unavailable!" % indata[prefix+'name'])
        return False
Example #14
0
def _evaluate(indata):
    """Build the kernel for a regression test and resolve the regression name.

    :param indata: mapping holding the 'kernel_*' and 'regression_*' test
        description
    :return: False when the regression name cannot be resolved; no explicit
        value otherwise in the visible part of this function
    """
    prefix = 'kernel_'
    feats = util.get_features(indata, prefix)
    kargs = util.get_args(indata, prefix)
    # NOTE(review): eval() on names taken from indata; only safe for trusted
    # input.
    fun = eval(indata[prefix + 'name'] + 'Kernel')
    kernel = fun(feats['train'], feats['train'], *kargs)

    prefix = 'regression_'
    kernel.parallel.set_num_threads(indata[prefix + 'num_threads'])

    try:
        rfun = eval(indata[prefix + 'name'])
    except NameError:
        # Syntax valid on both Python 2.6+ and Python 3.
        print("%s is disabled/unavailable!" % indata[prefix + 'name'])
        return False
Example #15
0
def _evaluate(indata):
    """Build the kernel for a regression test and resolve the regression name.

    :param indata: mapping holding the "kernel_*" and "regression_*" test
        description
    :return: False when the regression name cannot be resolved; no explicit
        value otherwise in the visible part of this function
    """
    prefix = "kernel_"
    feats = util.get_features(indata, prefix)
    kargs = util.get_args(indata, prefix)
    # NOTE(review): eval() on names taken from indata; only safe for trusted
    # input.
    fun = eval(indata[prefix + "name"] + "Kernel")
    kernel = fun(feats["train"], feats["train"], *kargs)

    prefix = "regression_"
    kernel.parallel.set_num_threads(indata[prefix + "num_threads"])

    try:
        rfun = eval(indata[prefix + "name"])
    except NameError:
        # Syntax valid on both Python 2.6+ and Python 3.
        print("%s is disabled/unavailable!" % indata[prefix + "name"])
        return False
Example #16
0
def _evaluate(indata):
    """Train the distribution named in ``indata`` and compare its likelihood,
    summed log-derivatives and (for HMM) best-path sums with the reference
    values stored in ``indata``.

    :param indata: mapping holding the 'distribution_*' test description and
        reference results
    :return: the result of ``util.check_accuracy``
    """
    import math  # local import: only needed for the NaN check below

    prefix = 'distribution_'
    feats = util.get_features(indata, prefix)

    if indata[prefix + 'name'] == 'HMM':
        distribution = HMM(feats['train'], indata[prefix + 'N'],
                           indata[prefix + 'M'], indata[prefix + 'pseudo'])
        distribution.train()
        distribution.baum_welch_viterbi_train(BW_NORMAL)
    else:
        # NOTE(review): eval() on a name taken from indata; only safe for
        # trusted input.
        dfun = eval(indata[prefix + 'name'])
        distribution = dfun(feats['train'])
        distribution.train()

    likelihood = distribution.get_log_likelihood_sample()
    num_examples = feats['train'].get_num_vectors()
    num_param = distribution.get_num_model_parameters()
    neg_inf = float('-inf')
    derivatives = 0
    for i in range(num_param):
        for j in range(num_examples):
            val = distribution.get_log_derivative(i, j)
            # only consider sparse matrix!
            # NaN never compares equal to anything, so ``val != nan`` was
            # always true; isnan() actually filters NaN entries out.
            if val != neg_inf and not math.isnan(val):
                derivatives += val

    derivatives = abs(derivatives - indata[prefix + 'derivatives'])
    likelihood = abs(likelihood - indata[prefix + 'likelihood'])

    if indata[prefix + 'name'] == 'HMM':
        best_path = 0
        best_path_state = 0
        for i in range(indata[prefix + 'num_examples']):
            best_path += distribution.best_path(i)
            for j in range(indata[prefix + 'N']):
                best_path_state += distribution.get_best_path_state(i, j)

        best_path = abs(best_path - indata[prefix + 'best_path'])
        best_path_state = abs(best_path_state -
                              indata[prefix + 'best_path_state'])

        return util.check_accuracy(indata[prefix + 'accuracy'],
                                   derivatives=derivatives,
                                   likelihood=likelihood,
                                   best_path=best_path,
                                   best_path_state=best_path_state)
    else:
        return util.check_accuracy(indata[prefix + 'accuracy'],
                                   derivatives=derivatives,
                                   likelihood=likelihood)
Example #17
0
def _evaluate(indata):
    """Compare the distance matrices of the distance named in ``indata``
    (train/train and train/test) with the reference matrices in ``indata``.

    Returns the result of ``util.check_accuracy``.
    """
    prefix = 'distance_'
    features = util.get_features(indata, prefix)

    # NOTE(review): eval() on a name taken from indata; only safe for trusted
    # input.
    distance_factory = eval(indata[prefix+'name'])
    extra_args = util.get_args(indata, prefix)
    distance = distance_factory(features['train'], features['train'],
                                *extra_args)

    # Largest absolute element-wise deviation on train/train.
    train_diff = indata[prefix+'matrix_train']-distance.get_distance_matrix()
    dm_train = max(abs(train_diff).flat)

    # Same measure after re-initialising on train/test.
    distance.init(features['train'], features['test'])
    test_diff = indata[prefix+'matrix_test']-distance.get_distance_matrix()
    dm_test = max(abs(test_diff).flat)

    return util.check_accuracy(
        indata[prefix+'accuracy'], dm_train=dm_train, dm_test=dm_test)
Example #18
0
def _evaluate(indata, prefix):
    """Compare the kernel matrices of the kernel named in ``indata``
    (train/train and train/test) with the reference matrices in ``indata``.

    :param indata: mapping holding the test description and reference results
    :param prefix: key prefix under which the kernel entries are stored
    :return: the result of ``util.check_accuracy``
    """
    feats = util.get_features(indata, prefix)
    # NOTE(review): eval() on names taken from indata; only safe for trusted
    # input.
    kfun = eval(indata[prefix+'name']+'Kernel')
    kargs = util.get_args(indata, prefix)
    kernel = kfun(*kargs)
    # ``in`` replaces dict.has_key(), which no longer exists in Python 3.
    if prefix+'normalizer' in indata:
        kernel.set_normalizer(eval(indata[prefix+'normalizer']+'()'))

    kernel.init(feats['train'], feats['train'])
    km_train = max(abs(
        indata[prefix+'matrix_train']-kernel.get_kernel_matrix()).flat)
    kernel.init(feats['train'], feats['test'])
    km_test = max(abs(
        indata[prefix+'matrix_test']-kernel.get_kernel_matrix()).flat)

    return util.check_accuracy(
        indata[prefix+'accuracy'], km_train=km_train, km_test=km_test)
Example #19
0
def _evaluate(indata):
    """Compare kernel matrices computed on preprocessed features with the
    reference matrices stored in ``indata``.

    Returns the result of ``util.check_accuracy``.
    """
    kernel_prefix = "kernel_"
    preproc_prefix = "preproc_"

    features = util.get_features(indata, kernel_prefix)
    # NOTE(review): eval() on a name taken from indata; only safe for trusted
    # input.
    kernel_factory = eval(indata[kernel_prefix + "name"] + "Kernel")
    kernel_args = util.get_args(indata, kernel_prefix)

    # Attach the preprocessor before the kernel is built.
    preproc_args = util.get_args(indata, preproc_prefix)
    features = util.add_preproc(indata[preproc_prefix + "name"], features,
                                *preproc_args)

    kernel = kernel_factory(features["train"], features["train"],
                            *kernel_args)
    train_diff = (indata[kernel_prefix + "matrix_train"]
                  - kernel.get_kernel_matrix())
    km_train = max(abs(train_diff).flat)

    kernel.init(features["train"], features["test"])
    test_diff = (indata[kernel_prefix + "matrix_test"]
                 - kernel.get_kernel_matrix())
    km_test = max(abs(test_diff).flat)

    return util.check_accuracy(indata[kernel_prefix + "accuracy"],
                               km_train=km_train, km_test=km_test)
Example #20
0
def _evaluate(indata):
    """Train the distribution named in ``indata`` and compare its likelihood,
    summed log-derivatives and (for HMM) best-path sums with the reference
    values stored in ``indata``.

    :param indata: mapping holding the 'distribution_*' test description and
        reference results
    :return: the result of ``util.check_accuracy``
    """
    import math  # local import: only needed for the NaN check below

    prefix = 'distribution_'
    feats = util.get_features(indata, prefix)

    if indata[prefix+'name'] == 'HMM':
        distribution = HMM(feats['train'], indata[prefix+'N'],
                           indata[prefix+'M'], indata[prefix+'pseudo'])
        distribution.train()
        distribution.baum_welch_viterbi_train(BW_NORMAL)
    else:
        # NOTE(review): eval() on a name taken from indata; only safe for
        # trusted input.
        dfun = eval(indata[prefix+'name'])
        distribution = dfun(feats['train'])
        distribution.train()

    likelihood = distribution.get_log_likelihood_sample()
    num_examples = feats['train'].get_num_vectors()
    num_param = distribution.get_num_model_parameters()
    neg_inf = float('-inf')
    derivatives = 0
    for i in range(num_param):
        for j in range(num_examples):
            val = distribution.get_log_derivative(i, j)
            # only consider sparse matrix!
            # NaN never compares equal to anything, so ``val!=nan`` was
            # always true; isnan() actually filters NaN entries out.
            if val != neg_inf and not math.isnan(val):
                derivatives += val

    derivatives = abs(derivatives-indata[prefix+'derivatives'])
    likelihood = abs(likelihood-indata[prefix+'likelihood'])

    if indata[prefix+'name'] == 'HMM':
        best_path = 0
        best_path_state = 0
        for i in range(indata[prefix+'num_examples']):
            best_path += distribution.best_path(i)
            for j in range(indata[prefix+'N']):
                best_path_state += distribution.get_best_path_state(i, j)

        best_path = abs(best_path-indata[prefix+'best_path'])
        best_path_state = abs(
            best_path_state-indata[prefix+'best_path_state'])

        return util.check_accuracy(
            indata[prefix+'accuracy'],
            derivatives=derivatives, likelihood=likelihood,
            best_path=best_path, best_path_state=best_path_state)
    else:
        return util.check_accuracy(
            indata[prefix+'accuracy'],
            derivatives=derivatives, likelihood=likelihood)
Example #21
0
def main():
    """Build the feature matrix from '../data/subset.csv' and tune an RBF model.

    The text column is vectorized with TF-IDF, the 2000 columns with the
    highest ``util.info_gain`` value are kept, the extra features returned by
    ``util.get_features`` are appended as columns, and the result is passed to
    ``tune_rbf``.
    """
    t1 = time.time()
    X, y, raw = util.get_data('../data/subset.csv')
    new_features = util.get_features(raw)  # get homegrown features
    vect = TfidfVectorizer(min_df=2)
    X_dtm = vect.fit_transform(X)
    info_gains = np.apply_along_axis(util.info_gain, 0, X_dtm.toarray(), y,
                                     0.00001)
    num_features = 2000
    # Column indices of the num_features largest info gains, best first.
    max_cols = info_gains.argsort()[-num_features:][::-1]
    # print_vocab(vect, max_cols)
    X = X_dtm[:, max_cols].toarray()  # sparse matrix -> numpy array
    # Add our features as columns to X with a single concatenation, instead
    # of re-copying the whole matrix once per feature via np.append.
    extra_columns = [feature.reshape(-1, 1) for feature in new_features]
    X = np.concatenate([X] + extra_columns, axis=1)
    print("data matrix shape", X.shape)
    print("preprocessing took", str(time.time() - t1), "seconds")
    tune_rbf(X, y)
Example #22
0
def _evaluate(indata, prefix):
    """Compare the kernel matrices of the kernel named in ``indata``
    (train/train and train/test) with the reference matrices in ``indata``.

    :param indata: mapping holding the test description and reference results
    :param prefix: key prefix under which the kernel entries are stored
    :return: the result of ``util.check_accuracy``
    """
    feats = util.get_features(indata, prefix)
    # NOTE(review): eval() on names taken from indata; only safe for trusted
    # input.
    kfun = eval(indata[prefix + 'name'] + 'Kernel')
    kargs = util.get_args(indata, prefix)
    kernel = kfun(*kargs)
    # ``in`` replaces dict.has_key(), which no longer exists in Python 3.
    if prefix + 'normalizer' in indata:
        kernel.set_normalizer(eval(indata[prefix + 'normalizer'] + '()'))

    kernel.init(feats['train'], feats['train'])
    km_train = max(
        abs(indata[prefix + 'matrix_train'] - kernel.get_kernel_matrix()).flat)
    kernel.init(feats['train'], feats['test'])
    km_test = max(
        abs(indata[prefix + 'matrix_test'] - kernel.get_kernel_matrix()).flat)

    return util.check_accuracy(indata[prefix + 'accuracy'],
                               km_train=km_train,
                               km_test=km_test)
Example #23
0
def _evaluate(indata):
    """Compare the distance matrices of the distance named in ``indata``
    (train/train and train/test) with the reference matrices in ``indata``.

    Returns the result of ``util.check_accuracy``.
    """
    prefix = 'distance_'
    features = util.get_features(indata, prefix)

    # NOTE(review): eval() on a name taken from indata; only safe for trusted
    # input.
    distance_factory = eval(indata[prefix + 'name'])
    extra_args = util.get_args(indata, prefix)
    distance = distance_factory(features['train'], features['train'],
                                *extra_args)

    # Largest absolute element-wise deviation on train/train.
    train_diff = indata[prefix + 'matrix_train'] - distance.get_distance_matrix()
    dm_train = max(abs(train_diff).flat)

    # Same measure after re-initialising on train/test.
    distance.init(features['train'], features['test'])
    test_diff = indata[prefix + 'matrix_test'] - distance.get_distance_matrix()
    dm_test = max(abs(test_diff).flat)

    return util.check_accuracy(indata[prefix + 'accuracy'],
                               dm_train=dm_train,
                               dm_test=dm_test)
Example #24
0
def _evaluate_combined(indata, prefix):
    """Build a CombinedKernel from the subkernels described in ``indata`` and
    compare its kernel matrices with the reference matrices in ``indata``.

    :param indata: mapping holding the test description and reference results
    :param prefix: key prefix passed to ``_get_subkernels`` and used for the
        accuracy entry
    :return: the result of ``util.check_accuracy``
    """
    kernel = CombinedKernel()
    feats = {'train': CombinedFeatures(), 'test': CombinedFeatures()}

    subkernels = _get_subkernels(indata, prefix)
    # .values() works on Python 2 and 3; .itervalues() is Python 2-only.
    for subk in subkernels.values():
        feats_subk = util.get_features(subk, '')
        feats['train'].append_feature_obj(feats_subk['train'])
        feats['test'].append_feature_obj(feats_subk['test'])
        kernel.append_kernel(subk['kernel'])

    kernel.init(feats['train'], feats['train'])
    km_train = max(abs(
        indata['kernel_matrix_train']-kernel.get_kernel_matrix()).flat)
    kernel.init(feats['train'], feats['test'])
    km_test = max(abs(
        indata['kernel_matrix_test']-kernel.get_kernel_matrix()).flat)

    return util.check_accuracy(indata[prefix+'accuracy'],
                               km_train=km_train, km_test=km_test)
Example #25
0
def extractor(cap, dim):
    """Yield features computed from consecutive optical-flow fields of a video.

    Frames are read from ``cap`` until ``read()`` returns a ``None`` frame.
    Each frame is passed through ``scale(frame, dim)`` and converted to
    grayscale; a Farneback flow is computed between neighbouring frames, and
    ``get_features(prev_flow, flow)`` is yielded once the loop index exceeds 1.

    :param cap: object whose ``read()`` returns a ``(flag, frame)`` pair
        (presumably an OpenCV VideoCapture -- confirm against the caller)
    :param dim: target size forwarded to ``scale``
    :return: generator of ``get_features`` results; empty when no first frame
        can be read
    """
    _, frame = cap.read()
    if frame is None:
        # No first frame: there is nothing to compute flow against.
        return
    prev_gray = cv.cvtColor(scale(frame, dim), cv.COLOR_BGR2GRAY)
    prev_flow = None
    for i in count(0):
        _, frame = cap.read()
        if frame is None:
            break
        gray = cv.cvtColor(scale(frame, dim), cv.COLOR_BGR2GRAY)
        flow = cv.calcOpticalFlowFarneback(prev_gray, gray, None, 0.5, 3, 15,
                                           3, 5, 1.2, 0)

        # NOTE(review): prev_flow is already available at i == 1, so ``i > 1``
        # skips the first flow pair; kept as in the original -- confirm intent.
        if i > 1:
            yield get_features(prev_flow, flow)

        prev_gray = gray
        prev_flow = flow
Example #26
0
def _evaluate_auc(indata, prefix):
    """Wrap subkernel '0' in an AUCKernel and compare its kernel matrices
    with the reference matrices stored in ``indata``.

    Returns the result of ``util.check_accuracy``.
    """
    sub = _get_subkernels(indata, prefix)['0']
    sub_feats = util.get_features(sub, '')
    inner_kernel = sub['kernel']
    # The wrapped kernel is initialised on its own train/test features first.
    inner_kernel.init(sub_feats['train'], sub_feats['test'])

    train_words = WordFeatures(indata[prefix+'data_train'].astype(ushort))
    test_words = WordFeatures(indata[prefix+'data_test'].astype(ushort))
    kernel = AUCKernel(10, sub['kernel'])

    kernel.init(train_words, train_words)
    train_diff = indata[prefix+'matrix_train']-kernel.get_kernel_matrix()
    km_train = max(abs(train_diff).flat)

    kernel.init(train_words, test_words)
    test_diff = indata[prefix+'matrix_test']-kernel.get_kernel_matrix()
    km_test = max(abs(test_diff).flat)

    return util.check_accuracy(indata[prefix+'accuracy'],
                               km_train=km_train, km_test=km_test)
Example #27
0
def _evaluate(indata):
    """Compare kernel matrices computed on preprocessed features with the
    reference matrices stored in ``indata``.

    Returns the result of ``util.check_accuracy``.
    """
    kernel_prefix = 'kernel_'
    preproc_prefix = 'preprocessor_'

    features = util.get_features(indata, kernel_prefix)
    # NOTE(review): eval() on a name taken from indata; only safe for trusted
    # input.
    kernel_factory = eval(indata[kernel_prefix+'name']+'Kernel')
    kernel_args = util.get_args(indata, kernel_prefix)

    # Attach the preprocessor before the kernel is built.
    preproc_args = util.get_args(indata, preproc_prefix)
    features = util.add_preprocessor(
        indata[preproc_prefix+'name'], features, *preproc_args)

    kernel = kernel_factory(features['train'], features['train'],
                            *kernel_args)
    train_diff = (indata[kernel_prefix+'matrix_train']
                  - kernel.get_kernel_matrix())
    km_train = max(abs(train_diff).flat)

    kernel.init(features['train'], features['test'])
    test_diff = (indata[kernel_prefix+'matrix_test']
                 - kernel.get_kernel_matrix())
    km_test = max(abs(test_diff).flat)

    return util.check_accuracy(
        indata[kernel_prefix+'accuracy'], km_train=km_train, km_test=km_test)
Example #28
0
def _evaluate_top_fisher(indata, prefix):
    """Build TOP or FK features from two trained HMMs, run a PolyKernel on
    them and compare its kernel matrices with the references in ``indata``.

    Returns the result of ``util.check_accuracy``.
    """
    wordfeats = util.get_features(indata, prefix)

    def _trained_hmm():
        # One HMM on the training words: train(), then a Baum-Welch/Viterbi
        # pass with BW_NORMAL.
        model = HMM(wordfeats['train'], indata[prefix + 'N'],
                    indata[prefix + 'M'], indata[prefix + 'pseudo'])
        model.train()
        model.baum_welch_viterbi_train(BW_NORMAL)
        return model

    pos_train = _trained_hmm()
    neg_train = _trained_hmm()

    # The test-side models are copies that observe the test words instead.
    pos_test = HMM(pos_train)
    pos_test.set_observations(wordfeats['test'])
    neg_test = HMM(neg_train)
    neg_test.set_observations(wordfeats['test'])

    if indata[prefix + 'name'] == 'TOP':
        train_feats = TOPFeatures(10, pos_train, neg_train, False, False)
        test_feats = TOPFeatures(10, pos_test, neg_test, False, False)
    else:
        train_feats = FKFeatures(10, pos_train, neg_train)
        train_feats.set_opt_a(-1)  # estimate prior
        test_feats = FKFeatures(10, pos_test, neg_test)
        test_feats.set_a(train_feats.get_a())  # use prior from training data

    kernel_prefix = 'kernel_'
    args = util.get_args(indata, kernel_prefix)
    kernel = PolyKernel(train_feats, train_feats, *args)
    train_diff = (indata[kernel_prefix + 'matrix_train']
                  - kernel.get_kernel_matrix())
    km_train = max(abs(train_diff).flat)

    kernel.init(train_feats, test_feats)
    test_diff = (indata[kernel_prefix + 'matrix_test']
                 - kernel.get_kernel_matrix())
    km_test = max(abs(test_diff).flat)

    return util.check_accuracy(indata[kernel_prefix + 'accuracy'],
                               km_train=km_train,
                               km_test=km_test)
Example #29
0
def _evaluate_combined(indata, prefix):
    """Build a CombinedKernel from the subkernels described in ``indata`` and
    compare its kernel matrices with the reference matrices in ``indata``.

    :param indata: mapping holding the test description and reference results
    :param prefix: key prefix passed to ``_get_subkernels`` and used for the
        accuracy entry
    :return: the result of ``util.check_accuracy``
    """
    kernel = CombinedKernel()
    feats = {'train': CombinedFeatures(), 'test': CombinedFeatures()}

    subkernels = _get_subkernels(indata, prefix)
    # .values() works on Python 2 and 3; .itervalues() is Python 2-only.
    for subk in subkernels.values():
        feats_subk = util.get_features(subk, '')
        feats['train'].append_feature_obj(feats_subk['train'])
        feats['test'].append_feature_obj(feats_subk['test'])
        kernel.append_kernel(subk['kernel'])

    kernel.init(feats['train'], feats['train'])
    km_train = max(
        abs(indata['kernel_matrix_train'] - kernel.get_kernel_matrix()).flat)
    kernel.init(feats['train'], feats['test'])
    km_test = max(
        abs(indata['kernel_matrix_test'] - kernel.get_kernel_matrix()).flat)

    return util.check_accuracy(indata[prefix + 'accuracy'],
                               km_train=km_train,
                               km_test=km_test)
Example #30
0
def _evaluate_auc(indata, prefix):
    """Wrap subkernel '0' in an AUCKernel and compare its kernel matrices
    with the reference matrices stored in ``indata``.

    Returns the result of ``util.check_accuracy``.
    """
    sub = _get_subkernels(indata, prefix)['0']
    sub_feats = util.get_features(sub, '')
    inner_kernel = sub['kernel']
    # The wrapped kernel is initialised on its own train/test features first.
    inner_kernel.init(sub_feats['train'], sub_feats['test'])

    train_words = WordFeatures(indata[prefix + 'data_train'].astype(ushort))
    test_words = WordFeatures(indata[prefix + 'data_test'].astype(ushort))
    kernel = AUCKernel(10, sub['kernel'])

    kernel.init(train_words, train_words)
    train_diff = indata[prefix + 'matrix_train'] - kernel.get_kernel_matrix()
    km_train = max(abs(train_diff).flat)

    kernel.init(train_words, test_words)
    test_diff = indata[prefix + 'matrix_test'] - kernel.get_kernel_matrix()
    km_test = max(abs(test_diff).flat)

    return util.check_accuracy(indata[prefix + 'accuracy'],
                               km_train=km_train,
                               km_test=km_test)
Example #31
0
def _evaluate(indata):
    """Compare kernel matrices computed on preprocessed features with the
    reference matrices stored in ``indata``.

    Returns the result of ``util.check_accuracy``.
    """
    kernel_prefix = 'kernel_'
    preproc_prefix = 'preproc_'

    features = util.get_features(indata, kernel_prefix)
    # NOTE(review): eval() on a name taken from indata; only safe for trusted
    # input.
    kernel_factory = eval(indata[kernel_prefix + 'name'] + 'Kernel')
    kernel_args = util.get_args(indata, kernel_prefix)

    # Attach the preprocessor before the kernel is built.
    preproc_args = util.get_args(indata, preproc_prefix)
    features = util.add_preproc(indata[preproc_prefix + 'name'], features,
                                *preproc_args)

    kernel = kernel_factory(features['train'], features['train'],
                            *kernel_args)
    train_diff = (indata[kernel_prefix + 'matrix_train']
                  - kernel.get_kernel_matrix())
    km_train = max(abs(train_diff).flat)

    kernel.init(features['train'], features['test'])
    test_diff = (indata[kernel_prefix + 'matrix_test']
                 - kernel.get_kernel_matrix())
    km_test = max(abs(test_diff).flat)

    return util.check_accuracy(indata[kernel_prefix + 'accuracy'],
                               km_train=km_train,
                               km_test=km_test)
Example #32
0
def _evaluate_pie(indata, prefix):
    """Check a PluginEstimate-based kernel and the estimate's output values.

    A ``PluginEstimate`` is trained on the training features with binary
    labels built from ``indata['classifier_labels']``. The kernel class named
    ``indata[prefix + 'name'] + 'Kernel'`` is constructed with that estimate,
    and three deviations from the reference data are computed: train kernel
    matrix, test kernel matrix, and the values obtained from applying the
    estimate to the test features.

    :param indata: mapping holding the input data and reference results
    :param prefix: key prefix used to look entries up in ``indata``
    :return: whatever ``util.check_accuracy`` returns
    """
    estimate = PluginEstimate()
    features = util.get_features(indata, prefix)
    train_feats = features['train']

    estimate.set_labels(BinaryLabels(double(indata['classifier_labels'])))
    estimate.set_features(train_feats)
    estimate.train()

    # NOTE(review): eval() resolves the kernel class from a name stored in
    # indata; only safe as long as indata comes from trusted test fixtures.
    kernel_factory = eval(indata[prefix + 'name'] + 'Kernel')
    kernel = kernel_factory(train_feats, train_feats, estimate)

    deviations = {}
    train_gap = abs(indata[prefix + 'matrix_train'] - kernel.get_kernel_matrix())
    deviations['km_train'] = max(train_gap.flat)

    # Switch both the kernel and the estimate over to the test features.
    kernel.init(train_feats, features['test'])
    estimate.set_features(features['test'])

    test_gap = abs(indata[prefix + 'matrix_test'] - kernel.get_kernel_matrix())
    deviations['km_test'] = max(test_gap.flat)

    output_gap = abs(estimate.apply().get_values()
                     - indata['classifier_classified'])
    deviations['classified'] = max(output_gap)

    return util.check_accuracy(indata[prefix + 'accuracy'], **deviations)
Example #33
0
def _evaluate_top_fisher(indata, prefix):
    """Check a PolyKernel built on TOP or Fisher-kernel features.

    Two HMMs are trained on the training word features; copies of them
    observing the test word features serve the test side. Depending on
    ``indata[prefix + 'name']`` the HMM pairs are wrapped in ``TOPFeatures``
    (when the name is ``'TOP'``) or ``FKFeatures`` (otherwise). A
    ``PolyKernel`` with the ``'kernel_'`` arguments is then evaluated on
    train/train and train/test and compared with the reference matrices
    stored under the ``'kernel_'`` prefix.

    :param indata: mapping holding the input data and reference matrices
    :param prefix: key prefix for the feature/HMM entries in ``indata``
    :return: whatever ``util.check_accuracy`` returns
    """
    word_feats = util.get_features(indata, prefix)

    def trained_hmm():
        # Build an HMM on the training observations and run both training steps.
        hmm = HMM(word_feats['train'], indata[prefix + 'N'],
                  indata[prefix + 'M'], indata[prefix + 'pseudo'])
        hmm.train()
        hmm.baum_welch_viterbi_train(BW_NORMAL)
        return hmm

    def observing_test(trained):
        # Copy a trained HMM and point the copy at the test observations.
        clone = HMM(trained)
        clone.set_observations(word_feats['test'])
        return clone

    pos_train = trained_hmm()
    neg_train = trained_hmm()
    pos_test = observing_test(pos_train)
    neg_test = observing_test(neg_train)

    if indata[prefix + 'name'] == 'TOP':
        train_feats = TOPFeatures(10, pos_train, neg_train, False, False)
        test_feats = TOPFeatures(10, pos_test, neg_test, False, False)
    else:
        train_feats = FKFeatures(10, pos_train, neg_train)
        train_feats.set_opt_a(-1)  # estimate prior
        test_feats = FKFeatures(10, pos_test, neg_test)
        test_feats.set_a(train_feats.get_a())  # use prior from training data

    kernel_prefix = 'kernel_'
    kernel_args = util.get_args(indata, kernel_prefix)
    kernel = PolyKernel(train_feats, train_feats, *kernel_args)

    train_gap = abs(indata[kernel_prefix + 'matrix_train']
                    - kernel.get_kernel_matrix())
    train_deviation = max(train_gap.flat)

    kernel.init(train_feats, test_feats)
    test_gap = abs(indata[kernel_prefix + 'matrix_test']
                   - kernel.get_kernel_matrix())
    test_deviation = max(test_gap.flat)

    return util.check_accuracy(indata[kernel_prefix + 'accuracy'],
                               km_train=train_deviation,
                               km_test=test_deviation)
Example #34
0
def _evaluate_pie(indata, prefix):
    """Check a PluginEstimate-based kernel and the estimate's confidences.

    Trains a ``PluginEstimate`` on the training features (binary labels from
    ``indata['classifier_labels']``), builds the kernel class named
    ``indata[prefix + 'name'] + 'Kernel'`` around it, and measures how far the
    train kernel matrix, the test kernel matrix and the confidences from
    applying the estimate to the test features deviate from the reference
    values stored in ``indata``.

    :param indata: mapping holding the input data and reference results
    :param prefix: key prefix used to look entries up in ``indata``
    :return: whatever ``util.check_accuracy`` returns
    """
    estimate = PluginEstimate()
    features = util.get_features(indata, prefix)
    binary_labels = BinaryLabels(double(indata['classifier_labels']))
    estimate.set_labels(binary_labels)
    estimate.set_features(features['train'])
    estimate.train()

    # NOTE(review): eval() resolves the kernel class from a name stored in
    # indata; only safe as long as indata comes from trusted test fixtures.
    kernel_factory = eval(indata[prefix + 'name'] + 'Kernel')
    kernel = kernel_factory(features['train'], features['train'], estimate)

    def kernel_deviation(reference_key):
        # Largest element-wise gap between the reference matrix and the
        # kernel matrix for the currently initialised feature pair.
        gap = abs(indata[reference_key] - kernel.get_kernel_matrix())
        return max(gap.flat)

    train_deviation = kernel_deviation(prefix + 'matrix_train')

    # From here on both the kernel and the estimate work on the test features.
    kernel.init(features['train'], features['test'])
    estimate.set_features(features['test'])
    test_deviation = kernel_deviation(prefix + 'matrix_test')

    confidences = estimate.apply().get_confidences()
    confidence_deviation = max(abs(confidences - indata['classifier_classified']))

    return util.check_accuracy(indata[prefix + 'accuracy'],
                               km_train=train_deviation,
                               km_test=test_deviation,
                               classified=confidence_deviation)
def execute(data,
            learning_rate=0.001,
            training_data_ratio=2.0 / 3,
            max_iterations=1000000):
    """
    Perform Batch Gradient Descent

    The data is shuffled, split into training and test parts, standardized
    with the training mean / std, and given a leading "Bias" column of ones.
    Weights are then updated until the training RMSE stops changing (by more
    than machine epsilon) or the iteration limit is reached.

    :param data: Raw Data frame parsed from CSV
    :param learning_rate: The rate at which to advance along the gradient
    :param training_data_ratio: The percent of given data to use for training (remaining percent is used for testing)
    :param max_iterations: The maximum number of iterations to execute before exiting
    :return: Tuple ``(theta, test_data_rmse)`` - the learned weights (bias weight first) and the
             RMSE of the final weights on the test data
    """

    # 2. Randomizes the data
    print("Randomizing Data")
    randomized_data = util.randomize_data(data)

    # 3. Selects the first 2 / 3 (round up) of the data for training and the remaining for testing
    print("Selecting Training Data")
    training_data, test_data = util.split_data(randomized_data,
                                               training_data_ratio)

    # 4. Standardizes the data (except for the last column of course) based on the training data
    print("Standardizing Data")
    std_training_data, mean, std = util.standardize_data(
        util.get_features(training_data))
    std_training_data.insert(0, "Bias", 1)

    # The test data is standardized with the *training* mean / std
    std_test_data, _, _ = util.standardize_data(util.get_features(test_data),
                                                mean, std)
    std_test_data.insert(0, "Bias", 1)

    iteration = 0
    prior_rmse = 0
    current_rmse = 100  # Doesn't matter what this value is, so long as it doesn't equal prior rmse
    eps = np.spacing(1)
    N = len(std_training_data)

    # Start with randomized values for theta: one weight per column of the
    # standardized training data (bias included), so any number of features works.
    columns = list(std_training_data.columns)
    theta = np.array([random.uniform(-1, 1) for _ in range(len(columns))])

    # Capture our expected values for the training data
    expected = util.get_output(training_data)
    test_data_expected = util.get_output(test_data)

    # Capture the RMSE for test and training over all iterations
    test_rmse_values = []
    training_rmse_values = []

    print("Performing Gradient Descent Linear Regression")
    # 5. While the termination criteria (mentioned above in the implementation details) hasn't been met
    while iteration <= max_iterations and abs(current_rmse -
                                              prior_rmse) >= eps:
        prior_rmse = current_rmse

        #   (a) Compute the RMSE of the training data
        #       By applying the current theta values to the training set & comparing results
        actual = std_training_data.dot(theta)
        current_rmse = util.compute_rmse(expected, actual)

        #   (b) While we can't let the testing set affect our training process, also compute the RMSE of
        #       the testing error at each iteration of the algorithm (it'll be interesting to see).
        #       Same thing as (a), but use test inputs / outputs
        test_data_actual = std_test_data.dot(theta)
        test_data_rmse = util.compute_rmse(test_data_expected,
                                           test_data_actual)

        #   (c) Update each parameter using batch gradient descent
        #       By use of the learning rate. The residual is computed once from the
        #       predictions made *before* any weight changes, so every weight is
        #       updated against the same (batch) error.
        residual = actual - expected
        for i, column in enumerate(columns):
            cumulative_error = (residual * std_training_data[column]).sum()
            theta[i] -= learning_rate / N * cumulative_error

        iteration += 1
        test_rmse_values.append(test_data_rmse)
        training_rmse_values.append(current_rmse)

    print("Completed in {0} iterations".format(iteration))

    print("Plotting Errors")
    image_path = plot_rmse_values(test_rmse_values, training_rmse_values,
                                  learning_rate)
    print("Saved Image to '{0}'".format(image_path))

    # 6. Compute the RMSE of the testing data.
    print("Computing RMSE of Test Data")
    test_data_actual = std_test_data.dot(theta)
    test_data_rmse = util.compute_rmse(test_data_expected, test_data_actual)
    return theta, test_data_rmse
Example #36
0
    hidden = 100
    most_common = 1600
    filename = './data/test.en'

# Module-level settings for the model built below.
# NOTE(review): epsilon_std is not referenced in the lines visible here;
# presumably it is the std-dev of the sampling noise used further down - confirm.
epsilon_std = 1.0
window_size = 5

# Twice the window size (presumably window_size words on each side - TODO confirm).
context_sz = window_size * 2

# Read the training corpus; `filename` and `most_common` are set earlier in the script.
tr_word2idx, tr_idx2word, sent_train = util.read_input(filename,
                                                       most_common=most_common)

# NOTE(review): the test path is hard-coded, and the `filename` assignment visible
# just above is the same './data/test.en' - confirm train and test are meant to
# be read from the same file.
tst_word2idx, tst_idx2word, sent_test = util.read_input('./data/test.en')
# corpus_dim, original_dim and corpus_sz all hold the same value: the number of
# entries in the training word->index mapping.
corpus_dim = len(tr_word2idx)
original_dim = corpus_dim
x_train = util.get_features(sent_train, tr_word2idx, window_size, emb_sz)
corpus_sz = len(tr_word2idx)
# Product of the first two dimensions of x_train; within the visible lines it is
# only used by the commented-out reshape below.
flatten_sz = x_train.shape[0] * x_train.shape[1]
emb_sz_2 = emb_sz * 2

#x_train_hat = np.reshape(x_train, (flatten_sz,emb_sz_2))
#print('shape x_train_hat=', x_train_hat.shape)

# ENCODER
# Each sample carries 2 values (presumably a pair of word indices - TODO confirm).
x = Input(shape=(2, ))

# Embed every input value into emb_sz dimensions, then reshape so the last axis
# has length emb_sz_2 (= 2 * emb_sz).
R = Embedding(input_dim=original_dim, output_dim=emb_sz)(x)
R = Reshape((-1, emb_sz_2))(R)
print('shape R=', R.shape)

# Dense layer with `hidden` units applied to the reshaped embeddings.
M = Dense(hidden)(R)
Example #37
0
def _evaluate(indata):
    """Train the classifier described by ``indata`` and check it against reference results.

    The features are read with a prefix that depends on the classifier
    (``'distance_'`` for KNN, ``'kernel_'`` for kernel classifiers,
    ``'classifier_'`` otherwise). The classifier class is looked up by name,
    constructed with the arguments its ``classifier_type`` requires,
    configured from the optional entries in ``indata``, trained, and its
    results are passed to ``util.check_accuracy``.

    :param indata: mapping holding the classifier description, input data and
        reference results
    :return: ``False`` when the classifier class is unavailable or the
        classifier type is unknown; otherwise whatever
        ``util.check_accuracy`` returns
    """
    prefix = 'classifier_'
    ctype = indata[prefix + 'type']
    if indata[prefix + 'name'] == 'KNN':
        feats = util.get_features(indata, 'distance_')
    elif ctype == 'kernel':
        feats = util.get_features(indata, 'kernel_')
    else:
        feats = util.get_features(indata, prefix)

    machine = _get_machine(indata, prefix, feats)

    # NOTE(review): eval() resolves the classifier class from a name stored in
    # indata; only safe as long as indata comes from trusted test fixtures.
    try:
        fun = eval(indata[prefix + 'name'])
    except NameError:
        print("%s is disabled/unavailable!" % indata[prefix + 'name'])
        return False

    # cannot refactor into function, because labels is unrefed otherwise
    if prefix + 'labels' in indata:
        labels = BinaryLabels(double(indata[prefix + 'labels']))
        if ctype == 'kernel':
            classifier = fun(indata[prefix + 'C'], machine, labels)
        elif ctype == 'linear':
            classifier = fun(indata[prefix + 'C'], feats['train'], labels)
        elif ctype == 'knn':
            classifier = fun(indata[prefix + 'k'], machine, labels)
        elif ctype == 'lda':
            classifier = fun(indata[prefix + 'gamma'], feats['train'], labels)
        elif ctype == 'perceptron':
            classifier = fun(feats['train'], labels)
        elif ctype == 'wdsvmocas':
            classifier = fun(indata[prefix + 'C'], indata[prefix + 'degree'],
                             indata[prefix + 'degree'], feats['train'], labels)
        else:
            return False
    else:
        classifier = fun(indata[prefix + 'C'], machine)

    if classifier.get_name() == 'LibLinear':
        print(classifier.get_name(), "yes")
        classifier.set_liblinear_solver_type(L2R_LR)

    classifier.parallel.set_num_threads(indata[prefix + 'num_threads'])
    if ctype == 'linear':
        # Bias is enabled exactly when a bias entry is present in indata.
        classifier.set_bias_enabled(prefix + 'bias' in indata)
    if ctype == 'perceptron':
        # Call the setters. Assigning to these names (as the code used to do)
        # only replaces the bound methods on this object and never passes the
        # values on to the classifier.
        classifier.set_learn_rate(indata[prefix + 'learn_rate'])
        classifier.set_max_iter(indata[prefix + 'max_iter'])
    if prefix + 'epsilon' in indata:
        # Best effort: not every classifier exposes set_epsilon.
        try:
            classifier.set_epsilon(indata[prefix + 'epsilon'])
        except AttributeError:
            pass
    if prefix + 'max_train_time' in indata:
        classifier.set_max_train_time(indata[prefix + 'max_train_time'])
    if prefix + 'linadd_enabled' in indata:
        classifier.set_linadd_enabled(indata[prefix + 'linadd_enabled'])
    if prefix + 'batch_enabled' in indata:
        classifier.set_batch_computation_enabled(indata[prefix +
                                                        'batch_enabled'])

    classifier.train()

    res = _get_results(indata, prefix, classifier, machine, feats)
    return util.check_accuracy(res['accuracy'],
                               alphas=res['alphas'],
                               bias=res['bias'],
                               sv=res['sv'],
                               classified=res['classified'])
Example #38
0
def _evaluate (indata):
	"""Train the classifier described by ``indata`` and check it against reference results.

	The features are read with a prefix that depends on the classifier
	(``'distance_'`` for KNN, ``'kernel_'`` for kernel classifiers,
	``'classifier_'`` otherwise). The classifier class is looked up by name,
	constructed with the arguments its ``classifier_type`` requires,
	configured from the optional entries in ``indata``, trained, and its
	results are passed to ``util.check_accuracy``.

	:param indata: mapping holding the classifier description, input data and
		reference results
	:return: ``False`` when the classifier class is unavailable or the
		classifier type is unknown; otherwise whatever
		``util.check_accuracy`` returns
	"""
	prefix='classifier_'
	ctype=indata[prefix+'type']
	if indata[prefix+'name']=='KNN':
		feats=util.get_features(indata, 'distance_')
	elif ctype=='kernel':
		feats=util.get_features(indata, 'kernel_')
	else:
		feats=util.get_features(indata, prefix)

	machine=_get_machine(indata, prefix, feats)

	# NOTE(review): eval() resolves the classifier class from a name stored in
	# indata; only safe as long as indata comes from trusted test fixtures.
	try:
		fun=eval(indata[prefix+'name'])
	except NameError:
		print("%s is disabled/unavailable!"%indata[prefix+'name'])
		return False

	# cannot refactor into function, because labels is unrefed otherwise
	if prefix+'labels' in indata:
		labels=BinaryLabels(double(indata[prefix+'labels']))
		if ctype=='kernel':
			classifier=fun(indata[prefix+'C'], machine, labels)
		elif ctype=='linear':
			classifier=fun(indata[prefix+'C'], feats['train'], labels)
		elif ctype=='knn':
			classifier=fun(indata[prefix+'k'], machine, labels)
		elif ctype=='lda':
			classifier=fun(indata[prefix+'gamma'], feats['train'], labels)
		elif ctype=='perceptron':
			classifier=fun(feats['train'], labels)
		elif ctype=='wdsvmocas':
			classifier=fun(indata[prefix+'C'], indata[prefix+'degree'],
				indata[prefix+'degree'], feats['train'], labels)
		else:
			return False
	else:
		classifier=fun(indata[prefix+'C'], machine)

	if classifier.get_name() == 'LibLinear':
		print(classifier.get_name(), "yes")
		classifier.set_liblinear_solver_type(L2R_LR)

	classifier.parallel.set_num_threads(indata[prefix+'num_threads'])
	if ctype=='linear':
		# Bias is enabled exactly when a bias entry is present in indata.
		classifier.set_bias_enabled(prefix+'bias' in indata)
	if ctype=='perceptron':
		# Call the setters. Assigning to these names (as the code used to do)
		# only replaces the bound methods on this object and never passes the
		# values on to the classifier.
		classifier.set_learn_rate(indata[prefix+'learn_rate'])
		classifier.set_max_iter(indata[prefix+'max_iter'])
	if prefix+'epsilon' in indata:
		# Best effort: not every classifier exposes set_epsilon.
		try:
			classifier.set_epsilon(indata[prefix+'epsilon'])
		except AttributeError:
			pass
	if prefix+'max_train_time' in indata:
		classifier.set_max_train_time(indata[prefix+'max_train_time'])
	if prefix+'linadd_enabled' in indata:
		classifier.set_linadd_enabled(indata[prefix+'linadd_enabled'])
	if prefix+'batch_enabled' in indata:
		classifier.set_batch_computation_enabled(indata[prefix+'batch_enabled'])

	classifier.train()

	res=_get_results(indata, prefix, classifier, machine, feats)
	return util.check_accuracy(res['accuracy'],
		alphas=res['alphas'], bias=res['bias'], sv=res['sv'],
		classified=res['classified'])
Example #39
0
    emb_sz=100
    hidden=100
    most_common = 1600
    filename = './data/test.en'

# Module-level settings; epsilon_std is the stddev passed to K.random_normal
# in sampling() below.
epsilon_std = 1.0
window_size=5

# Twice the window size (presumably window_size words on each side - TODO confirm).
context_sz=window_size*2

# Read the training corpus; `filename` and `most_common` are set earlier in the script.
tr_word2idx, tr_idx2word, sent_train, corpus = util.read_input(filename, most_common=most_common)

# NOTE(review): this second call rebinds `corpus`, so the value returned for the
# training file is lost. The path is hard-coded and equals the `filename`
# assignment visible just above - confirm both are intended.
tst_word2idx, tst_idx2word,  sent_test, corpus = util.read_input('./data/test.en')
# corpus_dim, original_dim and corpus_sz all hold the same value: the number of
# entries in the training word->index mapping.
corpus_dim = len(tr_word2idx)
original_dim = corpus_dim
contexts, targets= util.get_features(sent_train, tr_word2idx, window_size, emb_sz)
corpus_sz = len(tr_word2idx)
emb_sz_2 = emb_sz*2


def concat(input):
    """Concatenate the first two entries of ``input`` with ``K.concatenate``."""
    first = input[0]
    second = input[1]
    return K.concatenate([first, second])


def sampling(args):
    """Draw the Gaussian noise used by the reparametrization trick.

    :param args: pair ``(z_mean, z_log_var)``
    """
    # Reparametrization trick
    z_mean, z_log_var = args
    # Debug output: each label is now paired with the shape of its own tensor
    # (the z_mean label used to be printed with z_log_var's shape).
    print('shape z_mean sampling=', z_mean.shape, 'shape z_log_var=', z_log_var.shape)
    # One noise row per row of z_mean, emb_sz values each.
    epsilon = K.random_normal(shape=(K.shape(z_mean)[0], emb_sz), mean=0.,
                              stddev=epsilon_std)
    # NOTE(review): as visible here the function ends without using `epsilon` or
    # returning anything - the remainder appears to be cut off; confirm against
    # the full source before relying on the return value.
Example #40
0
File: test.py Project: aascode/MEx
import mex
from keras.utils import np_utils
import util

# NOTE(review): machine-specific absolute path; it is handed to mex.read_all() below.
path = '/Users/anjanawijekoon/MEx_wtpm/'

# read all data
all_data = mex.read_all(path)
# extract windows from all three sensors
all_features = mex.extract_features(all_data)

# get features by sensor index
acw_features = util.get_features(all_features, 0)
act_features = util.get_features(all_features, 1)
pm_features = util.get_features(all_features, 2)

# get all people ids
# Materialized as a list because the ids are indexed by position further down
# (all_people[i]); a Python 3 dict view does not support indexing.
all_people = list(all_features.keys())

# pm
# to make sure all windows have same length
padded_pm_features = util.pad_features(pm_features)
# to reduce the frame rate to mex.frames_per_second rate
reduced_pm_features = util.frame_reduce(padded_pm_features)

for i in range(len(all_people)):
    test_persons = [all_people[i]]
    pm_train_features, pm_test_features = util.train_test_split(
        reduced_pm_features, test_persons)

    pm_train_features, pm_train_labels = util.flatten(pm_train_features)