Example #1
def main(argv):
    # wandb.init(project="WME-Nyst and CUR")
    # wandb.config.update(flags.FLAGS)
    # logging.info('Running with args %s', str(argv))

    # get dataset
    dataset = FLAGS.dataset
    if dataset == "ohsumed":
        filename = "oshumed_K_set1.mat"
        if FLAGS.run_mode == "test":
            test_filename = "oshumed_K_set1.mat"
        version = "v7.3"
    if dataset == "twitter":
        filename = "twitter_K_set1.mat"
        if FLAGS.run_mode == "test":
            test_filename = "twitter_K_set1.mat"
        version = "default"
    if dataset == "news":
        filename = "20ng2_new_K_set1.mat"
        if FLAGS.run_mode == "test":
            test_filename = "20ng2_new_K_set1.mat"
        version = "v7.3"
    if dataset == "recipe":
        filename = "recipe_trainData.mat"
        if FLAGS.run_mode == "test":
            test_filename = "recipe_K_set1"
        version = "v7.3"

    approximator = FLAGS.method
    if approximator not in ["nystrom", "CUR"]:
        print("please choose between nystrom and CUR for approximator")
        return None

    # get EMD matrix
    # similarity_matrix, labels = read_mat_file(\
    #                                 file_="/mnt/nfs/work1/elm/ray/"+filename,\
    #                                 version=version, return_type="all")
    similarity_matrix, labels = read_mat_file(
        file_="./WordMoversEmbeddings/mat_files/" + filename,
        version=version, return_type="all")

    # set hyperparameters
    config = {"samples":FLAGS.sample_size,\
              "CV":10, \
              "gamma": FLAGS.gamma,\
              "lambda_inverse":FLAGS.lambda_inverse,\
              "approximator":approximator,
              "run_mode":FLAGS.run_mode}

    if config["run_mode"] == "test":
        test_sim_mat, test_labels = read_mat_file(
            file_="./WordMoversEmbeddings/mat_files/" + test_filename,
            version=version, return_type="all", mode="test")

    if config["run_mode"] == "validate":
        train_all(similarity_matrix, labels, config)
    else:
        train_all(similarity_matrix, labels, config,
                  X_test=test_sim_mat, Y_test=test_labels)
    return None
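Neither read_mat_file nor train_all is shown in this example. As a rough guide to what the version flag above distinguishes, here is a minimal loader sketch, assuming "v7.3" marks HDF5-based MATLAB files (readable with h5py) while the default marks older formats (readable with scipy.io.loadmat); the variable key name "K" is a placeholder, not taken from the original code.

import h5py
import numpy as np
import scipy.io

def load_mat_matrix(path, version="default", key="K"):
    # MATLAB -v7.3 files are HDF5 containers and cannot be read by loadmat;
    # note that h5py returns the array transposed relative to MATLAB's layout.
    if version == "v7.3":
        with h5py.File(path, "r") as f:
            return np.array(f[key])
    return np.asarray(scipy.io.loadmat(path)[key])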
Example #2
def create_spectrogram_images(mat_file_path):
    data, properties = utils.read_mat_file(mat_file_path)

    # one spectrogram per column of `data` (each column is treated as a channel)
    spectrograms = []
    for data_channel in data.T:
        spectrograms.append(utils.convert_to_spectrogram(data_channel))

    return np.array(spectrograms)
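utils.convert_to_spectrogram is not shown above. A minimal stand-in, assuming it maps one channel's time series to a log-magnitude spectrogram via scipy.signal.spectrogram; the sampling rate and window parameters are illustrative only, not taken from the original code.

import numpy as np
from scipy.signal import spectrogram

def convert_to_spectrogram(channel, fs=400, nperseg=256, noverlap=128):
    # power spectrogram of a single channel: frequencies x time bins
    _, _, sxx = spectrogram(channel, fs=fs, nperseg=nperseg, noverlap=noverlap)
    # log scale compresses the dynamic range; the epsilon avoids log(0)
    return np.log(sxx + 1e-10)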
Example #3
def get_features(mat_file):

    d, _ = ut.read_mat_file(mat_file)

    # 5-second windows: 600 s / 5 s = 120 windows
    window_size = d.shape[0] // 120

    for i in range(0, d.shape[0], window_size):
        # transpose so that each row of wd is one channel within the window
        wd = d[i:i + window_size, :].T

        # list(...) around map keeps this correct on Python 3, where map is lazy
        bf_feats = np.array(list(map(get_butter_features, wd)))
        ff_feats = np.array(list(map(get_fft_features, wd)))
        xcorr_feats = get_corr_features(wd)

        yield np.append(np.append(bf_feats, ff_feats), xcorr_feats)
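A usage sketch for the generator above, stacking the per-window feature vectors into a single design matrix; the file path is a placeholder and the feature helpers are assumed to be defined elsewhere.

import numpy as np

# each yielded vector has the same length, so the windows stack into a 2-D array
feature_matrix = np.vstack(list(get_features("example_segment_0001.mat")))
print(feature_matrix.shape)  # (num_windows, features_per_window)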
Example #4
distorts = [0, 10, 20, 30, 40]
evaluator = StructuredAccuracy()

for distort in distorts:
	print 'data with ' + str(distort) + '% of total labels distorted'

	### prepare training, test data and evaluator

	train_data_file = 'hmsvm_%d_distort_data_fold' % distort
	train_num_examples_fold = 20
	train_num_folds = 5
	train_labels, train_features = utils.unfold_data(train_data_file)

	test_data_file = 'hmsvm_%d_distort_data_test' % distort
	test_num_examples = 100
	test_labels, test_features = utils.read_mat_file(test_data_file, test_num_examples)

	### train ML-HMM and evaluate in training data

	model = HMSVMModel(train_features, train_labels, SMT_TWO_STATE)
	model.set_use_plifs(True)
	mlhmm = MLHMM(model)
	mlhmm.train()

	prediction = mlhmm.apply()
	accuracy = evaluator.evaluate(prediction, train_labels)
	print '\ttraining accuracy:\t' + str(accuracy*100) + '%'
	utils.print_statistics(train_labels, prediction)

	### evaluate in test data

Example #5
def main(variables):
    data = read_mat_file(variables["data_path"])
    f1_data, f2_data = np.array(data['F1']), np.array(data['F2'])
    n_samples = f1_data.shape[0]
    ground_truth = np.array([[0, 1, 2, 3, 4] for _ in range(n_samples)])
    print("About the data")
    print("Source of data: ", variables["data_path"])
    print("Classes of data: 0,1,2,3,4")
    print("No. of samples: ", n_samples, "\n")

    #Training on 100 samples

    # m_std_train is a dictionary of the mean and std of f1 and f2 for each column (c1-c5)
    print("\n---------- Section 1: Training -------------")
    print("\n Calculating the means and standard deviations for 100 samples\n")
    train_size = variables['training_size']
    b1 = Bayes_Classifier(f1_data, train_size)
    m_std_train = b1.train()

    ## Section 2.1: Testing
    print("\n---------- Section 2.1: Testing -------------")
    print("\n Predicting the classes for 101: 1000 samples")

    predicted = b1.predict()

    ## Section 2.2: Calculating accuracy and error rate
    print(
        "\n---------- Section 2.2: Calculating accuracy for the classifier -------------"
    )
    print("\nAccuracy for the Bayes classifier: ")
    acc = b1.validate(predicted)

    ## Section 3: Standard Normal (z score)
    print("---------- Section 3: Standard normal(Z Score) -------------")

    # z1_data is the standard normalized data.
    z1_data = np.swapaxes(
        np.array([std_normalize(f1_data[:, i],
                                m_std_train['f1'][i]['m'],
                                m_std_train['f1'][i]['std'])
                  for i in range(5)]), 0, 1)
    print("Plot of Z1 vs F2")
    plot_clustered_graph(z1_data.flatten(),
                         f2_data.flatten(),
                         ground_truth.flatten(),
                         name="z1vsf2.png",
                         labels=['z1', 'f2'])

    # raw F1 vs F2, for comparison with the normalized plot above
    print("\n Plot of F1 vs F2")
    plot_clustered_graph(f1_data.flatten(),
                         f2_data.flatten(),
                         ground_truth.flatten(),
                         name="f1vsf2.png",
                         labels=['f1', 'f2'])

    ## Section 4
    ### Case 2: Training with the z1 data
    print(
        "\n---------- Section 4, Case 2: Training with the z1 data -------------"
    )
    b = Bayes_Classifier(z1_data)
    b.train()
    predicted = b.predict()
    acc = b.validate(predicted)

    print(
        "\n---------- Section 4, Case 3: Training with the f2 data -------------"
    )
    b = Bayes_Classifier(f2_data)
    b.train()
    predicted = b.predict()
    acc = b.validate(predicted)

    print(
        "\n---------- Section 4, Case 4: Training with the [z1, f2] data -------------"
    )
    data = {'z1': z1_data, 'f2': f2_data}
    b = Multivariate_Bayes_Classifier(data)
    b.train()
    predicted = b.predict()
    acc = b.validate(predicted)
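std_normalize is used above but not defined in this snippet; a minimal sketch matching the call sites, which pass one feature column together with its training mean and standard deviation.

def std_normalize(column, mean, std):
    # z-score: subtract the training mean and divide by the training std
    return (column - mean) / std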
Example #6
K = 5
# number of examples per fold
num_fold_examples = 20
# length of each example
example_len = 250
# number of features per example
num_features = 10
# the number of different label values
num_states = 2
# K models that will contain the data of each fold
models = []

# load each data fold in a HMSVMModel
data_file = 'hmsvm_30_distort_data_fold'
for k in xrange(K):
	labels, features = utils.read_mat_file('%s_%d' % (data_file, k), num_fold_examples)
	models.append(HMSVMModel(features, labels, SMT_TWO_STATE))

# put together folds, leaving out one of them for each set
labels_no_fold   = []
features_no_fold = []
# for each fold
for k1 in xrange(K):

	# put all labels/features together except the ones of the current fold
	labels_no_kfold = SequenceLabels(num_fold_examples*(K-1), num_states)
	features_no_kfold = RealMatrixFeatures(num_fold_examples*(K-1), num_features)
	# index for the next feature vector to set
	idx = 0

	for k2 in xrange(K):
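The snippet is cut off inside the leave-one-fold-out loop. The index bookkeeping it performs can be illustrated without the Shogun objects; the sketch below only shows which examples land in each training split, not the actual SequenceLabels/RealMatrixFeatures copying.

K = 5
num_fold_examples = 20

for k1 in range(K):
    # indices of every example that is NOT in held-out fold k1
    train_idx = [k2 * num_fold_examples + i
                 for k2 in range(K) if k2 != k1
                 for i in range(num_fold_examples)]
    assert len(train_idx) == num_fold_examples * (K - 1)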
Example #7
def compute_min_eig(Z):
    # despite the name, this returns the full eigenvalue spectrum of Z;
    # callers can take the minimum themselves
    min_eigs = np.linalg.eigvals(Z)
    return min_eigs

# read mat
filetype = None
dataset = sys.argv[1]
if dataset == "PSD":
    feats = np.random.random((1000,1000))
    similarity_matrix = feats @ feats.T
    filetype = "numpy"
if dataset == "mrpc" or dataset == "rte" or dataset == "stsb":
    filename = "../GYPSUM/"+dataset+"_predicts_0.npy"
    filetype = "python"
if dataset == "twitter":
    similarity_matrix = read_mat_file(file_="./WordMoversEmbeddings/mat_files/twitter_K_set1.mat")
if filetype == "python":
    similarity_matrix = read_file(filename)


sample_size = int(sys.argv[2])
runs = 50
n_bins = 50

# check for similar rows or columns
if dataset != "PSD":
    unique_rows, indices = np.unique(similarity_matrix, axis=0, return_index=True)
    similarity_matrix_O = similarity_matrix[indices][:, indices]
    # symmetrization
    similarity_matrix = (similarity_matrix_O + similarity_matrix_O.T) / 2.0
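A quick check one might append here (illustrative, not part of the original script): after deduplication and symmetrization the spectrum is real, and the most negative eigenvalue shows how far the similarity matrix is from being positive semidefinite.

eigs = np.real(compute_min_eig(similarity_matrix))
print("smallest eigenvalue:", eigs.min())
print("matrix is PSD" if eigs.min() >= -1e-8 else "matrix is not PSD")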
Example #8
    if return_type == "error":
        return np.linalg.norm(\
                similarity_matrix - \
                KS @ np.linalg.pinv(A) @ KS.T)\
                / np.linalg.norm(similarity_matrix), min_eig


##########################################################################################
step = 50
runs_ = 3
"""
20ng2_new_K_set1.mat  oshumed_K_set1.mat  recipe_K_set1.mat  recipe_trainData.mat  twitter_K_set1.mat  twitter_set1.mat
"""
# filename = "stsb"
id_count = 500  #len(similarity_matrix) #1000
similarity_matrix = read_mat_file(file_="WordMoversEmbeddings/mat_files/recipe_trainData.mat",\
    version="v7.3")
# similarity_matrix = read_file("../GYPSUM/"+filename+"_predicts_0.npy")
# check for similar rows or columns
unique_rows, indices = np.unique(similarity_matrix, axis=0, return_index=True)
similarity_matrix_O = similarity_matrix[indices][:, indices]
# symmetrization
similarity_matrix = (similarity_matrix_O + similarity_matrix_O.T) / 2.0

multipliers = list(np.arange(1.0, 2.3, 0.5))

list_of_list_of_errors = []
list_of_min_eig_scaling = []

z_range = [1, 2, 5, 10]

# eps=1e-16
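The error expression returned earlier in this example is the relative Nystrom approximation error. A self-contained sketch with uniform column sampling (the sampling scheme used elsewhere in this script may differ):

import numpy as np

def nystrom_relative_error(K, sample_size, seed=0):
    rng = np.random.default_rng(seed)
    S = rng.choice(K.shape[0], size=sample_size, replace=False)
    KS = K[:, S]             # n x s block of sampled columns
    A = K[np.ix_(S, S)]      # s x s block indexed by the sample
    approx = KS @ np.linalg.pinv(A) @ KS.T
    return np.linalg.norm(K - approx) / np.linalg.norm(K)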
Example #9
from mlhmm import MLHMM
from itertools import product
from modshogun import StructuredAccuracy, HMSVMModel, SMT_TWO_STATE

### prepare training, test data and evaluator

distort = 40

train_data_file = 'hmsvm_%d_distort_data_fold' % distort
train_num_examples_fold = 20
train_num_folds = 5
train_labels, train_features = utils.unfold_data(train_data_file)

test_data_file = 'hmsvm_%d_distort_data_test' % distort
test_num_examples = 100
test_labels, test_features = utils.read_mat_file(test_data_file,
                                                 test_num_examples)

evaluator = StructuredAccuracy()

### train ML-HMM and evaluate in training data

print 'training ML-HMM'
model = HMSVMModel(train_features, train_labels, SMT_TWO_STATE)
model.set_use_plifs(True)
mlhmm = MLHMM(model)
mlhmm.train()
'''
print '\n\tmodel parameters:'
print '\t- transition scores: ' + str(numpy.exp(mlhmm.transition_scores))
print '\t- feature scores:'
for s,f in product(xrange(mlhmm.num_free_states), xrange(mlhmm.num_features)):
Example #10
# number of features per example
num_features = 10
# the number of different label values
num_states = 2

distorts = [0, 10, 20, 30, 40]

for distort in distorts:

    # K models that will contain the data of each fold
    models = []
    print '>>>> data with ' + str(distort) + '% of total labels distorted'
    data_file = 'hmsvm_%d_distort_data_fold' % distort
    for k in xrange(K):
        fold_data_file = '%s_%d' % (data_file, k)
        labels, features = utils.read_mat_file(fold_data_file,
                                               num_fold_examples)
        models.append(HMSVMModel(features, labels, SMT_TWO_STATE))

    # check
    if checks:
        print 'running checks on simulated data'
        for k in xrange(K):
            labels = models[k].get_labels()
            features = RealMatrixFeatures.obtain_from_generic(
                models[k].get_features())

            print '\tmodel %d with %d labels and %d features' % (
                k, labels.get_num_labels(), features.get_num_vectors())
            assert (labels.get_num_labels() == features.get_num_vectors())

            for i in xrange(labels.get_num_labels()):
Example #11
# number of features per example
num_features = 10
# the number of different label values
num_states = 2

distorts = [0, 10, 20, 30, 40]

for distort in distorts:

	# K models that will contain the data of each fold
	models = []
	print '>>>> data with ' + str(distort) + '% of total labels distorted'
	data_file = 'hmsvm_%d_distort_data_fold' % distort
	for k in xrange(K):
		fold_data_file = '%s_%d' % (data_file, k)
		labels, features = utils.read_mat_file(fold_data_file, num_fold_examples)
		models.append(HMSVMModel(features, labels, SMT_TWO_STATE))

	# check
	if checks:
		print 'running checks on simulated data'
		for k in xrange(K):
			labels = models[k].get_labels()
			features = RealMatrixFeatures.obtain_from_generic(models[k].get_features())

			print '\tmodel %d with %d labels and %d features' % (k, labels.get_num_labels(),
				features.get_num_vectors())
			assert(labels.get_num_labels() == features.get_num_vectors())

			for i in xrange(labels.get_num_labels()):
				label = Sequence.obtain_from_generic(labels.get_label(i))
Example #12
step = 50
runs_ = 3
"""
20ng2_new_K_set1.mat  oshumed_K_set1.mat  recipe_K_set1.mat  recipe_trainData.mat  twitter_K_set1.mat  twitter_set1.mat
"""
filetype = None
dataset = sys.argv[1]
if dataset == "PSD":
    feats = np.random.random((1000,1000))
    similarity_matrix = feats @ feats.T
    filetype = "numpy"
if dataset == "mrpc" or dataset == "rte" or dataset == "stsb":
    filename = "../GYPSUM/"+dataset+"_predicts_0.npy"
    filetype = "python"
if dataset == "twitter":
    similarity_matrix = read_mat_file(file_="./WordMoversEmbeddings/mat_files/twitter_K_set1.mat")
if dataset == "ohsumed":
    similarity_matrix = read_mat_file(file_="./WordMoversEmbeddings/mat_files/oshumed_K_set1.mat", version="v7.3")
if dataset == "recipe":
    similarity_matrix = read_mat_file(file_="/mnt/nfs/work1/elm/ray/recipe_trainData.mat", version="v7.3")
if dataset == "news":
    similarity_matrix = read_mat_file(file_="/mnt/nfs/work1/elm/ray/20ng2_new_K_set1.mat", version="v7.3")
if filetype == "python":
    similarity_matrix = read_file(filename)
# similarity_matrix = read_file("../GYPSUM/"+filename+"_predicts_0.npy")

true_error = []
KS_corrected_error_list = []
KS_ncorrected_error_list = []

scaling_error_list = []