if not standalone:
    spark_context = SparkContext(appName="GMM-MLE-example")

num_classes = 50
# Generate a synthetic dataset made of multivariate normal classes.
#X_train, Y_train, X_test, Y_test = generate_datasets.generate_multivariate_normals(5, 2, 150, 50, 5.0, 2.0)
X_train, Y_train, X_test, Y_test = machine_learning.generate_datasets.generate_multivariate_normals(
    num_classes, 7, 25000, 5000, 15.0, 12.0)
#X_train, Y_train = load_samples(dataset_filename)

#os.makedirs(base_dir + '/log', exist_ok=True)
#os.makedirs(base_dir + '/models', exist_ok=True)

mle = machine_learning.MLE(covar_type=covar_type,
                           dim=X_train.shape[1],
                           log_dir=base_dir + '/log',
                           models_dir=base_dir + '/models',
                           batch_size=500)

if spark_context is not None:
    # Distribute the training samples and fit the mixture with Spark.
    samples = spark_context.parallelize(X_train, slices)
    samples.persist()
    mle.fit_with_spark(spark_context=spark_context,
                       samples=samples,
                       max_components=max_components)
    samples.unpersist()
    spark_context.stop()
else:
    # Fit the mixture locally, without Spark.
    mle.fit_standalone(samples=X_train, max_components=max_components)
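# Illustrative follow-up (a sketch, not part of the original script): once fitting has
# finished, a stored mixture can be loaded back and used to score samples. This assumes
# the 'gmm-%04d.txt' file naming used in the meteo example further down and the
# GMM.load_from_text / posteriors API shown there; the chosen number of components
# (here 10) is hypothetical.
gmm = machine_learning.GMM()
gmm.load_from_text(filename=base_dir + '/models/gmm-%04d.txt' % 10)
posteriors, logL = gmm.posteriors(X_test[0])
print("log-likelihood of the first test sample: %f" % logL)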
print getCurrentDateTimeString() + " - we are working with " + str(samples.count()) + " blocks of approximately " + str(clust_batch_size) + " samples"

# Show the shape of one element in the temporary RDD of blocks of samples
print getCurrentDateTimeString() + " - " + str(samples.first().shape)

# Get the dimensionality of the samples, needed to create the MLE object
dim_x = samples.first().shape[1]

# Create the models and logs directories
createDirectoryIfNotExists(absoluteclusteringFullLogDir)
createDirectoryIfNotExists(absoluteClusteringFullModelsDirName)

# Delete data from previous executions
deleteDirectoryData(absoluteclusteringFullLogDir)
deleteDirectoryData(absoluteClusteringFullModelsDirName)

# Create the MLE object
mle = machine_learning.MLE(covar_type='full',
                           dim=dim_x,
                           log_dir=absoluteclusteringFullLogDir,
                           models_dir=absoluteClusteringFullModelsDirName)

# Fit the clusters
mle.fit_with_spark(spark_context=spark_context,
                   samples=samples,
                   max_components=clustFullMaxComponents)

samples.unpersist()
spark_context.stop()

# Get the current time to monitor the execution time
executionEndTime = time.time()
if verbose:
    print getExecutionTimeMsg(executionStartTime, executionEndTime)
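# The helpers getCurrentDateTimeString() and getExecutionTimeMsg() are used above but not
# defined in these snippets. A minimal sketch of the behaviour they are assumed to have
# (the real utilities may format things differently):
import datetime

def getCurrentDateTimeString():
    # Timestamp used as a prefix for log messages.
    return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

def getExecutionTimeMsg(start_time, end_time):
    # Human-readable message with the elapsed wall-clock time in seconds.
    return "Execution took %.1f seconds" % (end_time - start_time)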
    # Show one element of the temporary RDD of blocks of samples
    print(samples.first())
    print(type(samples.first()))
    samples.persist()
    print("we are working with %d blocks of approximately %d samples" % (samples.count(), batch_size))

    # Show the shape of one element in the temporary RDD of blocks of samples
    print(samples.first().shape)

    # Get the dimensionality of the samples, needed to create the MLE object
    dim_x = samples.first().shape[1]

    mle = machine_learning.MLE(covar_type=covar_type,
                               dim=dim_x,
                               log_dir=base_dir + '/log',
                               models_dir=base_dir + '/models')
    mle.fit_with_spark(spark_context=spark_context,
                       samples=samples,
                       max_components=max_components)
    samples.unpersist()
    spark_context.stop()
else:
    # Standalone mode: load the samples from disk and detect their dimensionality.
    X_train, Y_train = load_samples(dataset_filename)
    dim_x = 0
    if type(X_train) == list:
        dim_x = X_train[0].shape[1]
    elif type(X_train) == numpy.ndarray:
        dim_x = X_train.shape[1]
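    # A plausible continuation of this standalone branch (a sketch based on the example at
    # the top of this section, not shown in the original fragment): create the MLE object
    # with the detected dimensionality and fit the mixture without Spark.
    mle = machine_learning.MLE(covar_type=covar_type,
                               dim=dim_x,
                               log_dir=base_dir + '/log',
                               models_dir=base_dir + '/models')
    mle.fit_standalone(samples=X_train, max_components=max_components)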
# Create the models and logs directories
createDirectoryIfNotExists(auxLogDirName)
createDirectoryIfNotExists(auxModelsDirName)

# Delete data from previous executions
deleteDirectoryData(auxLogDirName)
deleteDirectoryData(auxModelsDirName)

if verbose:
    print getCurrentDateTimeString() + " - Working with " + covarType + " covariance matrix type"

# Create the MLE object
mle = machine_learning.MLE(covar_type=covarType,
                           dim=dim_x,
                           log_dir=auxLogDirName,
                           models_dir=auxModelsDirName)

try:
    # Fit the clusters
    mle.fit_with_spark(spark_context=spark_context,
                       samples=samples,
                       max_components=reclust_max_components)
except Exception:
    print getCurrentDateTimeString() + " - An exception has been thrown"

samples.unpersist()
spark_context.stop()
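# createDirectoryIfNotExists() and deleteDirectoryData() are small filesystem helpers that
# are not defined in these snippets. A minimal sketch of the behaviour they are assumed to
# have, for reference:
import os
import shutil

def createDirectoryIfNotExists(path):
    # Create the directory (and any missing parents) if it does not exist yet.
    if not os.path.isdir(path):
        os.makedirs(path)

def deleteDirectoryData(path):
    # Remove everything inside the directory, keeping the directory itself.
    for name in os.listdir(path):
        entry = os.path.join(path, name)
        if os.path.isdir(entry):
            shutil.rmtree(entry)
        else:
            os.remove(entry)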
print getCurrentDateTimeString() + " - we are working with " + str(samples.count()) + " blocks of approximately " + str(clust_batch_size) + " samples"

# Show the shape of one element in the temporary RDD of blocks of samples
print getCurrentDateTimeString() + " - " + str(samples.first().shape)

# Get the dimensionality of the samples, needed to create the MLE object
dim_x = samples.first().shape[1]

# Create the models and logs directories
createDirectoryIfNotExists(absoluteclusteringLogDir)
createDirectoryIfNotExists(absoluteClusteringModelsDirName)

# Delete data from previous executions
deleteDirectoryData(absoluteclusteringLogDir)
deleteDirectoryData(absoluteClusteringModelsDirName)

# Create the MLE object
mle = machine_learning.MLE(covar_type=clust_covar_type,
                           dim=dim_x,
                           log_dir=absoluteclusteringLogDir,
                           models_dir=absoluteClusteringModelsDirName)

# Fit the clusters
mle.fit_with_spark(spark_context=spark_context,
                   samples=samples,
                   max_components=clust_max_components)

samples.unpersist()
spark_context.stop()

# Get the current time to monitor the execution time
executionEndTime = time.time()
if verbose:
    print getExecutionTimeMsg(executionStartTime, executionEndTime)
X = pf.transform(X)

# 80/20 split between training and test data
N = int(0.8 * len(X))
X_train = X[:N]
Y_train = Y[:N]
X_test = X[N:]
Y_test = Y[N:]

use_gmm = True
if use_gmm:
    K = 74
    if K is None:
        # No model selected yet: fit a diagonal-covariance mixture from scratch.
        mle = machine_learning.MLE(covar_type='diagonal',
                                   dim=X.shape[1],
                                   log_dir='meteo.1/log',
                                   models_dir='meteo.1/models')
        mle.fit_standalone(samples=X_train, max_components=250, batch_size=10)
    else:
        # Load the previously fitted mixture with K components.
        gmm = machine_learning.GMM()
        gmm.load_from_text(filename='meteo.1/models/gmm-%04d.txt' % K)
        # Estimate a target mean per mixture component as the posterior-weighted
        # average of the training targets.
        mean_per_class = numpy.zeros([K, Y_train.shape[1]])
        denominator = numpy.zeros(K)
        for t in range(len(X_train)):
            posteriors, logL = gmm.posteriors(X_train[t])
            mean_per_class += numpy.outer(posteriors, Y_train[t])
            denominator += posteriors
        mean_per_class /= denominator.reshape(-1, 1)
        y_predict = numpy.zeros([len(Y_test), Y_test.shape[1]])
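        # A sketch of how y_predict could then be filled in (this completion is not part of
        # the original fragment): each test sample is mapped to the posterior-weighted
        # average of the per-component target means estimated above.
        for t in range(len(X_test)):
            posteriors, logL = gmm.posteriors(X_test[t])
            y_predict[t] = posteriors.dot(mean_per_class)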