def prepare_data(args):
    '''
    You need to modify this dir path.
    input_dir denotes the original data, which may not be divided yet.
    We recommend that you write your own preprocess_data; its goal is to
    divide the original data into train and val sets.
    The resulting directory tree may look like the following:
        data/
            data_dir/
                train/
                    class1/
                    class2/
                    ...
                    classn/
                val/
                    class1/
                    class2/
                    ...
                    classn/
    '''
    input_dir = args.input_dir
    output_dir = args.output_dir
    ratio = args.train_test_ratio
    preprocess_data(input_dir, output_dir, ratio)
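# The docstring above expects a preprocess_data(input_dir, output_dir, ratio)
# that splits a flat per-class folder layout into train/ and val/.  A minimal
# sketch follows, assuming input_dir holds one sub-folder per class and
# `ratio` is the fraction of files kept for training; this is illustrative
# only, not the project's actual implementation.
import os
import random
import shutil


def preprocess_data(input_dir, output_dir, ratio):
    for class_name in os.listdir(input_dir):
        class_dir = os.path.join(input_dir, class_name)
        if not os.path.isdir(class_dir):
            continue
        files = os.listdir(class_dir)
        random.shuffle(files)
        n_train = int(len(files) * ratio)
        for split, split_files in (('train', files[:n_train]),
                                   ('val', files[n_train:])):
            split_dir = os.path.join(output_dir, split, class_name)
            os.makedirs(split_dir, exist_ok=True)
            for f in split_files:
                shutil.copy(os.path.join(class_dir, f),
                            os.path.join(split_dir, f))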
def get_data(f):
    file_name = f.split("/")[-1].split("\\")[-1].split('.')[0]
    dir = f.split(file_name)[0]
    input_color = get_color_array(f)
    height = input_color.shape[0]
    width = input_color.shape[1]
    input_shape = (height, width, 7)
    features_path = dir + file_name + '_features.txt'
    input_features = get_features_array(input_shape, features_path)
    grad = get_grad_array(dir, file_name)

    # Preprocess each colour channel together with its x/y gradient channels
    # (grad channels 0-2 are x gradients, 3-5 are y gradients for r, g, b).
    r = preprocess_data(
        input_color[:, :, :1], input_features,
        np.concatenate((grad[:, :, :1], grad[:, :, 3:4]), axis=-1),
        input_shape)
    g = preprocess_data(
        input_color[:, :, 1:2], input_features,
        np.concatenate((grad[:, :, 1:2], grad[:, :, 4:5]), axis=-1),
        input_shape)
    b = preprocess_data(
        input_color[:, :, 2:3], input_features,
        np.concatenate((grad[:, :, 2:3], grad[:, :, 5:6]), axis=-1),
        input_shape)
    return r, g, b
def get_testdata():
    test_data_list = []
    dir_list = []
    colorfiles = glob(os.path.join(data_dir, '*.exr'))
    for f in colorfiles:
        scene = f.split("/")[-1].split("\\")[-1].split('.')[0]
        print(scene)
        input_color = get_color_array(f)
        height = input_color.shape[0]
        width = input_color.shape[1]
        input_shape = (height, width, 7)
        features_path = data_dir + scene + '_features.txt'
        input_features = get_features_array(input_shape, features_path)
        grad = get_grad_array(data_dir, scene)

        # Preprocess the data.
        data = preprocess_data(input_color, input_features, grad, input_shape)

        outputDir = 'data/test/res/%s/' % scene
        if not os.path.exists(outputDir):
            os.makedirs(outputDir)
        writeEXR(input_color, outputDir + 'origin.exr')
        writeEXR(input_features[:, :, 0:3], outputDir + 'normal.exr')
        writeEXR(input_features[:, :, 4:7], outputDir + 'albedo.exr')
        writeEXR(grad[:, :, 0:3], outputDir + 'gx.exr')
        writeEXR(grad[:, :, 3:6], outputDir + 'gy.exr')

        test_data_list.append(data)
        # test_grad_list.append(grad)
        dir_list.append(outputDir)
    return test_data_list, dir_list
def run(input_file, test_file, k):
    clf = RandomForestClassifier(n_estimators=k)
    df = preprocess_data(input_file)
    X, label_dict, feature_dict = extract_features(df)
    r, c = X.shape
    dft = preprocess_testdata(test_file)
    Xt, yt = extract_testfeatures(dft, label_dict, feature_dict)
    # The last column of X holds the label; the remaining columns are features.
    clf.fit(X[:, 0:c - 1], X[:, c - 1])
    z = clf.predict(Xt)
    print(accuracy_score(yt, z))
def test_tokenizer():
    import util
    f = open("nltk_tokens.txt", "w+", encoding="utf-8")
    polluter_tweets = "content_polluters_tweets.txt"
    tweets = util.load_data(fn=polluter_tweets, delimiter="\t", usecols=2)
    sentences = util.preprocess_data(tweets)
    dicti = {}
    for tokens in sentences:
        for token in tokens:
            if token not in dicti:
                dicti[token] = True
    # Write each unique token on its own line.
    for key in dicti.keys():
        f.write(key + "\n")
    f.close()
def main(country_code_file_path, twitter_data_path):
    """
    :param country_code_file_path: the path of the country_code file
    :param twitter_data_path: the path of the twitter data
    """
    program_start = time.time()

    # Initialize communicator.
    comm = MPI.COMM_WORLD
    comm_rank = comm.Get_rank()
    comm_size = comm.Get_size()

    language_code_dict = None
    # Read country_code info in the master process; it is only needed on the
    # root rank when dumping the final summary.
    if comm_rank == 0:
        dump_num_processor(comm_size)
        language_code_dict = read_language_code_dict(country_code_file_path)

    # Counters for hash tags and language codes.
    hash_tag_count = Counter()
    language_code_count = Counter()

    # **********************************************************************
    # Algorithm 1: parallel twitter file reading & processing
    # **********************************************************************
    # Calculate the number of lines to be processed and the start/end line
    # for this rank.
    n_lines = comm.bcast(read_n_lines(twitter_data_path), root=0)
    lines_per_core = n_lines // comm_size
    lines_to_end = n_lines + 1  # ignore first line
    # Index of the first line processed by this rank (first line ignored).
    line_to_start = 1 + lines_per_core * comm_rank
    # Index one past the last line processed by this rank.
    line_to_end = line_to_start + lines_per_core
    if comm_rank == comm_size - 1:
        # The last rank finishes all remaining lines.
        line_to_end = lines_to_end

    # Process lines in the range line_to_start <= line_number < line_to_end.
    for line_number, line in enumerate(read_data_line_by_line(twitter_data_path)):
        if line_number == line_to_end:
            break
        if line_number >= line_to_start:
            preprocessed_line = preprocess_data(line)
            if preprocessed_line:
                processing_data(preprocessed_line, hash_tag_count,
                                language_code_count)

    # **********************************************************************
    # Algorithm 2: parallel top-n calculation
    # **********************************************************************
    n = 10
    # a) Concurrently calculate the top-n hash tags and languages used.
    if comm_size > 1:
        # 1) Merge the Counter from each rank.
        reduced_language_code_count = comm.reduce(language_code_count,
                                                  root=0, op=operator.add)
        reduced_hash_tag_count = comm.reduce(hash_tag_count,
                                             root=0, op=operator.add)
        # 2) Split the merged counts for redistribution to each rank.
        if comm_rank == 0:
            split_language_code_np_array = np.array_split(
                list(reduced_language_code_count.items()), comm_size)
            split_hash_tag_np_array = np.array_split(
                list(reduced_hash_tag_count.items()), comm_size)
        else:
            split_language_code_np_array = None
            split_hash_tag_np_array = None
        # 3) Scatter the merged counts to each rank.
        local_language_code = list(
            map(lambda x: (x[0], int(x[1])),
                comm.scatter(split_language_code_np_array, root=0)))
        local_hash_tag = list(
            map(lambda x: (x[0], int(x[1])),
                comm.scatter(split_hash_tag_np_array, root=0)))
        # 4) Merge each rank's local top-n result.
        reduced_language_code_count = comm.reduce(
            heapq.nlargest(n, local_language_code, key=lambda x: x[1]),
            root=0, op=merge_list)
        reduced_hash_tag_count = comm.reduce(
            heapq.nlargest(n, local_hash_tag, key=lambda x: x[1]),
            root=0, op=merge_list)
    # b) Single process: calculate top n directly.
    else:
        reduced_hash_tag_count = hash_tag_count.most_common(n)
        reduced_language_code_count = language_code_count.most_common(n)

    # Output the summary in the root process.
    if comm_rank == 0:
        dump_hash_tag_output(reduced_hash_tag_count)
        dump_country_code_output(reduced_language_code_count,
                                 language_code_dict)
        program_run_time = time.time() - program_start
        print("Program runs {}(s)".format(program_run_time))
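# `merge_list` is used above as a custom reduce op.  A plausible sketch,
# assuming it simply concatenates the per-rank top-n candidate lists so the
# root can pick the overall top n afterwards; this is an assumption, not the
# original implementation.
def merge_list(list_a, list_b):
    return list_a + list_b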
util.mkdirp('MODELS')

# Load dataset and prepare data
print('Loading dataset...')
X_train, X_test, Y_train, Y_test = util.load_ABCDE_datasets(
    args.path, args.cv, args.lnoise, args.anoise)
nb_classes = len(np.unique(Y_train))
img_rows = img_cols = int(math.sqrt(X_train.shape[1] / args.channels))
Yc_train = np_utils.to_categorical(Y_train, nb_classes)
Yc_test = np_utils.to_categorical(Y_test, nb_classes)

if args.pre is not None and args.pre != 'None':
    # Fit the scaler on the training set, then reuse it on the test set.
    X_train, scaler = util.preprocess_data(args.pre, X_train)
    X_test, scaler = util.preprocess_data(args.pre, X_test, scaler)

X_train, X_test, input_shape = reshape(X_train, X_test, args.flat,
                                       args.channels, img_rows, img_cols)

print('dbname:', dbname)
print('full dbname:', fulldbname)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('nb_classes:', nb_classes)
print('channels:', args.channels)
print('input:', input_shape)
print('preprocess:', args.pre)
print('Label noise:', args.lnoise)
print('Attribute noise:', args.anoise)
" Epochs {epochs}\n" " Base Learning Rate 2^{base_learning_rate}\n" " Learning Rate {learning_rate}\n" " Loss Scaling {loss_scaling}\n" " Weight Decay {weight_decay}\n") if not opts.no_validation: log_str += ("Validation Graph.\n" " Dataset {validation_data}\n" " Batch Size {validation_batch_size}\n") log_str += "Checkpoint Path {checkpoint_path}\n" print(log_str.format(**vars(opts))) # If the data is not already preprocessed, preprocess it if not opts.use_synthetic_data and not util.is_preprocessed(opts.datafolder): util.preprocess_data(opts.datafolder) print("Loading training data") opts.training_data = MLPData(opts, data_path=opts.training_data) print(f"Rows: {opts.training_data._size}") print("Loading evaluation data") opts.validation_data = MLPData(opts, data_path=opts.validation_data) print(f"Rows: {opts.validation_data._size}") # If using synthetic data, set the environment variable required if opts.use_synthetic_data: if 'TF_POPLAR_FLAGS' in os.environ: os.environ['TF_POPLAR_FLAGS'] += ' --use_synthetic_data --synthetic_data_initializer=random' else: os.environ['TF_POPLAR_FLAGS'] = '--use_synthetic_data --synthetic_data_initializer=random'
import sys
import os
import pickle

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

from alphagan_class import AlphaGAN
from util import dump_column_transformers, load_column_transformers, split_data, preprocess_data

os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'

if __name__ == '__main__':
    if len(sys.argv) > 1:
        train_df = pd.read_csv(sys.argv[1])
        preprocess_data(train_df, './data/ranges.csv')
        X_train = train_df.to_numpy()

        ag = AlphaGAN()
        ag.train(X_train=X_train, epochs=4000, batch_size=32)
import util
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from baseline import BaselinePredictor
from sklearn.svm import SVC

data = util.load_data()
preprocessed_data = util.preprocess_data(data)
X, Y = util.splitFeaturesAndLabel(preprocessed_data, 'Empathy')
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

baseline_predictor = BaselinePredictor()
baseline_preds = util.trainAndPredict(X_train, Y_train, baseline_predictor, X_test)
print("Baseline accuracy and classification report")
util.printAccuracyAndClassficationReport(baseline_preds, Y_test,
                                         classes=['1', '2', '3', '4', '5'])

X_train, X_test = util.getBestFeatures(X_train, Y_train, X_test)
model = SVC(kernel='rbf')
params = {
    'C': [i for i in range(1, 11)],
from keras.losses import MeanAbsoluteError
import keras.backend as K
import tensorflow as tf
import numpy as np
import pandas as pd

from alphagan_class import AlphaGAN
from util import preprocess_data

if __name__ == '__main__':
    session = K.get_session()
    init = tf.global_variables_initializer()
    session.run(init)

    ag = AlphaGAN()
    ag.load_pretrained_models('./snapshots/3900_')

    test_normal_df = pd.read_csv('./data/test_set_normal.csv')
    preprocess_data(test_normal_df, './data/ranges.csv')
    test_abnormal_df = pd.read_csv('./data/test_set_abnomal.csv')
    preprocess_data(test_abnormal_df, './data/ranges.csv')

    X_1 = test_normal_df.to_numpy()
    X_2 = test_abnormal_df.to_numpy()

    # Reconstruct both sets through the encoder/generator pair.
    Z_hat_1 = ag.encoder.predict(X_1)
    X_hat_1 = ag.generator.predict(Z_hat_1)
    Z_hat_2 = ag.encoder.predict(X_2)
    X_hat_2 = ag.generator.predict(Z_hat_2)

    # Per-sample reconstruction error (L2 norm of the residual).
    rec_losses_normal = np.linalg.norm(np.subtract(X_1, X_hat_1), axis=1)
    rec_losses_fraud = np.linalg.norm(np.subtract(X_2, X_hat_2), axis=1)
import numpy as np
import lbann
import lbann.modules
from util import preprocess_data

# Data paths, directory where patches are located.
data_dir = 'data'

samples = preprocess_data(data_dir)

dims = len(samples[0])
num_classes = 3
num_channels = 14


# Sample access functions
def get_sample(index):
    sample = samples[index]
    return sample


def num_samples():
    return samples.shape[0]


def sample_dims():
    return [dims]


def str_list(l):
    return ' '.join([str(i) for i in l])
def main():
    tf.flags.DEFINE_string(
        "output_dir",
        "/Users/iiskin/Documents/workspace/PreProduction/sentiment_beam/output",
        "Directory to export the model run results")
    tf.flags.DEFINE_string(
        "input_data_dir",
        "/Users/iiskin/Documents/workspace/PreProduction/sentiment_beam/data",
        "Path to directory containing training and testing data")
    tf.flags.DEFINE_string("version", "default", "Version of your model")
    tf.flags.DEFINE_boolean("transform_data", False, "Preprocess raw data")
    tf.flags.DEFINE_integer("vocab_size", 20000, "Vocabulary size")
    tf.flags.DEFINE_integer("train_batch_size", 1000, "Batch size for training")
    tf.flags.DEFINE_integer("train_num_epochs", 10000,
                            "Number of epochs for training")
    tf.flags.DEFINE_integer("num_train_instances", 2000,
                            "Number of training instances")
    tf.flags.DEFINE_integer("num_test_instances", 2000,
                            "Number of test instances")
    tf.flags.DEFINE_string("delimiters", ".,!?() ",
                           "Delimiters to be used in splitting text")

    FLAGS = tf.flags.FLAGS
    FLAGS._parse_flags()

    raw_file_dir = os.path.join(FLAGS.input_data_dir, 'raw')
    raw_metadata_dir = os.path.join(raw_file_dir, 'metadata')
    train_neg_file_pattern = os.path.join(raw_file_dir, 'train/negative/*')
    train_pos_file_pattern = os.path.join(raw_file_dir, 'train/positive/*')
    test_neg_file_pattern = os.path.join(raw_file_dir, 'test/negative/*')
    test_pos_file_pattern = os.path.join(raw_file_dir, 'test/positive/*')

    transformed_file_dir = os.path.join(FLAGS.input_data_dir, 'transformed')
    transformed_metadata_dir = os.path.join(transformed_file_dir, 'metadata')
    transformed_train_file_pattern = os.path.join(transformed_file_dir, 'train/*')
    transformed_test_file_pattern = os.path.join(transformed_file_dir, 'test/*')

    temp_dir = os.path.join(FLAGS.output_dir, "tmp")
    # model_run_dir = os.path.join(FLAGS.output_dir, datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S"))
    model_run_dir = os.path.join(FLAGS.output_dir, FLAGS.version)

    if not FLAGS.transform_data:
        if not os.path.exists(transformed_file_dir):
            raise Exception(
                "It doesn't look like the raw data has been transformed yet. "
                "Use the transform_data flag to transform the raw data.")
    else:
        shutil.rmtree(transformed_file_dir, ignore_errors=True)
        util.preprocess_data(
            train_neg_file_pattern=train_neg_file_pattern,
            train_pos_file_pattern=train_pos_file_pattern,
            test_neg_file_pattern=test_neg_file_pattern,
            test_pos_file_pattern=test_pos_file_pattern,
            transformed_train_file_pattern=transformed_train_file_pattern,
            transformed_test_file_pattern=transformed_test_file_pattern,
            transformed_metadata_dir=transformed_metadata_dir,
            raw_metadata_dir=raw_metadata_dir,
            transform_func_dir=model_run_dir,
            temp_dir=temp_dir,
            vocab_size=FLAGS.vocab_size,
            delimiters=FLAGS.delimiters)

    print("\nRun \"tensorboard --logdir {}\" to see the results on Tensorboard\n\n"
          .format(FLAGS.output_dir))

    learn_runner.run(
        experiment_fn=model.generate_experiment_fn(
            transformed_train_file_pattern=transformed_train_file_pattern,
            transformed_test_file_pattern=transformed_test_file_pattern,
            transformed_metadata_dir=transformed_metadata_dir,
            raw_metadata_dir=raw_metadata_dir,
            vocab_size=FLAGS.vocab_size,
            train_batch_size=FLAGS.train_batch_size,
            train_num_epochs=FLAGS.train_num_epochs,
            num_train_instances=FLAGS.num_train_instances,
            num_test_instances=FLAGS.num_test_instances),
        output_dir=model_run_dir)
def train_model():
    file = 'train.data'
    util.preprocess_data()
                     input_shape=input_shape))
    model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    return model


if __name__ == "__main__":
    os.chdir('..')
    path = os.path.join(os.getcwd(), 'dataset', 'Epileptic Seizure Recognition.csv')
    X, Y = preprocess_data(path)
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2,
                                                        random_state=0)

    # Image dimensions must be defined before reshaping.
    imheight, imwidth = (36, 54)
    x_train = x_train.reshape(x_train.shape[0], imheight, imwidth, 1)
    x_test = x_test.reshape(x_test.shape[0], imheight, imwidth, 1)

    input_shape = (imheight, imwidth, 1)
    model = construct_model(input_shape)
    model.compile(loss="binary_crossentropy", optimizer="adam",
                  metrics=['accuracy'])