def main():
    file_path = 'spam.csv'
    data = load_data(file_path)
    preprocess_data(data)

    X_pyth = []
    y_pyth = []
    for d in data:
        text = d['text']
        entry = (
            features.currency_count(text),
            features.url_count(text),
            features.word_count(text),
            features.longest_numerical_string(text),
            features.average_word_length(text),
            features.num_win_occurences(text),
            features.num_free_occurences(text)
        )
        X_pyth.append(entry)
        if d['category'] == 'spam':
            y_pyth.append(0)
        else:
            y_pyth.append(1)

    X = np.array(X_pyth)
    y = np.array(y_pyth)

    # Randomly shuffle data
    p = np.random.permutation(len(y_pyth))
    X = X[p]
    y = y[p]

    # Split into training and testing datasets
    X_train = X[:4000]
    y_train = y[:4000]
    X_test = X[4000:]
    y_test = y[4000:]

    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)

    prediction = model.predict(X_test)
    accuracy = accuracy_score(y_test, prediction)
    matrix = confusion_matrix(y_test, prediction)

    binary = np.array(matrix)
    fig, ax = plot_confusion_matrix(conf_mat=binary)
    plt.show()
    print(accuracy)
def main():
    file_path = 'spam.csv'
    data = load_data(file_path)
    preprocess_data(data)

    spams = []
    hams = []
    for d in data:
        if d['category'] == 'spam':
            spams.append(d['text'])
        else:
            hams.append(d['text'])

    plot_function_to_test('Number of "Free" Occurrences', hams, spams)
def logistic_regression():
    """build logistic regressor to predict survival label"""
    df, y, X = preprocessing.preprocess_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    logreg = LogisticRegression()
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    return y_test, y_pred
def runSerial(fname, box_number, box_port, parameters):
    ## fname = file location
    ## box = what box you are using
    ## highprob = e.g. 45 or 90
    ## flippinggamma = e.g. 15
    ## rewardsize = 2 or 4
    ## box = e.g. 2 or 4
    ## stimulation = 0 or 1
    ## protocol = highprob, flippinggamma, rewardsize, stimulation
    ## parameters = protocol with the box number appended at the end
    #print(fname)
    baud = '115200'  # frequency of communication, must be in agreement with the Arduino
    fmode = 'ab'  # open the file for appending in binary format; begin the raw-data txt
    port = serial.Serial(box_port, baud)
    # open(fname, fmode): technicality for Python to write the Arduino's messages
    outf = open(fname, fmode)

    # open port serial.Serial(addr, baud):
    if port.isOpen() == False:
        port.open()
    print("WAIT FOR IT!!!!\n\n")
    time.sleep(2.5)

    # send parameters to start script:
    for parameter in parameters:
        time.sleep(0.5)
        signal = str(chr(int(parameter))).encode('ascii')
        port.write(signal)

    global stopper
    # Boolean array set to False for each box;
    # when it is True we stop writing the txt file and close communication with the Arduino
    stopper[box_number] = False

    # Main loop: as long as stopper is False, Python keeps writing the Arduino's messages
    while port.isOpen() and not stopper[box_number]:
        if port.inWaiting() > 0:
            try:
                x = port.readline()
                if b'-666' in x:
                    print("All is well in box %d!!!\n" % box_number)
                outf.write(x)
                outf.flush()
            except:
                print("Error in box %d!!!\n" % box_number)
                outf.write(b"Error")
                outf.flush()

    # When stopper is True, Python exits the main loop and proceeds to preprocessing
    # Call preprocessing module to get a csv version of the dataframe
    dataframe = preprocessing.preprocess_data(fname)  # fname is the txt file we have been writing
    csv_fname = fname[:-4] + '.csv'  # change the extension to create the name for the csv
    dataframe.to_csv(csv_fname)  # save the preprocessed data in a csv file
    print("I'm done in box %d!!!\n" % box_number)
def predict(self, X_test, verbose=0):
    if not self._ensemble:
        print("You must train the net first")
        return
    X_test, _, _ = preprocess_data(X_test, [], self._models[0]._nb_classes,
                                   img_rows=self._models[0]._img_rows,
                                   img_cols=self._models[0]._img_cols,
                                   verbose=verbose)
    return self._ensemble.predict_classes([np.asarray(X_test)] * len(self._models))
def render_visual(input):
    workweek, weekend = preprocess_data(DATA)
    if input == 'workweek':
        data = workweek
    else:
        data = weekend
    fig = px.scatter_mapbox(data, lat='lat', lon='long',
                            color='Proportion Of Bikes Available', animation_frame='Hour',
                            hover_name='name',
                            color_continuous_scale=px.colors.sequential.Blues,
                            zoom=12)
    fig.update_traces(marker=dict(size=15), selector=dict(mode='markers'))
    return fig
def evaluate(self, X_test, y_test, verbose=0):
    X_test, y_test, _ = preprocess_data(X_test, y_test, self._models[0]._nb_classes,
                                        img_rows=self._models[0]._img_rows,
                                        img_cols=self._models[0]._img_cols,
                                        verbose=verbose)
    print('Evaluating ensemble')
    score = self._ensemble.evaluate([np.asarray(X_test)] * len(self._models),
                                    y_test, verbose=verbose)
    print('Test accuracy:', score[1] * 100, '%')
    print('Test error:', (1 - score[2]) * 100, '%')
def create_model(file_path=FINAL_MLKNN_MODEL_FILE_PATH):
    """
    Creates and trains a MLkNN classifier using the optimized parameters found
    Saves this trained model to disk
    :param string file_path: specifies where the model should be saved
    :return: a trained sklearn MLkNN classifier
    """
    with open(OPTIMIZED_MODEL_PARAMETERS_FILE_PATH) as file:
        hyperparameters = json.load(file)['hyperparameters']
    question_data, music_data = preprocessing.load_data()
    question_data, music_data = preprocessing.preprocess_data(question_data, music_data)
    clf = MLkNN(k=hyperparameters['k'], s=hyperparameters['s'])
    clf.fit(question_data.values, music_data.values)
    pickle.dump(clf, open(file_path, 'wb'))
    return clf
def create_model(file_path=FINAL_XGBOOST_MODEL_FILE_PATH):
    """
    Creates and trains a OneVsRestClassifier(XGBClassifier()) using the optimized parameters found
    Saves this trained model to disk
    :param string file_path: specifies where the model should be saved
    :return: a trained OneVsRestClassifier
    """
    with open(OPTIMIZED_MODEL_PARAMETERS_FILE_PATH) as file:
        hyperparameters = json.load(file)['hyperparameters']
    question_data, music_data = preprocessing.load_data()
    question_data, music_data = preprocessing.preprocess_data(question_data, music_data)
    xgb_model = XGBClassifier(**hyperparameters)
    xgb_clf = OneVsRestClassifier(xgb_model, n_jobs=-1)
    xgb_clf.fit(question_data, music_data)
    pickle.dump(xgb_clf, open(file_path, 'wb'))
    return xgb_clf
def load_data(self, debug=False):
    """Loads starter word-vectors and train/dev/test-split the data."""
    # Load the training set
    X, y, self.word_to_num, self.tag_to_num = preprocess_data(dir_path='NKJP_1.2_nltk_POS')
    self.num_to_word = invert_dict(self.word_to_num)
    self.num_to_tag = invert_dict(self.tag_to_num)
    self.tagset_size = len(self.tag_to_num)

    self.X_train, self.X_dev, self.y_train, self.y_dev = train_test_split(X, y, test_size=0.2)
    # A hacky way to get a 3-part split from a 2-part-splitting function
    self.X_dev, self.X_test, self.y_dev, self.y_test = train_test_split(
        self.X_dev, self.y_dev, test_size=0.5)

    if debug:
        self.X_train = self.X_train[:1024]
        self.y_train = self.y_train[:1024]
        self.X_dev = self.X_dev[:1024]
        self.y_dev = self.y_dev[:1024]
import feature_engineering as feat
from preprocessing import preprocess_data
from decision_function import validate_model, train_model

# MAIN PROGRAM ***************************************************************
SEPARATOR = '=========================================='
acc_knn, acc_svm, acc_dt, acc_rf, acc_mlp = 0, 0, 0, 0, 0
ppv_knn, ppv_svm, ppv_dt, ppv_rf, ppv_mlp = 0, 0, 0, 0, 0

# LOAD THE DATA **************************************************************
ticker = input("Enter a ticker symbol: ")
data = csv_to_df(ticker)

# PREPROCESSING & FEATURE ENGINEERING ****************************************
preprocess_data(data)
feat.moving_average(data, 10, 'Close')
feat.moving_average(data, 30, 'Close')
feat.moving_average(data, 20, 'Volume')

# DETERMINING THE BEST DECISION FUNCTION *************************************
# KNN Classifier --------------------------------------------------------------
print(SEPARATOR)
print('KNN Classifier:')

# Test the model
acc_knn, ppv_knn, k = validate_model(data, 'KNN')

# Store the model for making predictions
knn_model = train_model(data, 'KNN', k)

# Identify the Test Population
X = data.iloc[0:, 1:-2]
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from data.utils import load_data
from preprocessing import preprocess_data
from visualization import plot_learning_curves, get_errors_input
from metrics import custom_map_at_k
from feature_selection import get_features_extractor
from data_augmentation import augment_data

print('Augmenting training data set')
augment_data('train.csv', 'train_augmented.csv')

print('Loading training and testing set')
train_data = load_data('train_augmented.csv')
test_data = load_data('test.csv')

print('Preprocessing')
X_train, Y_train = preprocess_data(train_data)
X_test, Y_test = preprocess_data(test_data)

model_name = 'lr'

# print('Loading model')
# model = joblib.load('./models/' + model_name + '_classifier.pkl')

print('Fitting model')
model = Pipeline([
    ('features', get_features_extractor()),
    ('LogisticRegression', LogisticRegression())
])
model.fit(X_train, Y_train)

print('Saving model')
joblib.dump(model, './models/' + model_name + '_classifier.pkl')
def main(): parser = argparse.ArgumentParser() parser.add_argument("--path", help="address of file", type=str) parser.add_argument("--batch_size", help="batch_size", type=int, default=12) parser.add_argument("--embedding_size", help="dimension of vectors", default=300, type=int) parser.add_argument("--lr", type=float, help="learning rate", default=1e-5) parser.add_argument("--decay", help="L2 loss", type=float, default=1e-2) parser.add_argument("--iterator", type=int, help="number of iteration", default=10) args = parser.parse_args() data = load_pickle(args.path) context = data["context"] question = data["question"] answer = data["answer"] cxt = [] query = [] ans = [] for c, q, a in zip(context, question, answer): cxt.append(c.lower()) query.append(q.lower()) ans.append(a.lower()) cxt = tokenize(cxt) query = tokenize(query) ans = tokenize(ans) word2idx, idx2word = make_dictionary(cxt, query, ans) query_ix = convert2idx(query, word2idx) context_ix = convert2idx(cxt, word2idx) answer_ix = convert2idx(ans, word2idx) ##preprocess data q_data, c_data, a_data, start_index, end_index = preprocess_data( query_ix, context_ix, answer_ix) train_data = makeBatch(q_data, c_data, start_index, end_index) train_loader = DataLoader(train_data, collate_fn=pad_sequence, batch_size=args.batch_size) ################################################################################################ ## train model = BIDAF( embedder=WordEmbedder(args.embedding_size, len(word2idx)), encoder=Encoder(args.embedding_size, args.embedding_size), attention_flow=AttentionFlow(), modeling_layer=ModelingLayer(d_vector=args.embedding_size, bidirectional=True), output_layer=OutputLayer(d_vector=args.embedding_size)).to(device) criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.decay) train(model, args.iterator, optimizer, criterion, train_loader)
def mainFunc(argv):
    def printUsage():
        print('main.py -n <num_cores> -x <experiment>')
        print('num_cores = Number of cores requested from the cluster. Set to -1 to leave unset')
        print('experiment = experiment setup that should be executed. Set to A, B or C')

    num_cores = -1
    num_epochs = NUM_EPOCHS
    experiment = ""

    # Command line argument handling
    try:
        opts, args = getopt.getopt(argv, "hn:x:", ["num_cores=", "experiment="])
    except getopt.GetoptError:
        printUsage()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            printUsage()
            sys.exit()
        elif opt in ("-n", "--num_cores"):
            num_cores = int(arg)
        elif opt in ("-x", "--experiment"):
            if arg in ("A", "B", "C"):
                experiment = arg
            else:
                printUsage()
                sys.exit(2)

    print("Executing experiment {} with {} CPU cores".format(experiment, num_cores))

    if num_cores != -1:
        # We set the op_parallelism_threads in the ConfigProto and pass it to the TensorFlow session
        configProto = tf.ConfigProto(log_device_placement=False,
                                     inter_op_parallelism_threads=num_cores,
                                     intra_op_parallelism_threads=num_cores)
    else:
        configProto = tf.ConfigProto(log_device_placement=False)

    print("Building graph")
    graph = None

    # Experiment C requires double the default hidden state size
    state_size = CELL_SIZE
    if experiment == "C":
        state_size = 2 * CELL_SIZE
    graph = build_training_graph(state_size=state_size, downproject_cellsize=CELL_SIZE)

    sentences, index_2_word, word_2_index, _ = preprocessing.preprocess_data(
        TRAINING_DATA_PATH,
        max_sentence_length=MAX_SENTENCE_LENGTH,
        vocabulary_size=VOCABULARY_SIZE)
    sentences_array = np.array(sentences)
    print("Sentences shape is {}".format(sentences_array.shape))

    print("Training network")
    # Use word2vec only for experiments B and C
    useWord2Vec = False
    if experiment in ("B", "C"):
        useWord2Vec = True

    t = time.time()
    train_network(graph,
                  sentences_array,
                  checkpoint_filename="exp{}".format(experiment),
                  num_epochs=num_epochs,
                  configProto=configProto,
                  state_size=state_size,
                  vocabulary=word_2_index,  # used in load_embeddings method
                  useWord2Vec=useWord2Vec)  # if True, uses word2vec embedding
    print("It took {} seconds to train for {} epochs.".format(time.time() - t, num_epochs))
# -*- coding: utf-8 -*-
"""
Created on Fri May 5 14:06:14 2017

@author: Serotonin
"""
import preprocessing

fname = 'C:\\Users\\Serotonin\\Google Drive\\Flipping\\run_task_photo\\raw_data\\DN3_170818a.txt'
dataframe = preprocessing.preprocess_data(fname)
csv_fname = fname[:-4] + '.csv'
dataframe.to_csv(csv_fname)
solver="saga", n_jobs=-1), parameters, scoring="accuracy", n_jobs=-1, cv=5) else: print("classifier should be svm or logreg") print("loading data...") train_tweets, train_labels = load_data() test_tweets = load_test() if args.prepro: print("preprocess data...") test_prepro, train_prepro = preprocess_data(test_tweets, train_tweets) else: test_prepro, train_prepro = test_tweets, train_tweets print(f"load {args.embedding} embeddings") try: embeddings = load_pickle(file) except FileNotFoundError: print( "The specified embedding cannot be found, run build_embeddings.py first" ) exit() print("embedd data...") test_embedded, train_embedded = embed_data(test_prepro, train_prepro, embeddings)
def main():
    parser = create_parser()
    args = parser.parse_args()

    if args.setup:
        create_directories()

    if args.debug:
        dataset = DATASETS['debug']
        args.dataset = "debug"
        features, _, labels, _ = preprocess_data(args.patch_size, args.distribution,
                                                 dataset=dataset)
        #print(features, 'debug')
        #print("length of features: ", type(features), len(features), 'element.shape: ', features[0][0])
        features_train, features_test = features[:100], features[100:120]
        labels_train, labels_test = labels[:100], labels[100:120]
    elif args.train_model or args.evaluate_model or args.preprocess_data:
        dataset = DATASETS[args.dataset]
        #print(dataset.values())
        load_from_cache = not args.preprocess_data
        try:
            features_train, features_test, labels_train, labels_test = preprocess_data(
                args.patch_size, args.distribution, dataset=dataset,
                only_cache=load_from_cache)
            #print(features_train, 'train_model or evaluate_model or preprocess_data')
            print("Length of features_train: ", len(features_train))
        except IOError:
            print("Cache file does not exist. Please run again with -p flag.")
            sys.exit(1)
        if args.visualise:
            visualise_labels(labels_train, args.patch_size, LABELS_DIR)
            visualise_labels(labels_test, args.patch_size, LABELS_DIR)

    if not args.model_id:
        timestamp = time.strftime("%d_%m_%Y_%H%M")
        model_id = "{}_{}_{}".format(timestamp, args.dataset, args.architecture)
    else:
        model_id = args.model_id

    if args.init_model or args.train_model or args.evaluate_model:
        model_dir = os.path.join(OUTPUT_DIR, model_id)
        save_makedirs(model_dir)

        # Hyperparameters for the model. Since there are so many of them it is
        # more convenient to set them in the source code as opposed to passing
        # them as arguments to the Command Line Interface. We use a list of tuples
        # instead of a dict since we want to print the hyperparameters and for that
        # purpose keep them in the predefined order.
        hyperparameters = [
            ("architecture", args.architecture),
            # Hyperparameters for the first convolutional layer.
            ("nb_filters_1", 64),
            ("filter_size_1", 9),
            ("stride_1", (2, 2)),
            # Hyperparameter for the first pooling layer.
            ("pool_size_1", (2, 2)),
            # Hyperparameters for the second convolutional layer (when the
            # two-layer architecture is used).
            ("nb_filters_2", 128),
            ("filter_size_2", 5),
            ("stride_2", (1, 1)),
            # Hyperparameters for Stochastic Gradient Descent.
            ("learning_rate", 0.05),
            ("momentum", 0.9),
            ("decay", 0.0)
        ]

        hyperparameters_mnih = [
            ("architecture", args.architecture),
            # Hyperparameters for the first convolutional layer.
            ("nb_filters_1", 64),
            ("filter_size_1", 16),
            ("stride_1", (4, 4)),
            # Hyperparameter for the first pooling layer.
            ("pool_size_1", (2, 2)),
            ("pool_stride", 1),
            # Hyperparameters for the second convolutional layer.
            ("nb_filters_2", 112),
            ("filter_size_2", 4),
            ("stride_2", (1, 1)),
            # Hyperparameters for the third convolutional layer.
            ("nb_filters_3", 80),
            ("filter_size_3", 3),
            ("stride_3", (1, 1)),
            # Hyperparameters for Stochastic Gradient Descent.
            ("learning_rate", 0.05),
            ("momentum", 0.9),
            ("decay", 0.0)
        ]

    if args.init_model:
        model = init_model(args.patch_size, model_id, **dict(hyperparameters_mnih))
        save_model_summary(hyperparameters_mnih, model, model_dir)
    elif args.train_model or args.evaluate_model:
        hyperparameters = dict(hyperparameters_mnih)
        model = load_model(model_id)
        model = compile_model(model, hyperparameters["learning_rate"],
                              hyperparameters['momentum'], hyperparameters["decay"])

    if args.train_model:
        model = train_model(model, features_train, labels_train, args.patch_size,
                            model_id, model_dir,
                            nb_epoch=args.epochs,
                            checkpoints=args.checkpoints,
                            tensorboard=args.tensorboard,
                            earlystop=args.earlystop)

    if args.evaluate_model:
        evaluate_model(model, features_test, labels_test, args.patch_size, model_dir,
                       out_format=args.out_format)
outlet_dim_output = (base_path / "../data/destination/outlet_data.csv")
test_output = (base_path / "../data/algo_files/test.csv")

df_raw = pd.read_csv(file_input)
print(df_raw.head())

# create connection
if __name__ == "__main__":
    #####################################
    #     staging/transform area        #
    #####################################
    # preprocess the data
    df_preprocessed = preprocessing.preprocess_data(df_raw)

    #####################################
    #        transform & load           #
    #####################################
    # Create star schema and load to destination
    star_schema.create_star_schema(
        df_preprocessed,
        **{
            "fact_output": fact_output,
            "item_dim_output": item_dim_output,
            "outlet_dim_output": outlet_dim_output
        })

    #####################################
    #             Analysis              #
train_data = pd.read_csv(path_train, sep='|', usecols=['orth', 'translation']).values.tolist()
val_data = pd.read_csv(path_dev, sep='|', usecols=['orth', 'translation']).values.tolist()
test_data = pd.read_csv(path_test, sep='|', usecols=['orth', 'translation']).values.tolist()

# training data
src_lang, trg_lang, pairs_train, len_train, trainDF = preprocess_data(
    train_data, lang1, lang2, 'train', False)

w2i_in = src_lang.word2index
i2w_in = src_lang.index2word
w2c_in = src_lang.word2count
w2i_out = trg_lang.word2index
i2w_out = trg_lang.index2word
w2c_out = trg_lang.word2count

unk_value = False
pairs_train, src_lang, trg_lang = fix_vocabulary(pairs_train, src_lang, trg_lang,
                                                 'train', unk=unk_value)
def main(): parser = argparse.ArgumentParser() parser.add_argument("-path", help="address of file", type=str) parser.add_argument("-batch_size", help="batch_size", type=int, default=12) parser.add_argument("-embedding_size", help="dimension of vectors", default=300, type=int) parser.add_argument("-lr", type=float, help="learning rate", default=1e-5) parser.add_argument("-decay", help="L2 loss", type=float, default=1e-4) parser.add_argument("-iterator", type=int, help="number of iteration", default=10) parser.add_argument("-num_iters", type=int, help="decoder iteration", default=4) args = parser.parse_args() data = load_pickle(args.path) context = data["context"][:100] question = data["question"][:100] answer = data["answer"][:100] cxt = [] query = [] ans = [] for c, q, a in zip(context, question, answer): cxt.append(c.lower()) query.append(q.lower()) ans.append(a.lower()) cxt = tokenize(cxt) query = tokenize(query) ans = tokenize(ans) word2idx, idx2word = make_dictionary(cxt, query, ans) query_ix = convert2idx(query, word2idx) context_ix = convert2idx(cxt, word2idx) answer_ix = convert2idx(ans, word2idx) ##preprocess data q_data, c_data, a_data, start_index, end_index = preprocess_data( query_ix, context_ix, answer_ix) train_data = makeBatch(q_data, c_data, start_index, end_index) train_loader = DataLoader(train_data, collate_fn=pad_sequence, batch_size=args.batch_size) ################################################################################################ ## train dynamicN = DynamicCN(d_model=args.embedding_size, vocab_size=len(word2idx), iters=args.num_iters) criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(dynamicN.parameters()) train(model=dynamicN, iterator=2, optimizer=optimizer, criterion=criterion, train_loader=train_loader)
# Load Hyperparameters
epochs = params['epochs']
batch_size = params['batch_size']
rnn_size = params['rnn_size']
num_layers = params['num_layers']
encoding_embedding_size = params['encoding_embedding_size']
decoding_embedding_size = params['decoding_embedding_size']
learning_rate = params['learning_rate']
learning_rate_decay = params['learning_rate_decay']
min_learning_rate = params['min_learning_rate']
keep_probability = params['keep_probability']

# Preprocess data, get the vocabularies
questions, answers = get_data()
sorted_questions, sorted_answers, questionswords2int, answerswords2int = preprocess_data(
    questions, answers)

# Splitting the questions and answers into training and validation sets
training_validation_split = int(len(sorted_questions) * 0.15)
training_questions = sorted_questions[training_validation_split:]
training_answers = sorted_answers[training_validation_split:]
validation_questions = sorted_questions[:training_validation_split]
validation_answers = sorted_answers[:training_validation_split]

# Training
batch_index_check_validation_loss = ((len(training_questions)) // batch_size // 2) - 1
total_training_loss_error = 0
list_validation_loss_error = []
early_stopping_check = 0
from pandas import read_csv
from preprocessing import preprocess_data, balance_data
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from tpot import TPOTClassifier

X_train = read_csv('input/aps_failure_training_set.csv', na_values='na')
X_test = read_csv('input/aps_failure_test_set.csv', na_values='na')

# deal with missing values and constant features and normalize
X_train, X_test, y_train, y_test = preprocess_data(X_train, X_test)
print(f'Data loaded: {len(X_train)} training observations, {len(X_test)} testing observations')

X_train, y_train = balance_data(X_train, y_train, n_samples=2500)
print(f'Balanced training data ({2500/1000}/1): {len(X_train)} training observations, {len(X_test)} testing observations')

# A custom scorer function is created in order to reflect the different costs of
# misclassification (fn > fp)
def scania_scorer(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    total_cost = 10 * fp + 500 * fn
    return total_cost

custom_scania_scorer = make_scorer(scania_scorer, greater_is_better=False)

tpot = TPOTClassifier(generations=100, population_size=100, verbosity=3, random_state=42,
                      use_dask=True, n_jobs=-1, memory='auto', early_stop=10,
                      scoring=custom_scania_scorer)
tpot.fit(X_train, y_train)
y_pred = tpot.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Total cost: " + str(scania_scorer(y_test, y_pred)))
# This file is for running preprocessing only
import logging
import sys

import yaml

from preprocessing import preprocess_data

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logger = logging.getLogger()

# paths
video_root_path = '/content/drive/MyDrive/Grad Project/data/UCSD'
dataset = 'UCSDped1'

# time_length
with open('config.yml', 'r') as ymlfile:
    cfg = yaml.safe_load(ymlfile)
t = cfg['time_length']

# run preprocessing
preprocess_data(logger, dataset, t, video_root_path)
intent = file.read().strip().split("\n") intent_dict = {} for i, word in enumerate(intent): intent_dict[word] = i # read data from datafile df = pd.read_csv("datafile.csv", header=0, delimiter="\t", quoting=3) # load word2vec model model = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin",binary = 'True') # preprocess data_X data_x = preprocess_data(df,model) print("*************") # onehot encode data_y data_y = np.array(df["intent"]) for i, word in enumerate(data_y): data_y[i] = intent_dict[word] data_y = np.array(data_y, dtype=np.int8) nb_classes = len(intent_dict) data_y = np.eye(nb_classes)[data_y] # split into train and test train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size=0.3, random_state=42)
from learning import QLearning
from preprocessing import preprocess_data

data = preprocess_data()
learner = QLearning(data)
learner.learn()
import os

from preprocessing import preprocess_data
from model import define_discriminator, define_generator, define_gan
from training import train
from evaluation import evaluate, get_fsl_metrics, resp_vec_correlation, plot_corr
from util.tf_session import setup_tf_session

#%% Setup data and models

# Setup the tf session for possible gpu usage
setup_tf_session()

dataDir = "data"

# Preprocess data
print("Step 0: Preprocessing data...\t", end="", flush=True)
preprocess_data(dataDir)
print("Completed!\n")

# Load data
print("Step 1: Loading and extracting data...\n")
print("Dataset - TRAIN")
dataset_train, train_subjects = data_prep(os.path.join(dataDir, "preprocessed"), True, "train")
print("Dataset - TEST")
dataset_test, test_subjects = data_prep(os.path.join(dataDir, "preprocessed"), True, "test")

image_shape = dataset_train[0].shape[1:]
image_shape = (image_shape[0], image_shape[1], 1)
print("Completed data loading!\n")
# Load features
x_tr, x_te, y_tr = load_data(
    features_folder=features_folder,
    data_folder=data_folder
)

# Pre-processing
if use_preprocessing:
    preprocessing_steps = [LowVarianceFeaturesRemover(), CenterScaler()]
else:
    preprocessing_steps = None
x_tr, x_te, groups_tr, _ = preprocess_data(
    x_tr, x_te, preprocessing_steps=preprocessing_steps
)

# Classification
clf = classify(
    est=est_list[est_name],
    x_tr=x_tr.values,
    y_tr=y_tr.values.ravel(),
    groups_tr=groups_tr.values,
    x_te=x_te.values,
    test_index=x_te.index,
    perform_evaluation=perform_evaluation,
    perform_cross_validation=perform_cross_validation,
    cv_params=cv_params[est_name],
                                 '/', data_folder=data_folder)
    est_list[est_name].set_params(metric='precomputed')
elif est_name[:3] == 'CDF':
    x_tr, x_te, y_tr = load_data(features_folder=cdf_folder, data_folder=data_folder)
else:
    x_tr, x_te, y_tr = load_data(features_folder=isi_folder, data_folder=data_folder)

# Pre-process
preprocessing_steps = []
resampling_steps = [RandomUnderSampler()]
x_tr, x_te, groups_tr, y_tr = preprocess_data(
    x_tr, x_te, y_tr=y_tr,
    preprocessing_steps=preprocessing_steps,
    resampling_steps=resampling_steps)

# Pre-sort the values to speed up distance computation
if not use_precomputed:
    if est_name in ['KS']:
        x_tr.iloc[:, :] = np.sort(x_tr.values, axis=1)
        x_te.iloc[:, :] = np.sort(x_te.values, axis=1)

# Classification
clf = classify(est=est_list[est_name],
               x_tr=x_tr.values,
               y_tr=y_tr.values.ravel(),
               groups_tr=groups_tr.values,
               x_te=x_te.values,
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 29 10:12:01 2019

@author: tothp
"""
import ml
import numpy as np
import preprocessing

# prepare frequency data
data = preprocessing.preprocess_data("C:\\projects\\gwas")
frequencies = data["genotype"].count_alleles(data["allele_names"], standardize_alleles=True)
x = frequencies.loc[:, frequencies.columns != "UME_name"].to_numpy()

#%% plot variance along positions
var = np.std(data["genotype"], axis=0)

# train frequency network

# prepare snp data

# train snp network
    else:
        ax2.annotate('READY', xy=(0.1, 0.5))


if __name__ == '__main__':
    ordered_keys = sorted(preprocessing.boxes.keys())
    n_boxes = len(ordered_keys)
    f, axarr = plt.subplots(n_boxes, 2, gridspec_kw={'width_ratios': [3, 1]})
    fnames = ['x', 'x', 'x', 'x', 'x']
    while True:
        try:
            df = pd.read_csv(preprocessing.csv_address)
            fnames = df.name
        except:
            pass
        for i in range(n_boxes):
            box_number = ordered_keys[i]
            if fnames[box_number] != 'x':
                try:
                    data = preprocessing.preprocess_data(fnames[box_number])
                    if n_boxes > 1:
                        plotter(axarr[i, 0], data, 60)
                        water(axarr[i, 1], data)
                    else:
                        plotter(axarr[0], data, 60)
                        water(axarr[1], data)
                except:
                    pass
        plt.show()
        plt.pause(0.05)