def train_with_generator(path, file, file_test, output, epochs, batch_size,
                         checkpoint, shuffle=False, fast=False,
                         sample_rate=16000):
    """Train the ``get_model2`` network from batch generators.

    Args:
        path: Not used by this function (kept for interface compatibility).
        file: Passed to ``prepare_data`` to obtain the training samples.
        file_test: Passed to ``prepare_data`` to obtain the validation samples.
        output: Base name; logs go to ``'./logs/' + output`` and the model
            checkpoint to ``output + '.hdf5'``.
        epochs: Number of epochs passed to ``fit_generator``.
        batch_size: Batch size used by both generators and for step counts.
        checkpoint: Passed as ``period`` to ``ModelCheckpoint``.
        shuffle: Not used by this function (kept for interface compatibility).
        fast: Forwarded to both generators.
        sample_rate: Forwarded to both generators as ``new_sample_rate``.
    """
    # prepare data
    x_train, y_train = prepare_data(file)
    x_test, y_test = prepare_data(file_test)
    num_classes = len(set(y_train))
    print('nombre de classes : ', num_classes)

    train_generator = batch_generator_shuffle(
        batch_size, x_train, y_train, load_data_with_mel_spectrogram,
        n_mels=40, transpose=True, data_aug=True, proba_data_aug=0.7,
        coeff_amplitude=True, coeff_time=4000, fast=fast,
        new_sample_rate=sample_rate)
    # NOTE(review): validation batches are also built with data_aug=True --
    # confirm that augmenting the validation set is intended.
    test_generator = batch_generator(
        batch_size, x_test, y_test, load_data_with_mel_spectrogram,
        n_mels=40, transpose=True, data_aug=True, proba=0.7,
        coeff_amplitude=True, coeff_time=4000, fast=fast,
        new_sample_rate=sample_rate)

    # Integer ceiling division: gives the same result on Python 2 and 3,
    # whereas math.ceil(a / b) floors first under Python 2 integer division.
    step_train = -(-len(x_train) // batch_size)
    step_test = -(-len(x_test) // batch_size)
    print('shape:', len(x_train))
    print('shape:', len(x_test))
    print('step train :', step_train)
    print('step test :', step_test)

    # network
    model = get_model2((32, 40), num_classes, 3, 'relu')  # vgg_style((99,161), num_classes)
    model.summary()
    sgd = keras.optimizers.SGD(lr=0.01, decay=1e-5, momentum=0.9,
                               nesterov=True)
    model.compile(loss='categorical_crossentropy', optimizer=sgd,
                  metrics=['categorical_accuracy'])

    # callback
    callback_tensorboard = keras.callbacks.TensorBoard(
        log_dir='./logs/' + output, histogram_freq=0, batch_size=batch_size,
        write_graph=True, write_grads=False, write_images=False,
        embeddings_freq=0, embeddings_layer_names=None,
        embeddings_metadata=None)
    # Halve the learning rate when val_loss stops improving for 2 epochs.
    reduce_lr = keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.5, patience=2, verbose=1, mode='auto',
        epsilon=0.0001, cooldown=0, min_lr=0)
    # Only the best model is kept, in a single file (no -{epoch:02d} suffix).
    checkpoints = ModelCheckpoint(
        output + '.hdf5', verbose=1, save_best_only=True, period=checkpoint,
        save_weights_only=False)
    callbacks_list = [callback_tensorboard, checkpoints, reduce_lr]

    # train
    model.fit_generator(train_generator, steps_per_epoch=step_train,
                        epochs=epochs, verbose=1,
                        validation_data=test_generator,
                        validation_steps=step_test,
                        callbacks=callbacks_list)
def main(training_file, test_file, submission_file, ratio):
    """Train a random forest on the training file and write a submission.

    Args:
        training_file: Path handed to ``utilities.read_file`` for training data.
        test_file: Path handed to ``utilities.read_file`` for test data.
        submission_file: Destination passed to
            ``utilities.write_submission_file``.
        ratio: Forwarded to ``preprocess.down_sample`` for the training split.
    """
    data = utilities.read_file(training_file)
    test_data = utilities.read_file(test_file)

    # print() with a single argument behaves the same on Python 2 and 3.
    print('Preparing data...')
    x, y = preprocess.prepare_data(data)
    refid, x_test = preprocess.prepare_test_data(test_data)
    x, x_test = preprocess.preprocess_features(x, x_test)

    print('Feature extracting...')
    x, x_test = feature_extraction.create_feature(x, y, x_test)
    # Keep the 300 best features, selected on the training data only.
    indices = feature_extraction.get_best_k_feature_indices(x, y, 300)
    x = feature_extraction.get_best_k_features(x, indices)
    x_test = feature_extraction.get_best_k_features(x_test, indices)
    print('Get %s features.' % len(x[0]))

    x_train, x_cv, y_train, y_cv = cross_validation.train_test_split(
        x, y, test_size=.3, random_state=0)
    # Down-sampling is applied after the split, so the CV part is untouched.
    x_train, y_train = preprocess.down_sample(x_train, y_train, ratio)
    clf = classification.random_forest(x_train, y_train, x_cv, y_cv)

    print('Predicting...')
    predict = clf.predict_proba(x_test)
    utilities.write_submission_file(submission_file, refid, predict)
def train_word2vec():
    """Run 2-fold CV of the word2vec + LSTM essay scorer and print kappa.

    For each fold a word2vec model is built from the training essays only,
    essays are turned into averaged feature vectors, an LSTM model from
    ``get_model`` is fitted, and the quadratic-weighted Cohen's kappa on the
    held-out essays is printed. The mean kappa over folds is printed last.
    All hyperparameters are read from module-level names.
    """
    cv = KFold(n_splits=2, shuffle=True)
    X, y, _ = prepare_data(dataset_path=dataset_path)
    results = []

    for fold_count, (traincv, testcv) in enumerate(cv.split(X), start=1):
        print("\n--------Fold {}--------\n".format(fold_count))
        # get the train and test from the dataset.
        X_train, X_test = X.iloc[traincv], X.iloc[testcv]
        y_train, y_test = y.iloc[traincv], y.iloc[testcv]

        train_sentences = [
            essay_to_wordlist(essay, remove_stopwords=True)
            for essay in X_train['essay']
        ]

        # word2vec embedding, built from the training fold only
        print("Converting sentences to word2vec model")
        model, _ = build_word2vec(train_sentences, num_workers, num_features,
                                  min_word_count, context, downsampling)
        trainDataVecs = np.array(
            getAvgFeatureVecs(train_sentences, model, num_features))

        test_sentences = [
            essay_to_wordlist(essay, remove_stopwords=True)
            for essay in X_test['essay']
        ]
        testDataVecs = np.array(
            getAvgFeatureVecs(test_sentences, model, num_features))

        # Insert a length-1 middle axis: (samples, features) -> (samples, 1, features).
        trainDataVectors = np.reshape(
            trainDataVecs, (trainDataVecs.shape[0], 1, trainDataVecs.shape[1]))
        testDataVectors = np.reshape(
            testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))

        lstm_model = get_model(Hidden_dim1=Hidden_dim1,
                               Hidden_dim2=Hidden_dim2,
                               return_sequences=return_sequences,
                               dropout=dropout,
                               recurrent_dropout=recurrent_dropout,
                               input_size=input_size,
                               activation=activation,
                               model_name=model_name,
                               optimizer=optimizer,
                               loss_function=loss_function)
        lstm_model.fit(trainDataVectors, y_train, batch_size=batch_size,
                       epochs=epoch)

        y_pred = np.around(lstm_model.predict(testDataVectors))
        # nan_to_num returns a new array; the result must be assigned for the
        # NaN replacement to take effect.
        y_pred = np.nan_to_num(y_pred)

        result = cohen_kappa_score(y_test.values, y_pred, weights='quadratic')
        print("Kappa Score: {}".format(result))
        results.append(result)

    print("Average kappa score value is : {}".format(
        np.mean(np.asarray(results))))
def evaluate_accuracy(sess, acc, x, x_mask, y, eval_data, eval_labels, mb,
                      maxlen):
    """Return the mean of ``acc`` over full minibatches of the eval set.

    The evaluation set is walked in consecutive index windows of size ``mb``;
    only ``len(eval_data) // mb`` full windows are evaluated, so a trailing
    partial window is not included in the average.

    Args:
        sess: Session used to run ``acc``.
        acc: Accuracy tensor/op evaluated for every minibatch.
        x, x_mask, y: Feed-dict keys for inputs, input mask and labels.
        eval_data: Data handed to ``preprocess.prepare_data``.
        eval_labels: Labels handed to ``preprocess.prepare_labels``.
        mb: Minibatch size.
        maxlen: Forwarded to ``preprocess.prepare_data``.

    Returns:
        Mean of the per-minibatch accuracies.
    """
    num_batches = len(eval_data) // mb
    batch_acc = np.zeros(num_batches)

    for batch_no in trange(num_batches):
        # Indices of the batch_no-th consecutive window of size mb.
        first = batch_no * mb
        window = np.arange(first, first + mb)

        batch_x, batch_mask = preprocess.prepare_data(window, eval_data, maxlen)
        batch_y = preprocess.prepare_labels(window, eval_labels)

        batch_acc[batch_no] = sess.run(
            acc, feed_dict={x: batch_x, x_mask: batch_mask, y: batch_y})

    return np.mean(batch_acc)
def main(): # Setup configuration class config = TrainConfig() # Load dataset iterator train_iter, test_iter, config = preprocess.prepare_data(config) config.display() # Setup and build coarse2fine training inference attn = Attention enc = Encoder dec = Decoder model = Seq2Seq(enc, dec,).to(config.DEVICE) # Initialize network -> Load the pretrained embeddings onto our model ## pretrained_embeddings = quote.vocab.vectors ## model.embedding.weight.data.copy_(pretrained_embeddings) # initialize the model to a special initialization, and calculate the trainable parameter model.apply(init_weights_base) print(colored(f'The model has {count_parameters(model):,} trainable parameters'),'red') # Initialize the loss function and create an optimizer criterion = nn.CrossEntropyLoss(ignore_index=config.TRG_PAD_IDX) optimizer = optim.Adam(model.parameters()) # Save vocabulary at last # Start training if config.K_FOLD: pass else: for epoch in range(N_EPOCHS): best_model, best_epach =
def build_visualization():
    """Build a word2vec model per CV fold and visualise it.

    For each of the 2 folds, the training essays are split into sentences,
    a word2vec model is built, the most similar words of the first 10 entries
    of ``sorted_dic`` are printed, and ``tsne_plot`` is called on the model.
    Hyperparameters are read from module-level names.
    """
    cv = KFold(n_splits=2, shuffle=True)
    X, _y = prepare_data(dataset_path=dataset_path)

    # enumerate() keeps the fold number in step with the loop; the previous
    # manual counter was never incremented, so every fold printed "Fold 1".
    for fold_count, (traincv, _testcv) in enumerate(cv.split(X), start=1):
        print("\n--------Fold {}--------\n".format(fold_count))

        # Only the training part of the fold feeds the embedding.
        train_essays = X.iloc[traincv]['essay']
        train_sentences = []
        for essay in train_essays:
            # get all the sentences from the essay
            train_sentences += essay_to_sentences(essay, remove_stopwords=True)

        # word2vec embedding
        print("Converting sentences to word2vec model")
        model, sorted_dic = build_word2vec(train_sentences, num_workers,
                                           num_features, min_word_count,
                                           context, downsampling)

        for k, _ in sorted_dic[:10]:
            print("----------most_similar_word_for:" + str(k) +
                  "--------------")
            print(model.wv.most_similar(k))

        tsne_plot(model)
def _bert_first_token_features(model, tokenizer, essays, max_length=200):
    """Return the first-position hidden state of ``model`` for every essay.

    Essays are encoded with ``tokenizer.encode`` (special tokens added,
    ``max_length`` forwarded), right-padded with 0 to the longest sequence,
    and run through ``model`` without gradients. Padding positions are masked
    out via the attention mask.
    """
    tokenized = essays.apply(lambda text: tokenizer.encode(
        text, add_special_tokens=True, max_length=max_length))

    max_len = max((len(ids) for ids in tokenized.values), default=0)
    padded = np.array(
        [ids + [0] * (max_len - len(ids)) for ids in tokenized.values])
    attention_mask = np.where(padded != 0, 1, 0)

    input_ids = torch.tensor(padded)
    mask = torch.tensor(attention_mask)
    with torch.no_grad():
        last_hidden_states = model(input_ids, attention_mask=mask)
    # First output, position 0 of every sequence (presumably the [CLS]
    # token -- confirm against the tokenizer).
    return last_hidden_states[0][:, 0, :].numpy()


def train_bert_sets():
    """Run 5-fold CV of the DistilBERT-features + LSTM scorer per essay set.

    For every set returned by ``prepare_data`` the essays are embedded with a
    pretrained DistilBERT model, an LSTM model from ``get_model`` is fitted on
    the training fold, and the quadratic-weighted Cohen's kappa on the test
    fold is printed. Hyperparameters are read from module-level names.

    Returns:
        A list with one list of per-fold kappa scores for each set.
    """
    warnings.filterwarnings('ignore')

    ## Sets experiment BERT
    _data, _target, sets = prepare_data(dataset_path=dataset_path)

    cuda = torch.device('cuda')
    # For DistilBERT:
    model_class, tokenizer_class, pretrained_weights = (
        ppb.DistilBertModel, ppb.DistilBertTokenizer,
        'distilbert-base-uncased')
    ## Want BERT instead of distilBERT? Use:
    ## (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

    # Load pretrained model/tokenizer once: the model is only used for
    # inference under no_grad, so it is identical for every set.
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    model = model_class.from_pretrained(pretrained_weights)

    all_sets_score = []
    for set_count, s in enumerate(sets, start=1):
        print("\n--------SET {}--------\n".format(set_count))
        X = s
        y = s['domain1_score']
        cv = KFold(n_splits=5, shuffle=True)
        results = []

        with torch.cuda.device(cuda):
            for fold_count, (traincv, testcv) in enumerate(cv.split(X),
                                                           start=1):
                torch.cuda.empty_cache()
                print("\n--------Fold {}--------\n".format(fold_count))

                # get the train and test from the dataset.
                X_train, X_test = X.iloc[traincv], X.iloc[testcv]
                y_train, y_test = y.iloc[traincv], y.iloc[testcv]

                train_features = _bert_first_token_features(
                    model, tokenizer, X_train['essay'])
                test_features = _bert_first_token_features(
                    model, tokenizer, X_test['essay'])

                # Insert a length-1 middle axis for the LSTM input.
                train_x, train_y = train_features.shape
                test_x, test_y = test_features.shape
                trainDataVectors = np.reshape(train_features,
                                              (train_x, 1, train_y))
                testDataVectors = np.reshape(test_features,
                                             (test_x, 1, test_y))

                lstm_model = get_model(Hidden_dim1=Hidden_dim1,
                                       Hidden_dim2=Hidden_dim2,
                                       return_sequences=return_sequences,
                                       dropout=dropout,
                                       recurrent_dropout=recurrent_dropout,
                                       input_size=input_size,
                                       activation=activation,
                                       loss_function=loss_function,
                                       optimizer=optimizer,
                                       model_name=model_name)
                history = lstm_model.fit(trainDataVectors, y_train,
                                         batch_size=batch_size, epochs=epoch)
                plot_accuracy_curve(history)

                y_pred = np.around(lstm_model.predict(testDataVectors))
                # nan_to_num returns a new array; assign it so NaNs are
                # actually replaced before scoring.
                y_pred = np.nan_to_num(y_pred)

                # evaluate the model
                result = cohen_kappa_score(y_test.values, y_pred,
                                           weights='quadratic')
                print("Kappa Score: {}".format(result))
                results.append(result)

                # Drop the Keras graph of this fold before building the next.
                tf.keras.backend.clear_session()

        all_sets_score.append(results)
        print("Average kappa score value is : {}".format(
            np.mean(np.asarray(results))))

    return all_sets_score
def _conn_validate(model, inputs, labels, num_words, batch_size, iterations):
    """Score a whole validation set in mini-batches and return (loss, auc).

    The model is called with ``TEST_FLAG=1`` on consecutive slices of
    ``batch_size`` rows; the final slice is open-ended so a short last batch
    is still covered. Outputs are concatenated, the unweighted
    ``binary_cross_entropy_with_logits`` loss against ``labels`` is computed
    and the AUC is obtained from ``get_auc``.
    """
    outputs = []
    num_valid_batches = int(np.ceil(len(labels) / batch_size))
    for b in range(num_valid_batches):
        lo = batch_size * b
        if b == num_valid_batches - 1:
            batch_inputs, batch_num_words = inputs[lo:, :], num_words[lo:]
        else:
            hi = batch_size * (b + 1)
            batch_inputs, batch_num_words = inputs[lo:hi, :], num_words[lo:hi]
        # NOTE: default dropout is '0' for valid and test case!
        # (as we need to take the ensemble of DNNs)
        batch_output = model.forward(batch_inputs, batch_num_words,
                                     iterations, TEST_FLAG=1)
        outputs.append(batch_output.data.cpu().numpy())

    scores = list(itertools.chain.from_iterable(outputs))
    # NOTE(review): the CUDA tensor type is used unconditionally, even when
    # args.USE_CUDA is not set -- confirm CPU-only runs are not expected.
    total_valid_output = Variable(
        torch.from_numpy(np.array(scores)).type(torch.cuda.FloatTensor))
    loss = F.binary_cross_entropy_with_logits(
        total_valid_output.view(len(labels)), labels, weight=None)
    auc = get_auc(labels.data.cpu().numpy(), scores)
    return loss, auc


def main():
    """Train and cross-validate the CoNN document classifier.

    Prepares (or loads from a pickle) the cleaned data, then, when
    ``args.DO_CV`` is set, runs stratified K-fold training: each fold builds a
    fresh model, trains for ``args.NUM_BATCHES`` sampled mini-batches with
    cost-sensitive weights, optionally evaluates on the validation fold every
    ``args.EVALUATE_EVERY`` iterations, and finally records validation loss
    and AUC. Results are pickled to ``cv_results_multisent`` and the last
    model is saved to ``CoNN_model_multisent.pt``.

    Raises:
        ValueError: If ``args.MODEL_NAME`` is not ``'CoNN'`` or
            ``args.optimizer`` is not ``'adam'`` (previously these cases ran
            into an undefined ``model``/``optimizer``).
    """
    print("Document classification for Multi-domain sentimental dataset")
    basefile = 'sorted_data'  # folder containing 342k data
    if args.PREP_DATA == True:  # prepare data
        print('cleaning the train data')
        cleaned_data = prepare_data(basefile)
        print('saving the prepared data')
        prepared_data_list = cleaned_data
        with open("preprocessed_multisent_data.txt", "wb") as f:
            pickle.dump(prepared_data_list, f)
    else:
        print('Loading the prepared data')
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load a file produced by this script.
        with open("preprocessed_multisent_data.txt", "rb") as f:
            prepared_data_list = pickle.load(f)
        cleaned_data = prepared_data_list

    # Just for doing K fold CV: each record holds the label at index 0 and
    # the document at index 1.
    dataX = [d[1] for d in cleaned_data]
    dataY = [d[0] for d in cleaned_data]

    start = time.time()
    skf = StratifiedKFold(
        n_splits=args.K_FOLD, shuffle=True,
        random_state=None)  # Change random_state to 'int', act as a seed
    print(skf)
    k = 0
    valid_loss = []
    valid_auc = []
    # Run for these many batches (ref Alg 2)
    num_batches = args.NUM_BATCHES
    batch_size = args.BATCH_SIZE
    if batch_size % 4 != 0:
        print(
            "********POSSIBLE ERROR, as the data may not be properly divisble over all the GPUs "
        )
    learning_rate = args.lr
    loss_plot = {}
    auc_plot = {}

    if args.DO_CV == True:
        for train_index, valid_index in skf.split(dataX, dataY):
            loss_plot[k] = []
            auc_plot[k] = []
            # get the train and test dataset
            (train_cleaned_data, train_labels, train_num_words, dictionary,
             valid_cleaned_data, valid_labels, valid_num_words) = get_CV_data(
                 cleaned_data, train_index, valid_index)

            # NOTE(review): the message says "valid set" but the counts are
            # taken from train_labels -- confirm which one is intended.
            print('value counts for valid set')
            bc = np.bincount(train_labels)
            ii = np.nonzero(bc)[0]
            print(np.vstack((ii, bc[ii])).T)

            print('Getting different weights for cost sensitive learning')
            weight_vector = cost_sensitive_weights(train_labels)
            train_cleaned_data = numpy_fillna(train_cleaned_data)
            # Documents are truncated to the mean training length.
            TRUNC_LENGTH = int(np.mean(train_num_words))
            print('Trucation length, done for vectorization of code',
                  TRUNC_LENGTH)
            train_cleaned_data, train_num_words = truncate_data(
                train_cleaned_data, train_num_words,
                TRUNC_LENGTH)  # max_words in a Document
            VOCAB_SIZE = int(len(dictionary)) + 1

            # Re-initialise the model for every fold.
            # Compare by value: `is` tests object identity and is unreliable
            # for strings that come from the command line.
            if args.MODEL_NAME == 'CoNN':
                model = CoNN(args.MODEL_NAME, args.hilbert_DIM, args.WORD2VEC,
                             VOCAB_SIZE, TRUNC_LENGTH,
                             dropout_p=args.DROPOUT, USE_CUDA=args.USE_CUDA,
                             num_CLASS=args.num_CLASS)
            else:
                print("model not present")
                raise ValueError(
                    "unsupported MODEL_NAME: %r" % (args.MODEL_NAME,))

            if args.USE_CUDA == True:
                model = model.cuda()
                print('WRAPPING around DataParallel.. ')
                model = nn.DataParallel(model)

            if args.optimizer == 'adam':
                optimizer = torch.optim.Adam(model.parameters(),
                                             lr=learning_rate,
                                             betas=(0.9, 0.999), eps=1e-08,
                                             weight_decay=0)
            else:
                raise ValueError(
                    "unsupported optimizer: %r" % (args.optimizer,))

            print('Adding Scheduler')
            scheduler = MultiStepLR(
                optimizer,
                milestones=[1000, 1200, 1500, 1800, 2400, 3000, 3500],
                gamma=0.5)
            print('K_fold = ', k)

            for t in range(1, num_batches + 1):
                scheduler.step()
                if t % 100 == 0:
                    # print learning rate after every 100 iterations
                    for param_group in optimizer.param_groups:
                        print('lr ', param_group['lr'])

                # ------------------------- TRAINING -------------------------
                # sample docs from train_cleaned_data
                Xs, ys, Xs_num_words, batch_weight_vector = \
                    vectorized_sample_data(train_cleaned_data, train_labels,
                                           train_num_words, batch_size,
                                           weight_vector)
                optimizer.zero_grad()  # zero the gradient buffers
                output = model.forward(Xs, Xs_num_words, args.ITERATIONS)
                # NOTE: use weights for cost sensitive learning
                loss = F.binary_cross_entropy_with_logits(
                    output.view(len(ys)), ys, weight=batch_weight_vector)

                # calculating the accuracy
                ys_actual = ys.data.cpu().numpy()
                scores_pred = output.data.cpu().numpy()
                auc = get_auc(ys_actual, scores_pred)
                print('time since start ',
                      time_since(start, float(t) / float(num_batches)),
                      '(%d %d%%)' % (t, float(t) / float(num_batches) * 100),
                      'loss', loss.data.cpu().numpy(), ' auc ', auc)
                loss_plot[k].append(loss.data.cpu().numpy())
                auc_plot[k].append(auc)
                loss.backward()
                optimizer.step()

                # ----------------- INTERMEDIATE EVALUATION -----------------
                if args.EVALUATE_TEST == True and t % args.EVALUATE_EVERY == 0:
                    if t == args.EVALUATE_EVERY:
                        # Pad/truncate the validation fold on first use only.
                        valid_cleaned_data = numpy_fillna(valid_cleaned_data)
                        valid_cleaned_data, valid_num_words = truncate_data(
                            valid_cleaned_data, valid_num_words, TRUNC_LENGTH)
                    Xt, yt, Xt_num_words, _ = vectorized_sample_data(
                        valid_cleaned_data, valid_labels, valid_num_words,
                        'dummy_len(valid_index)', TESTING_FLAG=True,
                        weight_vector='dummy')
                    # NOTE: maintaining the same batch size for validation too
                    step_loss, step_auc = _conn_validate(
                        model, Xt, yt, Xt_num_words, batch_size,
                        args.ITERATIONS)
                    print('Valid loss for k =', k, ' Iteration num ', t,
                          '\n loss ', step_loss.data.cpu().numpy()[0],
                          '\n auc ', step_auc)

            loss_plot[k] = np.vstack(loss_plot[k])
            loss_plot[k] = loss_plot[k][:, 0]

            # ------------------------- VALIDATION -------------------------
            print('checking loss on validation set')
            # NOTE(review): when intermediate evaluation ran, the validation
            # fold has already been padded and truncated once -- confirm that
            # repeating both steps here is harmless.
            valid_cleaned_data = numpy_fillna(valid_cleaned_data)
            valid_cleaned_data, valid_num_words = truncate_data(
                valid_cleaned_data, valid_num_words,
                TRUNC_LENGTH)  # max_words in a Document
            Xs, ys, Xs_num_words, _ = vectorized_sample_data(
                valid_cleaned_data, valid_labels, valid_num_words,
                'dummy_len(valid_index)', TESTING_FLAG=True,
                weight_vector='dummy')
            loss, auc = _conn_validate(model, Xs, ys, Xs_num_words,
                                       batch_size, args.ITERATIONS)
            print('Valid loss for k =', k, ' is ', loss, '\n', ' auc ', auc)
            valid_loss.append(loss.data.cpu().numpy())
            valid_auc.append(auc)
            k += 1

        # Summary over folds; only meaningful after cross-validation ran.
        valid_loss = np.vstack(valid_loss)
        valid_loss = valid_loss[:, 0]
        print('K-fold valid loss ', valid_loss)
        print('K-fold auc ', valid_auc)
        print('avg auc ', np.array(valid_auc).mean(), ' std dev',
              np.array(valid_auc).std())
        with open('cv_results_multisent', 'wb') as fp:
            pickle.dump([valid_loss, valid_auc, loss_plot, auc_plot], fp)
        print('saving the model')
        torch.save(model, 'CoNN_model_multisent.pt')
    return
# ----------------- THE STEPS BELOW ARE SPECIFIC TO EACH DATA SET ----------------- # ----------------- WRITE YOUR OWN data2tsv() and prepare_custom() FUNCTIONS # ----------------- THE STEPS BELOW ARE SPECIFIC TO EACH DATA SET ----------------- if to_tsv == True: # convert raw data to input tsv format # raw_corpus2tsv.xml2tsv("../test/") raw_corpus2tsv.xml2tsv("/gpfs/data/datasci/paper-m/raw/hansard_xml/") if custom_prep == True: # do data prep custom to hansard text = preprocess.prepare_custom(data_dt) # ----------------- THE STEPS ABOVE ARE SPECIFIC TO EACH DATA SET ----------------- if prepare_data == True: # preprocess tsv data for topic modeling preprocess.prepare_data(text) if mallet_import == True: # load the mallet module os.system("module load mallet/2.0.8rc3") mallet.imprt() if topic_model == True: # load the mallet module os.system("module load mallet/2.0.8rc3") # import preprocessed data to mallet objects and train LDA model for topic in n_topics: mallet.lda(n_topics) if rank_documents == True: # rank documents by chosen topic(s)
import os

# Set before importing tensorflow so the settings are in place at import time.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf
import preprocess as pr
import mathfunc as mf
import random
import keras
import numpy as np
from keras import optimizers
from keras.models import Sequential
from keras.layers import Activation, Dense
import matplotlib.pyplot as plt
import learn
from keras.models import model_from_json
from keras.models import load_model

# Rebuild the model from its JSON architecture and saved weights.
# The context manager closes the file handle, which was previously leaked.
with open('model_0.json', 'r') as json_file:
    model = model_from_json(json_file.read())
model.load_weights('model_0.h5')

data = pr.prepare_data('online_shoppers_intention.csv', 12330, 18)
inputs = np.asarray(pr.extractInputs(data))
targets = np.asarray(pr.extractTargets(data))

# Predict for the first sample only and show it next to its target.
predictedOutput = model.predict(np.array([inputs[0]]))
print('Real output : ', targets[0])
print('Predicted output : ', predictedOutput[0])
parser.add_argument( '--crop', default=1.0, type=float, help="If 1 no crop, if 0.25 crop 25%% from top and bottom") parser.add_argument('--tokenize', default=True, type=str2bool) # set false to read pretokenized data parser.add_argument('--save_models', default=True, type=str2bool) args = parser.parse_args() # args, unknown = parser.parse_known_args() # use this version in jupyter notebooks to avoid conflicts init_random_seeds(args.seed) # if args.prepare: prepare_data(args) df_train, df_dev, df_test = read_files(args) # automatically identify the number of labels: num_labels = len( np.union1d(np.union1d(df_train['label'], df_dev['label']), df_test['label'])) args.num_labels = num_labels logging.info('Identified {} labels in the dataset.'.format(num_labels)) if args.tokenize: train_data, dev_data, test_data = tokenize_data( args, df_train, df_dev, df_test) else: train_data, dev_data, test_data = read_tokenized_data(
def main():
    """Train and evaluate an SVM per data subsample from the command line.

    Parses the CLI options, loads the CSV, prepares the data, splits it into
    ``--no_of_subsamples`` consecutive chunks and, for each chunk, fits a
    ``StandardScaler`` + ``SVC`` pipeline on the leading
    ``--train_test_ratio`` fraction and evaluates it with ``compute_acc``.
    Per-subsample and averaged metrics are printed.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--path", type=str, default="dataset/^NSEI (3).csv",
                     help="path of csv file")
    cli.add_argument("--trading_days", type=int, default=30,
                     help="Number of trading days")
    cli.add_argument("--no_of_subsamples", type=int, default=1,
                     help="Number of samples to take from csv file")
    cli.add_argument("--kernel", type=str, default="rbf",
                     help="the kernel for SVM",
                     choices=["linear", "rbf", "poly", "custom",
                              "cobb-douglas"])
    cli.add_argument("--degree", type=int, default=3,
                     help="value of p in polynomial/custom kernel")
    cli.add_argument("--C", type=float, default=1.0,
                     help="the regularisation parameter for SVM")
    cli.add_argument("--gamma", type=float, default=1.0,
                     help="the inner product coefficient in polynomial kernel")
    cli.add_argument("--coef0", type=float, default=0.0,
                     help="coefficient for polynomial kernel")
    cli.add_argument("--train_test_ratio", type=float, default=0.75,
                     help="fraction of train samples")
    args = cli.parse_args()

    path = args.path
    trading_days = args.trading_days
    no_of_subsamples = args.no_of_subsamples
    kernel = args.kernel
    degree = args.degree
    C = args.C
    # gamma is forced for "poly" and "rbf"; the CLI value only matters for
    # the remaining kernels.
    gamma = args.gamma
    if kernel == "poly":
        gamma = 1.0
    if kernel == "rbf":
        gamma = "scale"
    coef0 = args.coef0
    train_test_ratio = args.train_test_ratio

    print("Details: ")
    print("Extracting data from: " + str(path))
    print("Trading Days: " + str(trading_days))
    print("Number of Subsamples: " + str(no_of_subsamples))
    if kernel == "poly":
        assert (
            gamma == 1.0
        ), "Polynomial should have gamma=1.0, otherwise it is Cobb-Douglas Kernel"
        print("Kernel: polynomial")
        print("Degree: " + str(degree))
    elif kernel == "cobb-douglas":
        assert (
            gamma != 1.0
        ), "Cobb-Douglas should have gamma!=1.0, otherwise it is Polynomial Kernel"
        print("Kernel: cobb-douglas")
        print("Gamma: " + str(gamma))
        print("Degree: " + str(degree))
        # From here on Cobb-Douglas shares the "poly" code path.
        kernel = "poly"
    elif kernel == "custom":
        print("Kernel: custom")
        print("Degree: " + str(degree))
    else:
        print("Kernel: " + str(kernel))
    print("Regularisation Parameter, C: " + str(C))

    # define the custom kernels
    def poly_cobb_kernel(X, Y):
        return gamma * (np.dot(X, Y.T)) ** degree

    def custom_kernel(X, Y):
        return 1 / (1 + np.dot(X, Y.T) ** degree)

    callable_kernels = {"custom": custom_kernel, "poly": poly_cobb_kernel}
    shared_svc_options = {
        "C": C,
        "class_weight": "balanced",
        "cache_size": 100000,
    }

    df = load_csv(path)
    data = prepare_data(data_f=df, horizon=trading_days, alpha=0.9)

    # remove the output from the input
    without_gain = [col for col in data.columns if col not in ["gain"]]
    dataA = np.array_split(data[without_gain], no_of_subsamples)

    # Metric positions: 0 accuracy, 1 precision, 2 recall, 3 F1.
    totals = {"training": [0, 0, 0, 0], "test": [0, 0, 0, 0]}
    t0 = time.time()
    stats = []
    for i in tqdm(range(no_of_subsamples)):
        inputs_only = [
            col for col in data.columns if col not in ["gain", "pred"]
        ]
        chunk = dataA[i]
        X = chunk[inputs_only]
        y = chunk["pred"]
        print((y == 1).sum())
        print((y == 0).sum())

        # Chronological split: the leading part trains, the rest tests.
        X_train = X[: int(train_test_ratio * len(X))]
        y_train = y[: int(train_test_ratio * len(y))]
        X_test = X[int(train_test_ratio * len(X)):]
        y_test = y[int(train_test_ratio * len(y)):]

        if kernel in callable_kernels:
            svc = SVC(kernel=callable_kernels[kernel], **shared_svc_options)
        else:
            svc = SVC(kernel=kernel, degree=degree, coef0=coef0, gamma=gamma,
                      **shared_svc_options)
        clf = make_pipeline(StandardScaler(), svc)
        clf.fit(X_train, y_train)

        metrics = compute_acc(clf, X_train, y_train, X_test, y_test)
        stats.append(metrics)
        for split in ("training", "test"):
            for position in range(4):
                totals[split][position] += metrics[split][position]

    print("\nTime taken: " + str((time.time() - t0) / 60) + " minutes")

    report_rows = (
        ("training", ("Training Accuracy:\t", "Training Precision:\t",
                      "Training Recall:\t", "Training F1:\t\t")),
        ("test", ("Test Accuracy:\t\t", "Test Precision:\t\t",
                  "Test Recall:\t\t", "Test F1:\t\t")),
    )

    for number, subsample_stats in enumerate(stats, start=1):
        print("Stats for Subsample#" + str(number))
        for split, labels in report_rows:
            for position, label in enumerate(labels):
                print(label + str(subsample_stats[split][position]))
            print("\n")

    print("Average Results")
    for row_no, (split, labels) in enumerate(report_rows):
        if row_no:
            print("\n")
        for position, label in enumerate(labels):
            print("Average " + label +
                  str(totals[split][position] / no_of_subsamples))
# Best accuracies seen so far; both start at zero.
max_val_accuracy = 0.
max_test_accuracy = 0.
with tf.Session() as sess:
    # Initialize learnable weights, embeddings
    # (W_emb is fed with the _W_emb values for the init ops.)
    sess.run([init_op, W_init_op], feed_dict={W_emb: _W_emb})
    # Summary writer
    # Only created when event logging is enabled; the training loop below
    # checks the same flag before using it.
    if args["log_tfevents"]:
        writer = tf.summary.FileWriter(os.path.join(output_dir, "TF_logs"), sess.graph)
    for i in trange(args["train_iters"]):
        # Sample minibatch
        # args["mb"] distinct training indices, drawn without replacement.
        idx = np.random.choice(N_train, args["mb"], replace=False)
        _x, _x_mask = preprocess.prepare_data(idx, data["train"], args["MAX_LEN"])
        _y = preprocess.prepare_labels(idx, labels["train"])
        # Run one train step
        feed_dict = {x: _x, x_mask: _x_mask, y: _y}
        if args["log_tfevents"]:
            # Also fetch the merged summary and record it for iteration i.
            _, _sum = sess.run([train_step, merged_sum], feed_dict=feed_dict)
            writer.add_summary(_sum, i)
        else:
            sess.run(train_step, feed_dict=feed_dict)
        # Validate
        # Runs every args["validation_rate"] iterations, including i == 0.
        if i % args["validation_rate"] == 0:
            val_accuracy = aux_functions.evaluate_accuracy(
                sess, accuracy, x, x_mask, y, data["valid"], labels["valid"], args["mb"], args["MAX_LEN"])
from encoderRNN import EncoderRNN from attnDecoderRNN import AttnDecoderRNN from training import train_iters from stackRNN import SRNN_Softmax # from evaluate import evaluate, evaluateRandomly # run from root dir # e.g., python seq2seq/main.py --data data/dataset_len100.tsv --length 100 if __name__ == "__main__": parser = argparse.ArgumentParser(description="Seq2seq") parser.add_argument("--data", "-d", help="Path to datafile") parser.add_argument("--length", "-l", help="Length of max sequence") device = torch.device("cuda" if torch.cuda.is_available() else "cpu") input_lang, output_lang, pairs = prepare_data( os.path.normpath(parser.parse_args().data), "infix", 'postfix') max_length = round(int(parser.parse_args().length) * 1.1) hidden_size = 256 # encoder1 = EncoderRNN(input_lang.n_words, hidden_size, device).to(device) # Number of hidden units n_hidden = 256 # Number of hidden layers n_layers = 1 # Stack size stack_size = 104 stack_dim = 1 vocab_size = input_lang encoder1 = SRNN_Softmax(n_hidden, vocab_size, vocab_size, n_layers, stack_size, stack_dim) attn_decoder1 = AttnDecoderRNN(n_hidden, output_lang.n_words, max_length,
def is_eq(pred_indices, label_vec):
    """Return 1 if any index in ``pred_indices`` is marked 1 in ``label_vec``.

    Indices are checked in order and the scan stops at the first hit;
    0 is returned when none matches (including an empty ``pred_indices``).
    """
    return int(any(label_vec[idx] == 1 for idx in pred_indices))


# Attribute label id -> class index.
attr_lables = {
    '/m/02gy9n': 0,  # Transparent
    '/m/05z87': 1,  # Plastic
    '/m/0dnr7': 2,  # (made of) Textile
    '/m/04lbp': 3,  # (made of) Leather
    '/m/083vt': 4,  # Wooden
}
img_id, id_bbox_dict, id_labels = preprocess.prepare_data(attr_lables)

feature_path = Path('features/')
feature_path_list = list(feature_path.glob('*.npy'))
# NOTE(review): the range stops at len - 1, so the highest index is never
# produced -- confirm the last feature file is meant to be left out.
feature_path_names = list(range(len(feature_path_list) - 1))
shuffle(feature_path_names)

# 80 % of the shuffled indices for training, the rest for validation.
index = (8 * len(feature_path_names)) // 10
train_feature_path_idx = feature_path_names[:index]
val_feature_path_idx = feature_path_names[index:]

attr_classifier = torch.load('output/trained_model_15_0.0001_4999')
attr_classifier.eval()
total_accuracy = 0
softmax = nn.Softmax()
from sklearn import svm from sklearn.cross_validation import train_test_split from sklearn.metrics import classification_report from sklearn.ensemble import RandomForestClassifier from sklearn.preprocessing import StandardScaler import input_output import models import preprocess # Define constants TEST_SIZE_SAMPLE = 0.25 RANDOM_STATE_SPLIT = 40 rawData = input_output.load_data("train.csv") processedData = preprocess.prepare_data(rawData) # Separate features and output + scale data trainData = (processedData[:, 1:]) predOutput = processedData[:, 0] X_train, X_test, y_train, y_test = train_test_split(trainData, predOutput, test_size=TEST_SIZE_SAMPLE, random_state=RANDOM_STATE_SPLIT) # Try to load classifier from file clf = input_output.load_classifier("titanicCLF.pkl") if not clf: # If no file is present, train the classifier using the best known parameters and save the classifier print("There is no saved classifier!") print("Training Model...") clf = svm.SVC(C=1, kernel="rbf", gamma=0.1)
def tuner(args, f, writer):
    """Pick the best SVM regularisation parameter C, then run ``trainer`` with it.

    The data set is split into ``args.no_of_subsamples`` chunks.  For every
    candidate C a ``StandardScaler`` + ``SVC`` pipeline is fitted on the
    leading ``args.train_test_ratio`` share of each chunk and scored on the
    remainder.  The C with the highest mean macro recall wins; ties go to the
    candidate that appears first in ``args.C``.

    Args:
        args: namespace providing ``path``, ``trading_days``, ``kernel``,
            ``degree``, ``C`` (iterable of values convertible to float),
            ``gamma``, ``coef0``, ``train_test_ratio`` and
            ``no_of_subsamples``.  ``args.currC`` is set on this same object
            before it is handed to ``trainer``.
        f: object with a ``write`` method; receives the tuning summary.
        writer: passed through to ``trainer`` unchanged.

    Raises:
        AssertionError: if ``kernel`` is "cobb-douglas" and ``gamma`` is 1.0.
        ValueError: if ``args.C`` is empty.
    """
    path = args.path
    trading_days = args.trading_days
    kernel = args.kernel
    degree = args.degree
    c_grid = [float(value) for value in args.C]
    gamma = args.gamma
    if kernel == "poly":
        gamma = 1.0
    if kernel == "rbf":
        gamma = "scale"
    coef0 = args.coef0
    train_test_ratio = args.train_test_ratio

    if kernel == "poly":
        assert (
            gamma == 1.0
        ), "Polynomial should have gamma=1.0, otherwise it is Cobb-Douglas Kernel"
    elif kernel == "cobb-douglas":
        assert (
            gamma != 1.0
        ), "Cobb-Douglas should have gamma!=1.0, otherwise it is Polynomial Kernel"
        # Cobb-Douglas shares the callable kernel below; only gamma differs.
        kernel = "poly"

    def poly_cobb_kernel(X, Y):
        return gamma * (np.dot(X, Y.T))**degree

    def custom_kernel(X, Y):
        return 1 / (1 + np.dot(X, Y.T)**degree)

    def build_classifier(c_value):
        """Return an unfitted scaler + SVC pipeline for the selected kernel."""
        shared = {"C": c_value, "class_weight": "balanced", "cache_size": 100000}
        if kernel == "custom":
            svc = SVC(kernel=custom_kernel, **shared)
        elif kernel == "poly":
            svc = SVC(kernel=poly_cobb_kernel, **shared)
        else:
            svc = SVC(kernel=kernel, degree=degree, coef0=coef0, gamma=gamma, **shared)
        return make_pipeline(StandardScaler(), svc)

    # load the dataset
    df = load_csv(path)
    data = prepare_data(data_f=df, horizon=trading_days, alpha=0.9)

    # "gain" is dropped before splitting; "pred" is the target column.
    subsample_columns = [x for x in data.columns if x not in ["gain"]]
    dataA = np.array_split(data[subsample_columns], args.no_of_subsamples)
    features = [x for x in data.columns if x not in ["gain", "pred"]]

    metrics = {}
    for c_value in c_grid:
        metrics[c_value] = {"accuracy": [], "precision": [], "recall": [], "f1": []}
        for subsample in tqdm(dataA):
            X = subsample[features]
            y = subsample["pred"]

            # Leading share of the chunk trains, the remainder tests.
            split = int(train_test_ratio * len(X))
            X_train = rem_inf(X[:split])
            X_test = rem_inf(X[split:])
            y_train = y[:split]
            y_test = y[split:]

            clf = build_classifier(c_value)
            clf.fit(X_train, y_train)

            y_test_pred = clf.predict(X_test)
            test_res = classification_report(y_test, y_test_pred, output_dict=True)
            metrics[c_value]["accuracy"].append(test_res["accuracy"])
            metrics[c_value]["precision"].append(test_res["macro avg"]["precision"])
            metrics[c_value]["recall"].append(test_res["macro avg"]["recall"])
            metrics[c_value]["f1"].append(test_res["macro avg"]["f1-score"])

    # Mean recall is computed once per C; max() keeps the first of equal maxima.
    mean_recall = {c_value: mean(scores["recall"]) for c_value, scores in metrics.items()}
    max_recall_C = max(mean_recall, key=mean_recall.get)
    best_recall = mean_recall[max_recall_C]

    print("Best Results:\n")
    print(max_recall_C)
    print("Recall: " + str(best_recall))

    f.write("\n\nBest Results of Tuner:\n")
    f.write("\nDegree: " + str(args.degree))
    f.write("\nC: " + str(max_recall_C))
    f.write("\ngamma: " + str(gamma))
    f.write("\nBest Recall Score: " + str(best_recall))

    # trainer reads the chosen value from args.currC; the caller's namespace
    # is updated in place (the original alias `args1 = args` was not a copy).
    args.currC = max_recall_C
    trainer(args, f, writer)
import pandas as pd import numpy as np import torch import torch.autograd as autograd import torch.nn as nn import torch.optim as optim from NER import BiLSTM_CRF import NER import preprocess EMBEDDING_DIM = 5 HIDDEN_DIM = 4 EPOCHS = 30 torch.manual_seed(1) Z_train, Z_test, word2idx, tag2idx = preprocess.prepare_data("train.txt") tag2idx["<START>"] = len(tag2idx) - 2 tag2idx["<STOP>"] = len(tag2idx) - 1 n_tags = len(tag2idx) # ============================================================================= # MODEL # ============================================================================= model = BiLSTM_CRF(len(word2idx), tag2idx, EMBEDDING_DIM, HIDDEN_DIM, n_tags) optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4) # ============================================================================= # TRAINING # ============================================================================= for epoch in range(EPOCHS):
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import input_output
import models
import preprocess
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

# Define constants
TEST_SIZE_SAMPLE = 0.0001
RANDOM_STATE_SPLIT = 38
NUMBER_OF_ENTRIES = 300000

# Load the raw training file and run the project preprocessing on it.
rawData = input_output.load_data("train.csv")
processedData = preprocess.prepare_data(rawData)

# Columns 0 and 1 are the features (scaled below); column 2 is the output.
# pandas has some weird column counting
scaler = StandardScaler()
multiBinarizer = MultiLabelBinarizer()
featureColumns = processedData[:, 0:2]
trainData = scaler.fit_transform(featureColumns)

# Turn the output column into an (n, 1) array and binarize it.
outputColumn = processedData[:, 2]
numbers = np.reshape(outputColumn, (len(outputColumn), 1))
predOutput = multiBinarizer.fit_transform(numbers)

X_train, X_test, y_train, y_test = train_test_split(
    trainData,
    predOutput,
    test_size=TEST_SIZE_SAMPLE,
    random_state=RANDOM_STATE_SPLIT,
)
def main():
    """Grid-search the SVM regularisation parameter C from the command line.

    Parses the CLI options, loads the CSV at ``--path``, builds the data set
    with ``prepare_data`` and splits it into ``--folds`` chunks.  For every
    candidate C a ``StandardScaler`` + ``SVC`` pipeline is fitted on the
    leading ``--train_test_ratio`` share of each chunk and scored on the
    remainder.  Mean accuracy, macro precision, macro recall and macro F1 are
    printed for every C, followed by the metrics of the C with the highest
    mean macro recall.

    Raises:
        AssertionError: if ``--kernel cobb-douglas`` is combined with
            ``--gamma 1.0``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--path", type=str, default="dataset/NSEIdaily.csv", help="path of csv file")
    parser.add_argument("--trading_days", type=int, default=1, help="Number of trading days")
    parser.add_argument(
        "--kernel",
        type=str,
        default="rbf",
        help="the kernel for SVM",
        choices=["linear", "rbf", "poly", "custom", "cobb-douglas"],
    )
    parser.add_argument("--degree", type=int, default=3, help="value of p in polynomial/custom kernel")
    parser.add_argument(
        "--C",
        nargs="+",
        default=[10**i for i in range(-100, 101)],
        help="the regularisation parameter for SVM",
    )
    parser.add_argument(
        "--gamma",
        type=float,
        default=1.0,
        help="the inner product coefficient in polynomial kernel",
    )
    parser.add_argument("--coef0", type=float, default=0.0, help="coefficient for polynomial kernel")
    parser.add_argument("--train_test_ratio", type=float, default=0.75, help="fraction of train samples")
    parser.add_argument("--folds", type=int, default=5, help="k in k-fold cross validation")
    args = parser.parse_args()

    path = args.path
    trading_days = args.trading_days
    kernel = args.kernel
    degree = args.degree
    # --C has no `type`, so command-line values arrive as strings.
    c_grid = [float(value) for value in args.C]
    gamma = args.gamma
    if kernel == "poly":
        gamma = 1.0
    if kernel == "rbf":
        gamma = "scale"
    coef0 = args.coef0
    train_test_ratio = args.train_test_ratio
    folds = args.folds

    print("Details: ")
    print("Extracting data from: " + str(path))
    print("Trading Days: " + str(trading_days))
    if kernel == "poly":
        assert (
            gamma == 1.0
        ), "Polynomial should have gamma=1.0, otherwise it is Cobb-Douglas Kernel"
        print("Kernel: polynomial")
        print("Degree: " + str(degree))
    elif kernel == "cobb-douglas":
        assert (
            gamma != 1.0
        ), "Cobb-Douglas should have gamma!=1.0, otherwise it is Polynomial Kernel"
        print("Kernel: cobb-douglas")
        print("Gamma: " + str(gamma))
        print("Degree: " + str(degree))
        # Cobb-Douglas shares the callable kernel below; only gamma differs.
        kernel = "poly"
    elif kernel == "custom":
        print("Kernel: custom")
        print("Degree: " + str(degree))
    else:
        print("Kernel: " + str(kernel))

    # define the custom kernels
    def poly_cobb_kernel(X, Y):
        return gamma * (np.dot(X, Y.T))**degree

    def custom_kernel(X, Y):
        return 1 / (1 + np.dot(X, Y.T)**degree)

    def build_classifier(c_value):
        """Return an unfitted scaler + SVC pipeline for the selected kernel."""
        shared = {"C": c_value, "class_weight": "balanced", "cache_size": 100000}
        if kernel == "custom":
            svc = SVC(kernel=custom_kernel, **shared)
        elif kernel == "poly":
            svc = SVC(kernel=poly_cobb_kernel, **shared)
        else:
            svc = SVC(kernel=kernel, degree=degree, coef0=coef0, gamma=gamma, **shared)
        return make_pipeline(StandardScaler(), svc)

    # load the dataset
    df = load_csv(path)
    data = prepare_data(
        data_f=df,
        horizon=trading_days,
        alpha=0.9,
    )

    # "gain" is dropped before splitting; "pred" is the target column.
    fold_columns = [x for x in data.columns if x not in ["gain"]]
    dataA = np.array_split(data[fold_columns], folds)
    features = [x for x in data.columns if x not in ["gain", "pred"]]

    print("\n")
    metrics = {}
    for c_value in c_grid:
        metrics[c_value] = {"accuracy": [], "precision": [], "recall": [], "f1": []}
        # Iterate the chunks that were actually created: the data is split
        # into `folds` pieces, so that is also the number of iterations.
        for fold_number, fold in enumerate(tqdm(dataA), start=1):
            X = fold[features]
            y = fold["pred"]
            print("Performing Grid (Time Series) Search on:\n")
            print("C: " + str(c_value))
            print("gamma: " + str(gamma))
            print("Fold #" + str(fold_number))
            print("\n")

            # Leading share of the chunk trains, the remainder tests.
            split = int(train_test_ratio * len(X))
            X_train = X[:split]
            y_train = y[:split]
            X_test = X[split:]
            y_test = y[split:]

            clf = build_classifier(c_value)
            clf.fit(X_train, y_train)

            print("Training Report:")
            y_train_pred = clf.predict(X_train)
            print(classification_report(y_train, y_train_pred))
            print("\n")

            print("Test Report:")
            y_test_pred = clf.predict(X_test)
            test_res = classification_report(y_test, y_test_pred, output_dict=True)
            print(classification_report(y_test, y_test_pred))

            metrics[c_value]["accuracy"].append(test_res["accuracy"])
            metrics[c_value]["precision"].append(test_res["macro avg"]["precision"])
            metrics[c_value]["recall"].append(test_res["macro avg"]["recall"])
            metrics[c_value]["f1"].append(test_res["macro avg"]["f1-score"])

    print(metrics)
    print("\n")

    # Report every candidate and remember the first one with the highest
    # mean macro recall.
    max_recall_C = list(metrics.keys())[0]
    for c_value, scores in metrics.items():
        print("For regularisation parameter: " + str(c_value))
        print("Accuracy: " + str(mean(scores["accuracy"])))
        print("Precision: " + str(mean(scores["precision"])))
        print("Recall: " + str(mean(scores["recall"])))
        print("F1: " + str(mean(scores["f1"])))
        if mean(scores["recall"]) > mean(metrics[max_recall_C]["recall"]):
            max_recall_C = c_value
        print("\n")

    # All four figures below belong to the winning C (the accuracy line used
    # to read the last loop value instead).
    best = metrics[max_recall_C]
    print("Best Results:\n")
    print(max_recall_C)
    print("Accuracy: " + str(mean(best["accuracy"])))
    print("Precision: " + str(mean(best["precision"])))
    print("Recall: " + str(mean(best["recall"])))
    print("F1: " + str(mean(best["f1"])))
# test_documents = [] # for document in documents1: # if document.name in pmids: # test_documents.append(document) dev_documents = [] for document in documents1: if document.name in pmids: dev_documents.append(document) test_documents = [] abbr_dict = load_abbr(config['ncbi_abbr']) logging.info("loading dictionary ... ") dictionary = load_ctd(config['norm_dict']) logging.info("generate data points") train_datapoints = prepare_data(train_documents, abbr_dict, dictionary) dev_datapoints = prepare_data_1( dev_documents, abbr_dict, dictionary ) # we use dev_datapoints and test_datapoints only for build alphabet if len(test_documents) != 0: test_datapoints = prepare_data_1(test_documents, abbr_dict, dictionary) if opt.pretraining: dict_datapoints = prepare_dict_data(dictionary) logging.info("build alphabet ...") enc_word_alphabet = Alphabet('enc_word') if opt.use_char: enc_char_alphabet = Alphabet('enc_char') else: enc_char_alphabet = None
def trainer(args, f, writer):
    """Fit and score one SVM configuration on every subsample and report averages.

    The data set is split into ``args.no_of_subsamples`` chunks.  On each
    chunk a ``StandardScaler`` + ``SVC`` pipeline with ``C = args.currC`` is
    fitted on the leading ``args.train_test_ratio`` share and evaluated with
    ``compute_acc`` on both shares.  The averaged accuracy, precision, recall
    and F1 are printed, written to ``f`` and added as one row to ``writer``.

    Args:
        args: namespace providing ``path``, ``trading_days``,
            ``no_of_subsamples``, ``kernel``, ``degree``, ``currC``,
            ``gamma``, ``coef0`` and ``train_test_ratio``.
        f: object with a ``write`` method; receives the averaged results.
        writer: object with a ``writerow`` method; receives the summary row.

    Raises:
        AssertionError: if ``kernel`` is "cobb-douglas" and ``gamma`` is 1.0.
    """
    path = args.path
    trading_days = args.trading_days
    no_of_subsamples = args.no_of_subsamples
    kernel = args.kernel
    degree = args.degree
    C = args.currC
    coef0 = args.coef0
    train_test_ratio = args.train_test_ratio

    # "poly" and "rbf" override whatever gamma was supplied.
    gamma = args.gamma
    if kernel == "poly":
        gamma = 1.0
    if kernel == "rbf":
        gamma = "scale"

    print("Details: ")
    print("Extracting data from: " + str(path))
    print("Trading Days: " + str(trading_days))
    print("Number of Subsamples: " + str(no_of_subsamples))
    if kernel == "poly":
        assert (
            gamma == 1.0
        ), "Polynomial should have gamma=1.0, otherwise it is Cobb-Douglas Kernel"
        print("Kernel: polynomial")
        print("Degree: " + str(degree))
    elif kernel == "cobb-douglas":
        assert (
            gamma != 1.0
        ), "Cobb-Douglas should have gamma!=1.0, otherwise it is Polynomial Kernel"
        print("Kernel: cobb-douglas")
        print("Gamma: " + str(gamma))
        print("Degree: " + str(degree))
        # From here on Cobb-Douglas takes the same branch as "poly".
        kernel = "poly"
    elif kernel == "custom":
        print("Kernel: custom")
        print("Degree: " + str(degree))
    else:
        print("Kernel: " + str(kernel))
    print("Regularisation Parameter, C: " + str(C))

    # define the custom kernels
    def poly_cobb_kernel(X, Y):
        return gamma * (np.dot(X, Y.T)) ** degree

    def custom_kernel(X, Y):
        return 1 / (1 + np.dot(X, Y.T) ** degree)

    df = load_csv(path)
    data = prepare_data(data_f=df, horizon=trading_days, alpha=0.9,)

    # "gain" is dropped before splitting; "pred" is the target column.
    kept_columns = [x for x in data.columns if x not in ["gain"]]
    dataA = np.array_split(data[kept_columns], no_of_subsamples)

    # Keyword arguments shared by all three SVC variants.
    shared_svc_kwargs = {"C": C, "class_weight": "balanced", "cache_size": 100000}

    t0 = time.time()
    stats = []
    for subsample in tqdm(dataA):
        input_columns = [x for x in data.columns if x not in ["gain", "pred"]]
        X = subsample[input_columns]
        y = subsample["pred"]

        # Leading share of the chunk trains, the remainder tests.
        cut = int(train_test_ratio * len(X))
        X_train = rem_inf(X[:cut])
        y_train = y[:cut]
        X_test = rem_inf(X[cut:])
        y_test = y[cut:]

        if kernel == "custom":
            svc = SVC(kernel=custom_kernel, **shared_svc_kwargs)
        elif kernel == "poly":
            svc = SVC(kernel=poly_cobb_kernel, **shared_svc_kwargs)
        else:
            svc = SVC(
                kernel=kernel,
                degree=degree,
                coef0=coef0,
                gamma=gamma,
                **shared_svc_kwargs,
            )
        clf = make_pipeline(StandardScaler(), svc)
        clf.fit(X_train, y_train)

        stats.append(compute_acc(clf, X_train, y_train, X_test, y_test))

    print("\nTime taken: " + str((time.time() - t0) / 60) + " minutes")

    def average(split_name, position):
        """Mean of one metric over all subsamples (0=acc, 1=prec, 2=recall, 3=F1)."""
        return sum(entry[split_name][position] for entry in stats) / no_of_subsamples

    train_acc, train_prec, train_recall, train_f1 = (
        average("training", position) for position in range(4)
    )
    test_acc, test_prec, test_recall, test_f1 = (
        average("test", position) for position in range(4)
    )

    training_lines = [
        "Average Training Accuracy:\t" + str(train_acc),
        "Average Training Precision:\t" + str(train_prec),
        "Average Training Recall:\t" + str(train_recall),
        "Average Training F1:\t\t" + str(train_f1),
    ]
    test_lines = [
        "Average Test Accuracy:\t\t" + str(test_acc),
        "Average Test Precision:\t\t" + str(test_prec),
        "Average Test Recall:\t\t" + str(test_recall),
        "Average Test F1:\t\t" + str(test_f1),
    ]

    # Console report.
    print("Average Results")
    for line in training_lines:
        print(line)
    print("\n")
    for line in test_lines:
        print(line)

    # The same figures go to the report file, one write per line.
    f.write("\nAverage Results after Training")
    for line in training_lines:
        f.write("\n" + line)
    f.write("\n")
    for line in test_lines:
        f.write("\n" + line)

    writer.writerow(
        [
            degree,
            gamma,
            C,
            train_recall,
            test_recall,
            train_prec,
            test_prec,
            train_f1,
            test_f1,
            train_acc,
            test_acc,
        ]
    )
logfile = datetime.datetime.now().strftime("logs/%Y%m%d_%H%M") + ".log" if __name__ == "__main__": n_jobs = 4 use_cache = False logger = False if not use_cache: n_drivers = 10000 n_jobs = 4 windows = [1, 15, 30, 60] part = 4 n_quantiles = 15 size = None fname = "data/processed_part%i_q%s_%s.csv" % (part, n_quantiles, "w".join([str(w) for w in [""] + windows])) data = prepare_data(n_drivers, windows, n_quantiles, part, size, n_jobs) data.to_csv(fname) else: # use cache t = time() print "Loading cache...", data = pd.DataFrame.from_csv("data/processed.csv") print "DONE! %.2fm" % ((time() - t) / 60.0) eta_iteration = (np.array([2, 3, 4, 5, 10, 50, 100]) * n_jobs).tolist() + ( np.array(range(200, 3000, 100) * n_jobs).tolist() ) probas = [] t = time() print "Predicting... estimated time:", if n_jobs > 1:
if __name__ == '__main__': n_jobs = 4 use_cache = False logger = False if not use_cache: n_drivers = 10000 n_jobs = 4 windows = [1, 15, 30, 60] part = 4 n_quantiles = 15 size = None fname = "data/processed_part%i_q%s_%s.csv" % ( part, n_quantiles, 'w'.join([str(w) for w in [''] + windows])) data = prepare_data(n_drivers, windows, n_quantiles, part, size, n_jobs) data.to_csv(fname) else: # use cache t = time() print("Loading cache...", end=' ') data = pd.DataFrame.from_csv("data/processed.csv") print("DONE! %.2fm" % ((time() - t) / 60.)) eta_iteration = (np.array([2, 3, 4, 5, 10, 50, 100]) * n_jobs).tolist() + ( np.array(list(range(200, 3000, 100)) * n_jobs).tolist()) probas = [] t = time() print("Predicting... estimated time:", end=' ') if n_jobs > 1: # initialize logger and pool args
# Command-line interface: all five arguments are positional and arrive as
# strings; the numeric ones are converted where they are used below.
parser = argparse.ArgumentParser(description="Trains image data.")
parser.add_argument("num_iterations",
                    help="number of training iterations")
parser.add_argument("learning_rate",
                    help="learning rate used for training")
parser.add_argument("directory_path",
                    help="path to directory that contains data")
parser.add_argument("fruit",
                    help="name of the fruit which you wish to classify")
parser.add_argument("image_path",
                    help="path to image that you'd like to classify")
args = parser.parse_args()

# prepares the data
x_train, x_test, y_train, y_test = preprocess.prepare_data(
    args.directory_path, args.fruit)

# trains the model
model = train.model(x_train, y_train, x_test, y_test,
                    num_iterations=int(args.num_iterations),
                    learning_rate=float(args.learning_rate),
                    print_cost=False)

# loads and reads the image you want to classify
im_pix = imageio.imread(args.image_path)
# divide the pixel values by 255
im_pix = im_pix / 255.
im_pix = resize(im_pix, (100, 100), anti_aliasing=True)
# flatten to a single column of 100 * 100 * 3 values
im_pix = im_pix.reshape(100 * 100 * 3, 1)