def model_checker(df, vectorizer, classifier, sampling_method):
    """Train and evaluate a vectorizer + classifier pipeline on the tweet data."""
    print(classifier)
    print('\n')
    trainTweet, testTweet, trainLabel, testLabel = train_test_split(df, sampling_method)
    pipeline = Pipeline([('vectorizer', vectorizer), ('classifier', classifier)])
    t0 = time()
    sentiment_fit = pipeline.fit(trainTweet, trainLabel)
    y_pred = sentiment_fit.predict(testTweet)
    train_test_time = time() - t0
    accuracy = accuracy_score(testLabel, y_pred)
    # sklearn expects (y_true, y_pred); keep the order consistent with the other metrics
    confusion_result = confusion_matrix(testLabel, y_pred)
    print("accuracy score: {0:.2f}%".format(accuracy * 100))
    print("train and test time: {0:.2f}s".format(train_test_time))
    print('-' * 80)
    print("Confusion Matrix\n")
    print(pd.DataFrame(confusion_result))
    print('-' * 80)
    print("Classification Report\n")
    print(classification_report(testLabel, y_pred))
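# A minimal usage sketch for model_checker, assuming `df` is the tweet DataFrame
# expected by the project's custom train_test_split. The vectorizer, the classifier,
# and the 'none' sampling value are illustrative assumptions, not taken from the
# original code.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

model_checker(df, TfidfVectorizer(), LogisticRegression(max_iter=1000), 'none')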
def main(config_filename):
    logger.debug("Starting execution.")
    parameters = Parameters(config_filename, training_mode=True)
    if parameters.preprocessed_data:
        if not isfile(parameters.excel_file) and not isfile(parameters.preprocessed_data_file):
            logger.error("Please provide a valid Excel file or a valid preprocessed data file.")
            quit()
        if not isfile(parameters.preprocessed_data_file) and isfile(parameters.excel_file):
            logger.info("Loading Excel file.")
            data_frame = read_excel(parameters.excel_file)
            logger.info("Creating documents.")
            docs = data_frame_to_document_list(data_frame)
            logger.info("Storing generated documents.")
            pickle_manager.dump_documents(docs, parameters.preprocessed_data_file)
        logger.info("Preprocessing documents.")
        preprocessor = Preprocessor(stanfordnlp_language_package=parameters.stanfordnlp_language_package,
                                    stanfordnlp_use_gpu=parameters.stanfordnlp_use_gpu,
                                    stanfordnlp_resources_dir=parameters.stanfordnlp_resources_dir,
                                    training_mode=parameters.training_mode)
        preprocessor.preprocess(text_field=parameters.excel_column_with_text_data,
                                preprocessed_data_file=parameters.preprocessed_data_file)
        logger.info("Checking generated data.")
        pickle_manager.check_data(parameters.preprocessed_data_file)
    else:
        if not isfile(parameters.preprocessed_data_file):
            logger.error("The indicated preprocessed data file does not exist.")
            quit()
    logger.info("Extracting features.")
    feature_extractor = FeatureExtractor(nltk_stop_words_package=parameters.nltk_stop_words_package,
                                         vectorizer_name=parameters.vectorizer,
                                         training_mode=parameters.training_mode,
                                         use_lda=parameters.use_lda,
                                         document_adjustment_code=parameters.document_adjustment_code,
                                         remove_adjectives=parameters.remove_adjectives,
                                         synonyms_file=parameters.synonyms_file,
                                         features_file=parameters.features_file)
    X, y, _lemmas = feature_extractor.generate_X_y(class_field=parameters.excel_column_with_classification_data,
                                                   preprocessed_data_file=parameters.preprocessed_data_file)
    logger.info("Splitting dataset into training and test subsets.")
    train_test_split(y, parameters.test_subset_size,
                     parameters.preprocessed_data_file, parameters.force_subsets_regeneration)
    logger.info("Running classifiers.")
    p = classifiers.Pipeline(parameters.classifiers, parameters.cross_validate)
    metadata = pickle_manager.get_docs_metadata(parameters.preprocessed_data_file)
    training_set_indexes = metadata['training_set_indexes'].tolist()
    test_set_indexes = metadata['test_set_indexes'].tolist()
    assert len(training_set_indexes) == len(set(training_set_indexes))
    assert len(test_set_indexes) == len(set(test_set_indexes))
    # Drop the documents flagged for removal from whichever subset contains them
    for elem in feature_extractor.to_remove:
        try:
            training_set_indexes.remove(elem)
        except ValueError:
            test_set_indexes.remove(elem)
    logger.info("Accuracies:")
    p.start(X, y, parameters.number_of_jobs, parameters.set_num_accepted_probs,
            training_set_indexes, test_set_indexes, parameters.resampling)
    logger.debug("Execution completed.")
def Word2Vec_Model(df, classifier, sampling_method):
    print(classifier)
    print('\n')
    GloveModel = load_glove_model("glove.twitter.27B.100d.txt")
    trainTweet, testTweet, trainLabel, testLabel = train_test_split(df, sampling_method)
    pipeline = Pipeline([('classifier', classifier)])
    global count_total, count_in, count_out
    global out_words_list
    count_total, count_in, count_out = 0, 0, 0
    out_words_list = []
    # The vector size must match the dimension of the loaded GloVe file
    # (100 for glove.twitter.27B.100d.txt, 200 for glove.twitter.27B.200d.txt).
    trainVec = get_tweet_vectors(trainTweet, GloveModel, 100)
    testVec = get_tweet_vectors(testTweet, GloveModel, 100)
    print("Glove word embedding statistic\n",
          "count_total: %d/" % count_total,
          "count_in: %d/" % count_in,
          "count_out: %d/" % count_out)
    print("Number of unique words without embedding: %d" % len(set(out_words_list)))
    print("Words without embedding: \n", set(out_words_list))
    t0 = time()
    pipeline.fit(trainVec, trainLabel)
    y_pred = pipeline.predict(testVec)
    train_test_time = time() - t0
    accuracy = accuracy_score(testLabel, y_pred)
    # sklearn expects (y_true, y_pred); keep the order consistent with the other metrics
    confusion_result = confusion_matrix(testLabel, y_pred)
    print("accuracy score: {0:.2f}%".format(accuracy * 100))
    print("train and test time: {0:.2f}s".format(train_test_time))
    print('-' * 80)
    print("Confusion Matrix\n")
    print(pd.DataFrame(confusion_result))
    print('-' * 80)
    print("Classification Report\n")
    print(classification_report(testLabel, y_pred))
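# A hypothetical sketch of what a helper like get_tweet_vectors could do for a single
# tweet: average the GloVe vectors of its tokens while updating the in/out-of-vocabulary
# counters used above. The actual project implementation is not shown here; the function
# name, the dict-like glove_model interface, and the token list are assumptions.
import numpy as np

def average_tweet_vector(tokens, glove_model, dim):
    global count_total, count_in, count_out, out_words_list
    vec = np.zeros(dim)
    hits = 0
    for word in tokens:
        count_total += 1
        if word in glove_model:
            vec += glove_model[word]
            count_in += 1
            hits += 1
        else:
            count_out += 1
            out_words_list.append(word)
    # Return the mean of the embedded tokens, or a zero vector if none were found
    return vec / hits if hits else vec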
def main(args):
    # Make a directory to save models
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Preprocess the RRM data
    vocab, df_aligned = preprocess(preprocessed=args.preprocessed,
                                   RRM_path=args.aligned_RRM_path,
                                   output_path=args.processed_RRM_path,
                                   sep=args.sep)
    df_aligned = train_test_split(df_aligned)
    with open(os.path.join(args.model_path, 'vocab.pkl'), 'wb') as f:
        pickle.dump(vocab, f)

    # Prepare the training and validation sets
    train_index = pd.read_csv('../data/train_index.csv', header=None).iloc[:, 0]
    train_loader = RRM_Sequence(df_aligned.loc[train_index, :], vocab)
    train_loader = DataLoader(train_loader, batch_size=args.batch_size, shuffle=True, collate_fn=collate_fn)
    val_index = pd.read_csv('../data/val_index.csv', header=None).iloc[:, 0]
    val_loader = RRM_Sequence(df_aligned.loc[val_index, :], vocab)
    val_loader = DataLoader(val_loader, batch_size=args.batch_size, shuffle=True, collate_fn=collate_fn)

    # Define the models
    encoder = ResNetEncoder(df_aligned.shape[1], len(vocab), args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)

    # Use CUDA if available
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Define the loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(train_loader)
    val_loss_history = []
    stop = False  # early-stopping flag; initialized so the end-of-epoch check is always defined
    for epoch in range(args.num_epochs):
        for batch_idx, (names, rrms_aligned, rrms_unaligned, lengths) in enumerate(train_loader):
            rrms_aligned = to_var(rrms_aligned)
            rrms_unaligned = to_var(rrms_unaligned)
            targets = pack_padded_sequence(rrms_unaligned, lengths, batch_first=True)[0]

            # Forward, backward, and optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(rrms_aligned)
            outputs = decoder(features, rrms_unaligned, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if (batch_idx + 1) % args.log_step == 0:
                val_loss = validate(val_loader, encoder, decoder, criterion)
                val_loss_history.append(val_loss)
                # loss.data[0] follows the pre-0.4 PyTorch API used throughout this script
                print('Epoch [%d/%d], Step [%d/%d], Training Loss: %.4f, Validation loss: %.4f'
                      % (epoch + 1, args.num_epochs, batch_idx + 1, total_step, loss.data[0], val_loss))
                stop = early_stop(val_loss_history)
                if stop:
                    print('=== Early stopping === Validation loss not improving significantly ===')
                    torch.save(decoder.state_dict(),
                               os.path.join(args.model_path,
                                            'decoder-anneal%s-%dcolumns-%d-%d.pkl'
                                            % (args.learning_rate_annealing, df_aligned.shape[1],
                                               epoch + 1, batch_idx + 1)))
                    torch.save(encoder.state_dict(),
                               os.path.join(args.model_path,
                                            'encoder-anneal%s-%dcolumns-%d-%d.pkl'
                                            % (args.learning_rate_annealing, df_aligned.shape[1],
                                               epoch + 1, batch_idx + 1)))
                    break

            # Save the models
            if (batch_idx + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(),
                           os.path.join(args.model_path,
                                        'decoder-anneal%s-%dcolumns-%d-%d.pkl'
                                        % (args.learning_rate_annealing, df_aligned.shape[1],
                                           epoch + 1, batch_idx + 1)))
                torch.save(encoder.state_dict(),
                           os.path.join(args.model_path,
                                        'encoder-anneal%s-%dcolumns-%d-%d.pkl'
                                        % (args.learning_rate_annealing, df_aligned.shape[1],
                                           epoch + 1, batch_idx + 1)))

        # Decay the learning rate if specified
        if args.learning_rate_annealing:
            adjust_learning_rate(optimizer, epoch + 1)
        if stop:
            break
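# A hypothetical patience-style check illustrating what a function like early_stop could
# compute from val_loss_history: stop when the most recent validation losses have not
# improved on the best earlier loss. This is an illustrative assumption; the project's
# actual early_stop implementation is not shown in this excerpt.
def early_stop_sketch(val_loss_history, patience=5, min_delta=1e-4):
    if len(val_loss_history) <= patience:
        return False
    best_earlier = min(val_loss_history[:-patience])
    recent_best = min(val_loss_history[-patience:])
    # Stop if the recent best loss is not at least min_delta better than the earlier best
    return recent_best > best_earlier - min_delta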
import time

cross_corrs = np.zeros(st.nclusters)
for i, cl in enumerate(st.clusters):
    sta = data['stas'][i]
    spikes = st.binnedspiketimes(i)

    start = time.time()

    # Calculate the contrast for each cell's receptive field
    stimulus[-1, :] = st.contrast_signal_cell(i)
    sp_tr, sp_te, stim_tr, stim_te = train_test_split(spikes, stimulus,
                                                      test_size=val_split_size,
                                                      split_pos=val_split_pos)
    res = gqm.minimize_loglikelihood(np.zeros((stimdim, fl)),
                                     np.zeros((stimdim, fl, fl)), 0,
                                     stim_tr, st.frame_duration, sp_tr,
                                     minimize_disp=True, method='BFGS')
    elapsed = time.time() - start
    print(f'Time elapsed: {elapsed/60:6.1f} mins for cell {i}')
    k_out, Q_out, mu_out = gqm.splitpars(res.x)
    kall[i, ...] = k_out
    Qall[i, ...] = Q_out
def testImages():
    import sys
    import numpy
    sys.path.append("./preprocess/general_preprocessing")
    from train_test_split import train_test_split
    # Use a raw string so the Windows-style backslashes are not treated as escape sequences
    train_test_split(r".\datasets\several_faces_dataset")
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

from train_test_split import train_test_split
from LinearRegression import LinearRegression

boston = datasets.load_boston()
X = boston.data
y = boston.target

X = X[y < 50.0]
y = y[y < 50.0]

X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)

reg = LinearRegression()
reg.fit_normal(X_train, y_train)

print(reg.interception_)
print(reg.coef_)
print(reg.score(X_test, y_test))
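# A minimal sketch of the normal-equation solution that a method such as fit_normal
# typically computes: theta = (X_b^T X_b)^{-1} X_b^T y, with a prepended bias column.
# This illustrates the technique only; it is not the LinearRegression class used above,
# and the helper name is hypothetical.
import numpy as np

def fit_normal_sketch(X_train, y_train):
    X_b = np.hstack([np.ones((len(X_train), 1)), X_train])  # add bias column
    theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train)
    intercept, coef = theta[0], theta[1:]
    return intercept, coef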
from resizing_images import resize_images
from train_test_split import train_test_split
from generating_facemarks import collect_facemarks
from read_write import write_csv
from read_write import write_npy
import configparser

cf = configparser.ConfigParser()
cf.read("./config.ini")

if __name__ == "__main__":
    # Split the train and test data // trData, clPrtestData, openPrTestData
    print("Splitting the data...")
    dataPaths = []
    dataPaths = train_test_split(cf.get("preprocess", "preprocessDataset"))
    print("DONE")

    # Resize the data
    print("Resizing the images...")
    print("!ALERT!: resizing in place!!!")
    dimensionsXY = cf.getint("preprocess", "forcedImageSizeXY")
    # Uncomment if resizing is needed
    # resize_images(dataPaths, dimensionsXY, dimensionsXY)

    # Augment with facemarks
    # NOTE: images for which the model fails to produce facemarks are removed
    print("Adding facemarks to the dataset")
    collect_facemarks(dataPaths)
    print("DONE")

    print("Saving the data...")
kall = np.zeros((st.nclusters, 2, xval_splits, st.filter_length))
muall = np.zeros((st.nclusters, xval_splits))
frs = np.zeros((all_spikes.shape[0], int(all_spikes.shape[-1]/xval_splits)))
cross_corrs = np.zeros((st.nclusters, xval_splits))

t = np.linspace(0, st.filter_length*st.frame_duration*1000, st.filter_length)
stimulus = st.bgsteps
plotlabels = ['Motion X', 'Motion Y']

for i, cluster in enumerate(st.clusters):
    for xvi in range(xval_splits):
        sp_tr, sp_te, stim_tr, stim_te = train_test_split(all_spikes[i], stimulus,
                                                          test_size=xval_fraction,
                                                          split_pos=xval_fraction*xvi)
        res = glm.minimize_loglhd(np.zeros((2, st.filter_length)), 0,
                                  stim_tr, st.frame_duration, sp_tr,
                                  usegrad=True, method='BFGS')
        if not res['success']:
            print(i, 'did not complete successfully.')
        # kall[i, ...] = res['x'][:-1]
        # muall[i] = res['x'][-1]
        kall[i, :, xvi, ...], muall[i, xvi] = glm.splitpars(res['x'])
        frs[i, :] = glm.glm_neuron(kall[i, :, xvi, ...],
import matplotlib.pyplot as plt
from sklearn import datasets

from train_test_split import train_test_split
from SimpleLinearRegression import SimpleLinearRegression2
import metrics as me
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import sklearn.metrics as skm

boston = datasets.load_boston()
# print(boston.DESCR)
x = boston.data[:, 5]
y = boston.target

x = x[y < 50.0]
y = y[y < 50.0]

x_train, x_test, y_train, y_test = train_test_split(x, y, seed=666)
# print(x_train.shape)
# print(x_test.shape)

reg = SimpleLinearRegression2()
reg.fit(x_train, y_train)
# print(reg.a_)
# print(reg.b_)

y_predict = reg.predict(x_test)
# print(y_predict)

"""
# MSE (mean squared error)
mse_test = np.sum((y_predict - y_test) ** 2) / len(y_test)
# mse_test = me.mean_squared_error(y_test, y_predict)
print(mse_test)
from plot import plot_results, get_z
from prepare_data import prepare_data
from train_test_split import train_test_split
import parameters
# NOTE: the original excerpt does not show where AdaBoostClassifier and f1_score come
# from; the scikit-learn imports below are an assumption added to make the snippet run.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import f1_score

dataset_names = ['chips', 'geyser']
dataset_name = dataset_names[0]
filename = 'data/{}.csv'.format(dataset_name)
dataset, features, labels = prepare_data(filename, normalization=False)

train_test_ratio = 0.8
train_set, train_features, train_labels, test_set, test_features, test_labels = \
    train_test_split(dataset, train_test_ratio)

results = []
for n_estimators in parameters.n_estimators:
    for learning_rate in parameters.learning_rate:
        model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
        model.fit(train_features, train_labels)
        predicted_labels = model.predict(test_features)
        f_score = f1_score(test_labels, predicted_labels, average='binary', pos_label='P')
        results.append({'f_score': f_score,
                        'n_estimators': n_estimators,
                        'learning_rate': learning_rate})
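# A small follow-up sketch: once the grid loop above has filled `results`, the best
# (n_estimators, learning_rate) pair can be picked by F-score. This is purely
# illustrative; the original excerpt does not show how `results` is consumed.
best = max(results, key=lambda r: r['f_score'])
print('best F1 = {f_score:.3f} with n_estimators={n_estimators}, '
      'learning_rate={learning_rate}'.format(**best))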
# Define the test location, increment, and depth interval
test_loc = [400, 700]  # candidate locations: 400, 700 or 550, 900
test_inc = 100
dz = 0.1524

plot_well_feature(x_trainwell, test_loc, test_inc, dz)
plot_well_target(y_trainwell, test_loc, test_inc, dz)

#%%
from train_test_split import train_test_split

X_train, Y_train, X_test, Y_test = train_test_split(x_trainwell, y_trainwell, test_loc, test_inc)

pd_data = pd.DataFrame(data=X_train, columns=features)
g = sns.pairplot(pd_data, corner=True, markers="o",
                 plot_kws=dict(s=5, edgecolor="b", linewidth=1))
g.fig.set_figwidth(8)
g.fig.set_figheight(8)
plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)

#%%
# Standardize the feature matrix for the training data
scaler = StandardScaler()
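# A hedged continuation sketch: the usual pattern is to fit the scaler on the training
# features only and apply the same transform to the test features. The variable names
# follow the split above; the original script's actual next steps are not shown here.
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)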