def rf_model(train, target, test, text_train_tfidf, text_test_tfidf):
    text_train = train["Title"].values + ". " + train["BodyMarkdown"].values
    text_test = test["Title"].values + ". " + test["BodyMarkdown"].values

    print("Creating word2vec model...")
    w2v.make_word2vec_model(text_train, text_test)
    wv_train, wv_test = w2v.word2vec_features(text_train, text_test, load=False)

    X_train, X_test = fe.extract_features(train), fe.extract_features(test)
    X_train, X_test = fe.categories_to_counters(X_train, X_test, target)
    X_train, X_test = fe.transform_features(X_train, X_test)

    print("Creating linear model metafeature...")
    X_train["LinearModelText"], X_test["LinearModelText"] = mf.linear_model_as_feature(
        text_train_tfidf, target, text_test_tfidf, load=False)

    print("Creating word2vec model metafeature...")
    X_train["w2vModelRFText"], X_test["w2vModelRFText"] = mf.w2v_model_as_feature(
        wv_train, target, wv_test, load=False, model_to_train="rf")

    scaler = sklearn.preprocessing.StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = sklearn.ensemble.RandomForestClassifier(
        criterion="entropy", max_depth=14, n_estimators=2000,
        min_samples_leaf=4, min_samples_split=16, n_jobs=4, random_state=1234)

    result = make_predictions(model, X_train, target, X_test)
    io.save_result(test["PostId"], result)
    return result
def linear_model(train, target, test, text_train_tfidf, text_test_tfidf):
    X_train, X_test = fe.extract_features(train), fe.extract_features(test)
    X_train, X_test = fe.categories_to_counters(X_train, X_test, target)
    X_train, X_test = fe.transform_features(X_train, X_test)

    feature_train = np.load("w2v/word2vec_feature_train")
    feature_test = np.load("w2v/word2vec_feature_test")
    X_train = np.column_stack((X_train.values, feature_train))
    X_test = np.column_stack((X_test.values, feature_test))

    scaler = sklearn.preprocessing.StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    X_train = scipy.sparse.hstack((text_train_tfidf, X_train), format="csr")
    X_test = scipy.sparse.hstack((text_test_tfidf, X_test), format="csr")

    model = sklearn.linear_model.LogisticRegression(C=0.7, penalty="l2")
    result = make_predictions(model, X_train, target, X_test)
    io.save_result(test["PostId"], result)
    return result
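# A minimal, self-contained sketch of the pattern used above: dense engineered
# features are scaled and then stacked next to a sparse TF-IDF matrix with
# scipy.sparse.hstack before fitting a LogisticRegression. Data and shapes here
# are synthetic placeholders, not the project's actual features.
import numpy as np
import scipy.sparse
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
tfidf_train = scipy.sparse.random(100, 500, density=0.01, random_state=0, format="csr")
dense_train = rng.rand(100, 5)
y = rng.randint(0, 2, size=100)

dense_train = StandardScaler().fit_transform(dense_train)
X_combined = scipy.sparse.hstack((tfidf_train, dense_train), format="csr")

clf = LogisticRegression(C=0.7, penalty="l2").fit(X_combined, y)
print(clf.predict_proba(X_combined[:3]))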
def main():
    pkl_file = open(pickle_file_name, 'rb')
    _, patient_data, matched_data = pickle.load(pkl_file)
    pkl_file.close()

    # get training data
    # get OD data
    all_encodings, target_encoding = fe.extract_features(matched_data)
    bin_target_encoding = [[1] for i in range(len(target_encoding))]
    print('target_encoding num : {}'.format(len(target_encoding)))

    general_samples = ds.draw_general_sample(GEN_SAMPLE_NUM, patient_data, 'race',
                                             [0.805, 0.134, 0.037])
    print(len(matched_data))
    print(len(general_samples))
    general_sample_encodings, general_sample_target_encoding = fe.extract_features(
        general_samples, True)
    bin_general_sample_target_encoding = [[0] for i in range(len(general_sample_encodings))]
    print('general_encoding num : {}'.format(len(general_sample_target_encoding)))

    # create X and Y
    X = all_encodings
    X.extend(general_sample_encodings)

    # for categorical prediction
    # Y = target_encoding
    # Y.extend(general_sample_target_encoding)

    # for binary prediction: Overdose vs non-Overdose
    Y = bin_target_encoding
    Y.extend(bin_general_sample_target_encoding)
    print(Y)

    # build NN model
    # for categorical prediction
    # nn_model = nn.NN(X, Y)
    # nn_model.train()

    # for binary prediction
    nn_model = nn.Bin_NN(X, Y)
    nn_model.train()
def prepare_data(save_scaler=True, location='./db.p'):
    """
    Prepares the data for training. It does so by extracting the features from both
    cars and non cars datasets.
    :param save_scaler: Flag that indicates if we should persist our scaler in database
                        (i.e. the pickle file).
    :param location: Path of the pickle file that contains the data.
    :return: Features and labels ready to be passed to a classifier.
    """
    print("Loading training data")
    cars = __db['cars_features']
    cars_labels = __db['cars_labels']
    non_cars = __db['non_cars_features']
    non_cars_labels = __db['non_cars_labels']
    parameters = __db['parameters']

    print("Extracting features...")
    cars_extracted_features = extract_features(
        images=cars,
        color_space=parameters['color_space'],
        hog_channels=parameters['hog_channels'],
        orient=parameters['orientations'],
        pix_per_cell=parameters['pix_per_cell'],
        cell_per_block=parameters['cell_per_block'],
        histogram_bins=parameters['number_of_bins'],
        spatial_size=parameters['spatial_size'],
        spatial_feat=parameters['spatial_features'],
        hist_feat=parameters['histogram_features'],
        hog_feat=parameters['hog_features'])
    non_cars_extracted_features = extract_features(
        images=non_cars,
        color_space=parameters['color_space'],
        hog_channels=parameters['hog_channels'],
        orient=parameters['orientations'],
        pix_per_cell=parameters['pix_per_cell'],
        cell_per_block=parameters['cell_per_block'],
        histogram_bins=parameters['number_of_bins'],
        spatial_size=parameters['spatial_size'],
        spatial_feat=parameters['spatial_features'],
        hist_feat=parameters['histogram_features'],
        hog_feat=parameters['hog_features'])

    features = np.vstack((cars_extracted_features, non_cars_extracted_features)).astype(np.float64)
    labels = np.hstack((cars_labels, non_cars_labels))

    print("Normalizing...")
    scaler = get_normalizer(features)
    if save_scaler:
        with open(location, 'wb') as pickle_file:
            __db['scaler'] = scaler
            pickle.dump(__db, pickle_file)
    features = scaler.transform(features)

    print("Done!")
    return features, labels
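# A minimal sketch of the scaler-persistence idea above (hypothetical file name,
# synthetic data): fit a StandardScaler once, pickle it, and reload it later so
# the exact same normalization is applied at prediction time.
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler

features = np.random.RandomState(0).rand(20, 4).astype(np.float64)
scaler = StandardScaler().fit(features)

with open('scaler_demo.p', 'wb') as f:   # persist
    pickle.dump(scaler, f)

with open('scaler_demo.p', 'rb') as f:   # reload and reuse
    restored = pickle.load(f)
print(np.allclose(scaler.transform(features), restored.transform(features)))  # True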
def test_feature_extraction(self):
    audio, sr = get_audio('lIYCHbOTab4')
    result = extract_features(audio, sr)
    self.assertEqual((22, 18424), result.shape, "frame features extraction")
    result2 = extract_features(audio, sr, mfcc=False)
    self.assertEqual((2, 18424), result2.shape, "frame features extraction")
def build_training_data(symbol_files, junk_files=[], segment_data_func=None,
                        print_progress=True, ground_truth_file=None):
    """
    Given the symbol files as input, create a dataframe from the given data
    Parameters:
    1. symbol_files (list) - list of symbol file names
    Returns:
    1. df (Dataframe) - A pandas dataframe representation of the data
    """
    df = pd.DataFrame([])  # contains both junk and symbol files
    if ground_truth_file:
        ui_to_symbols = map_ids_to_symbols(ground_truth_file)
    all_files = symbol_files[:]
    all_files.extend(junk_files)
    num_files = len(all_files)
    row_num = 0
    for data_file in all_files:
        # segmentation to be done here
        trace_dict, unique_id = create_trace_dict(data_file)
        if segment_data_func:
            segmented_trace_dicts = segment_data_func(trace_dict)
            unique_id = data_file.split('.')[0].split('/')[-1]
            for segmented_trace_dict in segmented_trace_dicts:
                row = extract_features(segmented_trace_dict, unique_id)
                row['TRACES'] = list(segmented_trace_dict.keys())
                if ground_truth_file:
                    row['SYMBOL_REPRESENTATION'] = ui_to_symbols[row['UI']]
                if len(df.columns) == 0:
                    df = pd.DataFrame(columns=[n for n in row.keys()])
                df.loc[row_num] = list(row.values())
                row_num += 1
        else:
            row = extract_features(trace_dict, unique_id)
            if ground_truth_file:
                row['SYMBOL_REPRESENTATION'] = ui_to_symbols[
                    row['UI']] if row['UI'] in ui_to_symbols else 'junk'
            if len(df.columns) == 0:
                df = pd.DataFrame(columns=[n for n in row.keys()])
            df.loc[row_num] = list(row.values())
            percentage = num_files // 100
            if print_progress and percentage != 0 and row_num % percentage == 0:
                # print('{0} ({1}%) of {2} files loaded...'.format(row_num, round((row_num/num_files)*100), num_files))
                print('File \'{}\' processed.'.format(data_file))
            row_num += 1
    print('All files loaded.')
    return df  # use this to operate on the data
def main(): """ Main function """ # Extract features if not feature_file_exists(): extract_features() # Select features if not select_feature_file_exists(): select_features() # Train model train()
def test_model(file_name):
    gmm_files = [
        os.path.join(modelpath, fname) for fname in os.listdir(modelpath)
        if fname.endswith('.gmm')
    ]
    models = [pickle.load(open(fname, 'rb')) for fname in gmm_files]
    speakers = [fname.split('/')[-1].split('.gmm')[0] for fname in gmm_files]

    # print("do you want to test a single audio : press 1 or complete press 0")
    # take=int(input().strip())
    # if take==1:
    #     print("enter the file name")
    #     path=input().strip()
    #     print("testing audio",path)
    path = file_name
    sr, audio = read(source + path)
    vector = extract_features(audio, sr)

    log_likelihood = np.zeros(len(models))
    for i in range(len(models)):
        gmm = models[i]
        scores = np.array(gmm.score(vector))
        log_likelihood[i] = scores.sum()

    max1 = max(log_likelihood)
    if max1 < -30:
        return "You are not registered..."
        # winner=np.argmax(log_likelihood)
    else:
        winner = np.where(log_likelihood == max1)
        return speakers[winner[0][0]]
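# A minimal, self-contained sketch of the GMM scoring loop above, using
# sklearn.mixture.GaussianMixture on synthetic data instead of the saved
# speaker models: fit one GMM per "speaker", then pick the model with the
# highest log-likelihood on the test feature matrix.
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(0)
speaker_a = rng.normal(0.0, 1.0, size=(200, 13))   # stand-ins for MFCC vectors
speaker_b = rng.normal(3.0, 1.0, size=(200, 13))

gmm_a = GaussianMixture(n_components=4, random_state=0).fit(speaker_a)
gmm_b = GaussianMixture(n_components=4, random_state=0).fit(speaker_b)

test_vectors = rng.normal(3.0, 1.0, size=(50, 13))  # should match speaker_b
log_likelihood = np.array([gmm.score(test_vectors) for gmm in (gmm_a, gmm_b)])
print(["speaker_a", "speaker_b"][int(np.argmax(log_likelihood))])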
def load(): """ Load all training points into a matrix """ Xs = [] Ys = [] for path, label in ((NEGATIVE_SAMPLES_DIR, 0), (POSITIVE_SAMPLES_DIR, 1)): audio_files = glob.glob(os.path.join(path, '*.wav')) logging.info('Detected {} files for label {}'.format( len(audio_files), label)) Xs.append(np.zeros((len(audio_files), NUM_FEATURES))) Ys.extend([label] * len(audio_files)) for i, audio_file in enumerate(audio_files): rate, data = scipy.io.wavfile.read(audio_file) assert rate == SAMPLE_RATE Xs[-1][i, :] = extract_features(data) if i % 1000 == 0: logging.info('Loaded {} files'.format(i)) X = np.vstack(Xs) y = np.array(Ys) return X, y
def process_test_samples(test_samples):
    processed_samples = pd.DataFrame()
    samples_in_one_srch = pd.DataFrame()
    for r_idx, sample in test_samples.iterrows():
        if (r_idx + 1) % 1000 == 0:
            print "Processed %i sample of %i" % (r_idx + 1, test_samples.shape[0])
        is_next_in_same_search = True
        samples_in_one_srch = pd.concat((sample.to_frame().transpose(), samples_in_one_srch), axis=0)
        current_srch_id = sample['srch_id']
        if (r_idx + 1) == test_samples.shape[0]:
            is_next_in_same_search = False
        else:
            next_srch_id = test_samples['srch_id'][r_idx + 1]
            if current_srch_id != next_srch_id:
                is_next_in_same_search = False
        if not is_next_in_same_search:
            ## if next one is not in the same search, process the samples in the same search
            # feature extraction for samples
            ext_samples_in_one_srch = extract_features(samples_in_one_srch)
            processed_samples = pd.concat((processed_samples, ext_samples_in_one_srch), axis=0)
            # create new samples for the next search
            samples_in_one_srch = pd.DataFrame()
    return processed_samples
def svm(train_docs, train_keys, test_docs, test_keys, model_file, N):
    X_train, y_train, phrase_list_train, idf_vec = extract_features(train_docs, train_keys)
    #X_test, y_test, fl_test, junk = extract_features(test_docs, test_keys)
    #print y_train
    print "--Feature matrices calculated, SVM now training..."
    clf = train_svm(X_train, y_train)
    print "--Saving model..."
    with open(model_file, 'w') as f:
        pickle.dump(clf, f)  # dump the trained classifier, not the file name
    print "--SVM trained, SVM now testing..."
    accuracy = 0
    precisions = []
    recalls = []
    for doc, true_keys in zip(test_docs, test_keys):
        candidates, features = extract_candidates_doc(doc, phrase_list_train, idf_vec, len(train_docs))
        precision, recall = evaluate_one_doc('svm', clf, candidates, features, true_keys, N)
        precisions.append(precision)
        recalls.append(recall)
    avg_precision = sum(precisions) / len(precisions)
    avg_recall = sum(recalls) / len(recalls)
    '''
    accuracy = test_svm(svm, X_test, y_test)
    features_doc, labels_doc, phrase_idx_doc, phrase_list = extract_features_test(test_docs, test_keys)
    avg_precision, avg_recall = evaluate_on_each_doc('svm', svm, features_doc, labels_doc,
                                                     phrase_idx_doc, phrase_list, test_keys)
    '''
    return {'accuracy': accuracy, 'recall': avg_recall, 'precision': avg_precision}
def grid_search():
    print(FILE_NAME)
    svm_features, svm_labels = extract_features(config.TRAIN_DIR, config.sample_count)
    X = svm_features.reshape(config.sample_count, 7*7*2048)
    y = svm_labels
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # defining parameter range
    param_grid = {'C': [0.1, 1, 10, 100, 1000],
                  'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                  'kernel': ['rbf']}
    grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)

    # fitting the model for grid search
    grid.fit(X_train, y_train)

    # print best parameter after tuning
    print(grid.best_params_)
    # print how our model looks after hyper-parameter tuning
    print(grid.best_estimator_)

    grid_predictions = grid.predict(X_test)
    # print classification report
    print(classification_report(y_test, grid_predictions))
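# A self-contained sketch of the same grid search on synthetic data (the real
# code above searches over features extracted from images in config.TRAIN_DIR):
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

X, y = make_classification(n_samples=200, n_features=20, random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

param_grid = {'C': [0.1, 1, 10], 'gamma': [0.1, 0.01], 'kernel': ['rbf']}
grid = GridSearchCV(SVC(), param_grid, refit=True)
grid.fit(X_tr, y_tr)
print(grid.best_params_)
print(classification_report(y_te, grid.predict(X_te)))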
def naive_bayes(train_docs, train_keys, test_docs, test_keys, model_file, N):
    X_train, y_train, phrase_list_train, idf_vec = extract_features(train_docs, train_keys)
    #X_test, y_test, fl_test, junk = extract_features(test_docs, test_keys)
    #print y_train
    print "--Feature matrices calculated, NB now training..."
    clf = NB.train(X_train, y_train)
    print "--Saving model..."
    with open(model_file, 'w') as f:
        pickle.dump(clf, f)
    with open(model_file + '.phrase_list', 'w') as f:
        pickle.dump(phrase_list_train, f)
    with open(model_file + '.idf_vec', 'w') as f:
        pickle.dump(idf_vec, f)
    with open(model_file + '.training_size', 'w') as f:
        pickle.dump(len(train_docs), f)
    print "--NB trained, NB now testing..."
    #accuracy = NB.score(clf, X_test, y_test)
    accuracy = 0
    precisions = []
    recalls = []
    for doc, true_keys in zip(test_docs, test_keys):
        candidates, features = extract_candidates_doc(doc, phrase_list_train, idf_vec, len(train_docs))
        precision, recall = evaluate_one_doc('NB', clf, candidates, features, true_keys, N)
        precisions.append(precision)
        recalls.append(recall)
    avg_precision = sum(precisions) / len(precisions)
    avg_recall = sum(recalls) / len(recalls)
    #features_doc, labels_doc, phrase_idx_doc, phrase_list = extract_features_test(test_docs, test_keys)
    #avg_precision, avg_recall = evaluate_on_each_doc('NB', clf, features_doc, labels_doc, phrase_idx_doc, phrase_list, test_keys, 10)
    return {'accuracy': accuracy, 'recall': avg_recall, 'precision': avg_precision}
def build_features_labels_dataset(events, unique_from=46, unique_to=1006, unique_granularity=1,
                                  unique_deltas=[0, 46], to_withdraw=[]):
    data = []
    labels = []
    feature_names = None
    for device in events:
        for app in events[device]:
            for action in events[device][app]:
                label = app + "_" + action
                for event in events[device][app][action]:
                    features_dict = extract_features(
                        event, unique_from, unique_to, unique_granularity,
                        unique_deltas=unique_deltas, to_withdraw=to_withdraw)
                    features = list(features_dict.values())
                    data.append(features)
                    labels.append(label)
                    if feature_names is None:
                        feature_names = list(features_dict.keys())
    return data, labels, feature_names
def run_grid_search(cnn_filepath='script_CNN.h5', grid_ready_data=None, grid_ready_feature_groups=None):
    # SET DIRECTORY
    os.chdir(CURRENT_DIRECTORY)
    print(f'{datetime.datetime.now()} RUNNING GRID SEARCH')
    # if not grid_ready_data:

    # LOAD CLEANED DATA, DICT OF FEATURE GROUPS, AND DTL
    print(f'{datetime.datetime.now()} 1. Loading/Cleaning Data.')
    df, feature_dict, DTL = load_and_clean_data()

    # LOAD CNN, EXTRACT FEATURES, ADD TO FEATURE GROUPS
    print(f'{datetime.datetime.now()} 2. Extracting Features.')
    layer_name = 'dense1'
    model = load_model(cnn_filepath)
    df_features = extract_features(model, DTL, layer_name)

    # USE PCA TO SELECT EXTRACTED FEATURES
    print(f'{datetime.datetime.now()} 3. Performing PCA.')
    df_features_pca = perform_pca(df_features, 10)
    feature_dict['EXTRACTED_FEATURES'] = df_features_pca.columns.to_list()
    feature_dict['EXTRACT_OSM_FB_FEATURES'] = feature_dict['EXTRACTED_FEATURES'] \
        + feature_dict['OSM_FB_FEATURES']

    # DEFINE FINAL DATA
    df_final = df.join(df_features_pca)
    df_final.to_pickle('fully_prepped_data.pkl')

    # OVERSAMPLE
    print(f'{datetime.datetime.now()} 4. Defining Sample.')
    x_df = pd.DataFrame(df_final.drop(labels=['uid', TARGET_NAME], axis=1))
    y_df = df_final[TARGET_NAME]
    feature_dict['ALL_FEATURES'] = x_df.columns.tolist()
    oversample = RandomOverSampler(sampling_strategy=0.65)
    x, y = oversample.fit_resample(x_df, y_df)

    # SPLIT INTO TRAIN/TEST AND NORMALIZE
    print(f'{datetime.datetime.now()} 5. Defining Training and Testing Sets.')
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=TEST_SIZE)
    normalize(x_train, x_test)

    # TRAIN MODELS AND EXPORT ERRORS
    print(f'{datetime.datetime.now()} 6. Training Models.')
    parameters = cf.GRID_TEST_CLASS
    training_errors = train_models(parameters, x_train, y_train, feature_dict)
    training_errors.to_csv(os.path.join('output', 'errors.csv'))

    # PREDICT LABELS AND EVALUATE RESULTS
    print(f'{datetime.datetime.now()} 7. Evaluating Models.')
    trained_obj_list = [f for f in os.listdir('output') if f.endswith('_trained.pkl')]
    results_df = evaluate_models(trained_obj_list, x_test, y_test, feature_dict)
    results_df.to_csv(os.path.join('output', 'results.csv'))
def predict(time_serie, CLF, WINDOW_SIZE=20, INTER_SPACING_CRITICAL_POINTS=5,
            WINDOW_MINIMAL_PAYLOAD=200, MINIMUM_SIZE_CRITICAL_STARTING_POINT=-1,
            FILTER_LENGTH_LIMIT=46, INTER_TIMER_EVENT_CUTOFF=5, TO_WITHDRAW=[]):
    """
    Make a prediction on a long-run capture.

    Args:
        time_serie : dict['xs', 'ys'] -> [x,], [y,]. Dictionary with two entries:
            xs for the timestamps and ys for the packet lengths.
        WINDOW_SIZE : int. Size in seconds of the sliding window.
        INTER_SPACING_CRITICAL_POINTS : int. Minimum spacing between two critical points.
    """
    critical_points = find_critical_point(
        time_serie, WINDOW_SIZE, INTER_SPACING_CRITICAL_POINTS,
        WINDOW_MINIMAL_PAYLOAD, MINIMUM_SIZE_CRITICAL_STARTING_POINT)
    cap_predict = []  # list of (start, end, probabilities) tuples
    critical_points_i = 0
    xs_end = -1
    xs_capt = time_serie["xs"]
    ys_capt = time_serie["ys"]
    for i, _ in enumerate(xs_capt):
        current_xs = xs_capt[i]
        critical_point = critical_points[critical_points_i]
        if current_xs > critical_point and current_xs > xs_end:
            j = i - 1  # take the previous index since we are one step past the critical point
            xs_start = xs_capt[i]
            xs_end = find_action_end(xs_capt[i:], ys_capt[i:],
                                     FILTER_LENGTH_LIMIT, INTER_TIMER_EVENT_CUTOFF)
            end_indice = find_x_indices(xs_capt, j, xs_end)
            xy = dict()
            xy["xs"] = xs_capt[j:end_indice + 1]
            xy["ys"] = ys_capt[j:end_indice + 1]
            features_dict = extract_features(xy, to_withdraw=TO_WITHDRAW)
            features = list(features_dict.values())
            y = CLF.predict_proba(np.array(features).reshape(1, -1))
            cap_predict.append((xs_start, xs_end, y[0]))
            while critical_points[critical_points_i] < xs_end:
                critical_points_i += 1
                if critical_points_i == len(critical_points):
                    break
        if critical_points_i == len(critical_points) or xs_end == xs_capt[-1]:
            break
    return cap_predict
def load_or_train2(X, y, force_train=False, enable_plot=False):
    clf_path = './clf.pkl'
    if not force_train and os.path.exists(clf_path):
        print "> loading from file..."
        classifier = pickle.load(open(clf_path, 'rb'))
        print "> loaded"
    else:
        print "> training..."
        X = extract_features(X)

        # [CV] logistic__C=1.0, rbm__n_iter=80, rbm__learning_rate=0.1, rbm__n_components=200, score=0.931818 - 5.3min
        ## initialize the RBM + Logistic Regression pipeline
        rbm = BernoulliRBM(learning_rate=0.1, n_iter=80, n_components=200)
        logistic = LogisticRegression(C=1.0)
        classifier = Pipeline([("rbm", rbm), ("logistic", logistic)])

        # Perform a grid search on the learning rate, number of iterations, and number of
        # components on the RBM and C for Logistic Regression
        print "SEARCHING RBM + LOGISTIC REGRESSION"
        params = {
            "rbm__learning_rate": [0.1, 0.01, 0.001],
            "rbm__n_iter": [20, 40, 80],
            "rbm__n_components": [50, 100, 200],
            "logistic__C": [1.0, 10.0, 100.0]}

        ## Cross-validation
        # cv = ShuffleSplit(len(X), n_iter=10, test_size=0.2, random_state=0)

        ## Perform a grid search over the parameters
        start = time.time()
        # gs = GridSearchCV(classifier, params, verbose=10, cv=cv)
        # gs.fit(X, y)
        classifier.fit(X, y)

        ## Print diagnostic information to the user and grab the best model
        print "\ndone in %0.3fs" % (time.time() - start)
        # print "best score: %0.3f" % (gs.best_score_)
        # print "RBM + LOGISTIC REGRESSION PARAMETERS"
        # bestParams = gs.best_estimator_.get_params()
        # loop over the parameters and print each of them out
        # so they can be manually set
        # for p in sorted(params.keys()):
        #     print "\t %s: %f" % (p, bestParams[p])

        ## show information about the training
        if enable_plot or True:
            outputs = classifier.predict(X)
            plot_confusion_matrix(y, outputs, range(0, 10))
            print_classification_report(y, outputs)

        ## Save the model
        pickle.dump(classifier, open(clf_path, 'wb'))
    return classifier
def svm_ranking(train_docs, train_keys, test_docs, test_keys):
    X_train_vec, y_train_vec = extract_features(train_docs, train_keys)
    X_train, y_train = get_vec_differences_train(X_train_vec, y_train_vec)
    X_test_vec, y_test_vec = extract_features(test_docs, test_keys)
    X_test, y_test = get_vec_differences_train(X_test_vec, y_test_vec)
    print "--Training SVM"
    svm = train_svm(X_train, y_train)
    # The test_svm function needs to be replaced for this method
    # so it finds the diff. of test vectors, classifies those
    # differences, and ranks using those classifications
    print "--Testing SVM"
    accuracy = test_svm(svm, X_test, y_test)
    avg_recall = 0
    avg_precision = 0
    return {'accuracy': accuracy, 'recall': avg_recall, 'precision': avg_precision}
def main():
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        args = parser.parse_args()
        clasf = Classifier()
        if args.parse:
            parse_all_files()
            from feature_extraction import main as extract_features
            extract_features()
        if args.train:
            folder_index = args.folder_index
            ratio = args.ratio
            if ratio <= 0 or ratio > 1:  # fall back to the full data set for invalid ratios
                ratio = 1
            if folder_index == 1 or folder_index == 2:
                if args.task == 1:
                    clasf.train_NER_model(train_folder=folder_index, ratio=ratio,
                                          classifier=args.classifier)
                elif args.task == 2:
                    clasf.train_DDI_model(train_folder=folder_index, ratio=ratio,
                                          classifier=args.classifier)
                else:
                    parser.print_help()
            else:
                parser.print_help()
        elif args.test:
            model_index = args.model_index
            folder_index = args.folder_index
            if model_index >= 0 and folder_index >= 1:
                if args.task == 1:
                    clasf.test_NER_model(model_index=model_index, test_folder=folder_index,
                                         classifier=args.classifier)
                elif args.task == 2:
                    clasf.test_DDI_model(model_index=model_index, test_folder=folder_index,
                                         classifier=args.classifier)
                else:
                    parser.print_help()
            else:
                parser.print_help()
        else:
            parser.print_help()
def _get_features(self):
    import warnings
    warnings.simplefilter("error")

    # Divide up into cars and notcars
    # Read in car and non-car images
    cars = glob.glob('training_data/vehicles/*/*.png')
    notcars = glob.glob('training_data/non-vehicles/*/*.png')
    print("Training data", "positive", len(cars), "negative", len(notcars))

    car_features = extract_features(
        cars,
        cspace=self.hog_config.colorspace,
        orient=self.hog_config.orient,
        pix_per_cell=self.hog_config.pix_per_cell,
        cell_per_block=self.hog_config.cell_per_block,
        hog_channels=self.hog_config.hog_channels,
        spatial_size=self.hog_config.spatial_size,
        hist_bins=self.hog_config.hist_bins,
        hist_range=self.hog_config.hist_range)
    notcar_features = extract_features(
        notcars,
        cspace=self.hog_config.colorspace,
        orient=self.hog_config.orient,
        pix_per_cell=self.hog_config.pix_per_cell,
        cell_per_block=self.hog_config.cell_per_block,
        hog_channels=self.hog_config.hog_channels,
        spatial_size=self.hog_config.spatial_size,
        hist_bins=self.hog_config.hist_bins,
        hist_range=self.hog_config.hist_range)

    print("Training data after feature extraction", "positive",
          len(car_features), "negative", len(notcar_features))

    # Create an array stack of feature vectors
    X = np.vstack((car_features, notcar_features)).astype(np.float64)
    # Define the labels vector
    y = np.hstack((np.ones(len(car_features)), np.zeros(len(notcar_features))))
    return X, y
def get_distances(keyword_image_path, normalize, comparison_words_folder):
    # extract the features from the given keyword to spot in the document
    keyword_features = extract_features(keyword_image_path, normalize)

    print("Calculating distances...")
    distances = []
    # for each word in each subfolder (OS dependent?)
    for compared_word in glob.glob(comparison_words_folder + "/**/*" + ".png"):
        print(compared_word)
        # 1) calculate the feature vector for the second image
        word_features = extract_features(compared_word, normalize)
        # 2) calculate the dtw distance
        dtw_distance = get_dtw_distance(keyword_features, word_features)
        distances.append([os.path.basename(compared_word), dtw_distance])

    sorted_distances = sorted(distances, key=lambda l: l[1])
    return sorted_distances
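# get_dtw_distance is not shown above; the sketch below is a generic dynamic time
# warping distance between two 1-D feature sequences (an assumption about what the
# helper computes, not its actual implementation).
import numpy as np

def dtw_distance(a, b):
    """Classic O(len(a)*len(b)) DTW with absolute-difference local cost."""
    n, m = len(a), len(b)
    cost = np.full((n + 1, m + 1), np.inf)
    cost[0, 0] = 0.0
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            d = abs(a[i - 1] - b[j - 1])
            cost[i, j] = d + min(cost[i - 1, j],      # insertion
                                 cost[i, j - 1],      # deletion
                                 cost[i - 1, j - 1])  # match
    return cost[n, m]

print(dtw_distance([1, 2, 3, 4], [1, 1, 2, 3, 4]))  # 0.0: the sequences align perfectly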
def load_database(self):
    self.update_status('>>> Loading DB videos', clear=True)
    print('Started')
    print('=' * 80)
    print('Database video list')
    print('-' * 80)
    print('\n'.join(['%d. %s' % (i + 1, f) for (i, f) in enumerate(self.folders)]))
    print('=' * 80)
    self.db_vids = []
    for selected_folder in self.folders:
        self.update_status('>>> DB video selected: ' + selected_folder)
        pkl_path = glob.glob(os.path.join(selected_folder, '*.pkl'))
        if len(pkl_path) and not self.FORCE_CREATE:
            tic = time.time()
            self.update_status('>>> Loading pre-calculated features')
            with open(pkl_path[0], 'rb') as pkl_fp:
                v = pickle.load(pkl_fp)
            self.update_status('>>> Done. Time taken: %0.4fs' % (time.time() - tic))
        else:
            tic = time.time()
            self.update_status('>>> Loading video')
            vid_path = selected_folder
            aud_path = glob.glob(os.path.join(selected_folder, '*.wav'))[0]
            v = Video(vid_path, aud_path)
            self.update_status('>>> Done. Time taken: %0.4fs' % (time.time() - tic))

            # Computing features
            tic = time.time()
            self.update_status('>>> Calculating video features')
            extract_features(v)
            self.update_status('>>> Calculated in %0.4fs' % (time.time() - tic))

            self.update_status('>>> Saving results to database')
            with open(os.path.join(selected_folder, '%s.pkl' % v.name), 'wb') as pkl_fp:
                pickle.dump(v, pkl_fp)

        self.db_vids.append(v)
        self.update_status('>>> Saved results to database')
def applyFeatures(fileTrain, fileTest, DataTestAfterFeature, fileCentroid, DataTrainAfterFeature):
    ##################### Create the centroids, then load them ###################
    # extract centroids from the training data
    extract_centroid(fileTrain, fileCentroid, 500)
    # load the centroids
    Fcentroid = open(fileCentroid, 'rb')
    listCentroid = cPickle.load(Fcentroid)
    print "finished extracting the centroids"

    #################### Apply the centroids to the test and train data ############

    ###### apply to the train data ###############
    # load the train data
    data = unpickle(fileTrain)
    # apply the features to the train data
    dataTrain = extract_features(data['data'], listCentroid)
    # save the train data after applying the features
    pickle.dump(dataTrain, open(DataTrainAfterFeature, "wb"))
    print "finished applying the centroids to the train data"

    ###### apply to the test data ###############
    # load the test data
    dataT = unpickle(fileTest)
    # apply the features to the test data
    dataTest = extract_features(dataT['data'], listCentroid)
    # save the test data after applying the features
    pickle.dump(dataTest, open(DataTestAfterFeature, "wb"))
    print "finished applying the centroids to the test data"

    ################ load the test and train data ######################
    """
def process_train_samples(samples, max_srch_size=10, each_saved_size=1000000):
    sorted_samples = samples.sort_values(by=["srch_id"])
    sorted_samples = sorted_samples.reset_index(drop=True)
    processed_samples = pd.DataFrame()
    samples_in_one_srch = pd.DataFrame()
    for r_idx, sample in sorted_samples.iterrows():
        if (r_idx + 1) % 1000 == 0:
            print "Processed %i sample of %i" % (r_idx + 1, sorted_samples.shape[0])
        is_next_in_same_search = True
        samples_in_one_srch = pd.concat((sample.to_frame().transpose(), samples_in_one_srch), axis=0)
        current_srch_id = sample["srch_id"]
        if (r_idx + 1) == sorted_samples.shape[0]:
            is_next_in_same_search = False
        else:
            next_srch_id = sorted_samples["srch_id"][r_idx + 1]
            if current_srch_id != next_srch_id:
                is_next_in_same_search = False
        if not is_next_in_same_search:
            ext_samples_in_one_srch = extract_features(samples_in_one_srch)
            n_samples = ext_samples_in_one_srch.shape[0]
            if n_samples > max_srch_size:
                if np.any(ext_samples_in_one_srch["booking_bool"]):
                    pos_samples = ext_samples_in_one_srch[ext_samples_in_one_srch["booking_bool"] == 1]
                    neg_samples = ext_samples_in_one_srch[ext_samples_in_one_srch["booking_bool"] == 0]
                    selected_neg_samples = neg_samples.sample(n=max_srch_size - pos_samples.shape[0])
                    selected_samples = pd.concat((pos_samples, selected_neg_samples), axis=0)
                else:
                    selected_samples = ext_samples_in_one_srch.sample(n=max_srch_size)
            else:
                selected_samples = ext_samples_in_one_srch.copy()
            processed_samples = pd.concat((processed_samples, selected_samples), axis=0)
            samples_in_one_srch = pd.DataFrame()
        if (r_idx + 1) % each_saved_size == 0:
            save_file_name = "proc_train_samples_%i.csv" % (r_idx + 1)
            save_path = get_paths()["proc_train_path"]
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            if np.any(np.isnan(processed_samples.values)):
                processed_samples = processed_samples.fillna(value=0)
            processed_samples.to_csv(os.path.join(save_path, save_file_name), index=None)
    save_file_name = "proc_train_samples_%i.csv" % (r_idx + 1)
    save_path = get_paths()["proc_train_path"]
    if np.any(np.isnan(processed_samples.values)):
        processed_samples = processed_samples.fillna(value=0)
    processed_samples.to_csv(os.path.join(save_path, save_file_name), index=None)
def grid_search():
    for step_size in range(8, 36, 4):
        for pixels_per_cell in range(3, 10, 1):
            for cells_per_block in range(3, 10, 1):
                args = config.Config()
                args.STEP_SIZE = [step_size, step_size]
                args.CELLS_PER_BLOCK = [cells_per_block, cells_per_block]
                args.PIXELS_PER_CELL = [pixels_per_cell, pixels_per_cell]
                args.PROJECT_ID = args.PROJECT_ID + "_SS_" + str(step_size) \
                    + "_CPB_" + str(cells_per_block) + "_PPC_" + str(pixels_per_cell)
                args.update_names()
                print(args.PROJECT_ID, args.DIR_PATHS)
                args.mk_new_dirs()
                feature_extraction.extract_features(args=args)
                train_classifier(args=args)
                test_classifier(args=args)
                if not args.KEEP_FEAT:
                    shutil.rmtree(args.DIR_PATHS['NEG_FEAT_PH'])
                    shutil.rmtree(args.DIR_PATHS['POS_FEAT_PH'])
def get_performances(file_path):
    model = utils.get_model(HERE / 'outputs/XGBoost.pkl')
    xgb_obj = xgb.XGBoost()
    xgb_obj.model = model

    df = utils.read_to_df(file_path)
    df = pre.preprocess(df)
    X = fe.extract_features(df, FEATURE_LIST)
    X = X.drop(columns=['id'])
    y = (df['cb_level'] == 3).astype(int)

    y_prob_rf = xgb_obj.predict(X)
    pred = np.where(y_prob_rf > 0.5, 1, 0)
    return per.get_performances(y, pred)
def __extract_for_training(imgs):
    """
    Function that receives a list of image paths, loads each image and calls
    single_img_features for each. Collects the features for all images and
    returns them together. Needed for training the SVM.
    """
    features = []
    for image_name in imgs:
        img = cv2.imread(image_name)
        features.append(fe.extract_features(img))
    return features
def prepare_images(self, images):
    features = extract_features(
        images,
        cspace=self.hog_config.colorspace,
        orient=self.hog_config.orient,
        pix_per_cell=self.hog_config.pix_per_cell,
        cell_per_block=self.hog_config.cell_per_block,
        hog_channels=self.hog_config.hog_channels,
        hog_feature_vec=False)
    features = np.array(features).astype(np.float64)
    features = self.scaler.transform(features)
    return features
def get_distances(user_id, ver_sig_id, enrollment_files, verification_file, normalize=True):
    # get data in given verification file
    verification_data = np.loadtxt(verification_file)
    # compute features for given verification signature
    verification_features = extract_features(verification_data, normalize)
    #print("verification_features: ", verification_features)

    #print("Calculating distances...")
    distances = []
    for ind in range(len(enrollment_files)):
        # get data from enrollment file
        en_data = np.loadtxt(enrollment_files[ind])
        # compute features for enrollment signatures of given user
        enrollment_features = extract_features(en_data, normalize)
        dtw_distance = get_dtw_distance(verification_features, enrollment_features)
        distances.append(dtw_distance)
    print("distances: ", distances)
    #sorted_distances = sorted(distances, key=lambda l: l[3])
    return distances
def classify(classifier, file_path):
    features = extract_features(file_path)
    fieldnames = pickle.load(open(fieldnames_array_path, 'r'))
    features_arr = []
    for field in fieldnames:
        if field in features:
            features_arr.append(features[field])
        else:
            features_arr.append(0)
    prediction = classifier.predict([features_arr])
    return prediction
def train(epoch):
    # set train mode
    clas1.train()
    clas2.train()
    att1.train()
    att2.train()

    # for each batch
    for batch_idx, (data, targets) in enumerate(train_loader):
        # initialization
        _, target = targets
        # print(data.shape)
        data, target = Variable(data), Variable(target)
        if cuda:
            data, target = data.cuda(), target.cuda()
        opt1.zero_grad()
        opt2.zero_grad()

        # forward pass
        features = extract_features(fullres, data)
        attention_map1 = att1(features)
        attention_map2 = att2(features)
        (region1, region1_coord) = crop_region(features, attention_map1)
        (region2, region2_coord) = crop_region(features, attention_map2)
        out1 = clas1(region1)
        out2 = (clas2(region2) + out1) / 2

        # loss
        loss1 = loss(out1, target)
        loss2 = loss(out2, target)
        loss_value = (loss1 + loss2) / 2
        reward_loss = ()

        # backward pass
        loss_value.backward()
        reward_grad = ()

        # weight update
        opt1.step()
        opt2.step()
        reward_opt()

        # log
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss_value.data[0]))
def test_extract_features():
    from feature_extraction import extract_features
    import numpy as np

    r = 5 * np.ones([3, 3])
    r[1, 1] = 0
    g = np.zeros([3, 3])
    b = [[ii for ii in range(0, 3)] for _ in range(0, 3)]
    img = np.zeros([3, 3, 3])
    img[:, :, 0] = r
    img[:, :, 1] = g
    img[:, :, 2] = b

    actual, labels = extract_features(img, 'gb', 'r', 'rg', pct_yellow=True)
    expected = [np.median(g), np.median(b), np.std(r), 5, 0, 0]
    assert np.array_equal(actual, expected)

    actual, labels = extract_features(img, 'r', 'r', 'r', omit=0)
    expected = [5, 0, 5]
    assert np.array_equal(actual, expected)
def create_classifier(image_dir, out_path):
    X_true, all_X_false = [], []
    img_path = file_paths(image_dir)
    for fname in img_path:
        seg = cv2.imread(fname)
        masks_true = get_masks_of_number(seg)
        masks_true = get_unique_masks(masks_true)
        X_true.extend(masks_true)
        masks_all = get_masks_of_segments(fill_black_pixels(seg))
        masks_all_false = filter(lambda x: not array_is_contained(x, masks_true), masks_all)
        all_X_false.extend(masks_all_false)

    X_true = extract_features(X_true)
    all_X_false = extract_features(all_X_false)
    X_train = np.concatenate((X_true, all_X_false), axis=0)
    y_train = np.concatenate((np.ones(X_true.shape[0]), np.zeros(all_X_false.shape[0])))

    n_true = sum(y_train)
    n_false = sum(y_train == 0)
    estimator = RandomForestClassifier(
        100,
        class_weight={1: n_true / (n_true + n_false),
                      0: n_false / (n_true + n_false)})
    estimator.fit(X_train, y_train)
    joblib.dump(estimator, out_path)
def train_file(file_path):
    path_object = pathlib.Path(HERE / 'outputs')
    if path_object.exists():
        shutil.rmtree(HERE / 'outputs')
    os.makedirs(HERE / 'outputs')

    tagged_df = utils.read_to_df(file_path)
    tagged_df = pre.preprocess(tagged_df)
    X = fe.extract_features(tagged_df, FEATURE_LIST)
    y = (tagged_df['cb_level'] == 3).astype(int)
    X = X.drop(columns=['id'])

    xgb_obj = xgb.XGBoost()
    xgb_obj.train(X, y)
    exp.explain_model(xgb_obj.model, X, False)
    utils.save_model(xgb_obj.model, os.path.join(HERE / 'outputs', 'XGBoost.pkl'))
def predict(self):
    # extract features from the existing skeleton frames, run prediction
    # and print the top 5 predictions
    sample = np.array(self.skeletons)
    sample = np.swapaxes(sample, 0, 1)
    x = extract_features(sample)
    y_pred = self.predictor.predict_proba(x.reshape(1, -1))
    sorted_ind = np.argsort(y_pred[0])
    top_ind = np.flip(sorted_ind[-5:])
    top_probs = np.around(y_pred[0, top_ind], decimals=4)
    self.currentActivity = _class_names[top_ind[0]]
    print('Prediction results: ')
    for i in range(len(top_ind)):
        print(_class_names[top_ind[i]] + ': ' + str(top_probs[i]))
    print('')
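# A tiny, self-contained illustration of the top-k selection used above:
# np.argsort sorts ascending, so the last k indices are the k largest
# probabilities, and np.flip puts the best class first. The labels here are
# placeholders, not the project's _class_names.
import numpy as np

probs = np.array([0.05, 0.40, 0.10, 0.30, 0.15])
class_names = ['walk', 'run', 'sit', 'stand', 'jump']

top_ind = np.flip(np.argsort(probs)[-3:])   # indices of the 3 highest probabilities
for i in top_ind:
    print(class_names[i], np.around(probs[i], decimals=4))
# run 0.4 / stand 0.3 / jump 0.15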
def vsumm_frames_in_memory(video):
    segmentation = vs.VideoSegmentation(video)
    frames = segmentation.read_and_keep_frames()
    if len(frames) == 0:
        return False
    features = feat.extract_features(frames)
    keyframes = cl.find_clusters(features)

    summary_folder = 'summaryM-' + video[7:-4]
    if not os.path.isdir(summary_folder):
        os.mkdir(summary_folder)
    for k in keyframes:
        frame = frames[k.frame_id - 1]
        frame_name = summary_folder + '/frame-' + str(k.frame_id).zfill(6) + '.jpg'
        cv2.imwrite(frame_name, frame)
    return True
def load_or_train(X, y, force_train=False, enable_plot=False):
    """
    Load an existing one or train a new SVM classifier, and return it.
    Once the classifier is trained, it is saved through pickle.
    """
    clf_path = './clf.pkl'
    if not force_train and os.path.exists(clf_path):
        print "> loading from file..."
        clf = pickle.load(open(clf_path, 'rb'))
        print "> loaded"
    else:
        print "> training..."
        X = extract_features(X)

        ## Cross-validation
        cv = ShuffleSplit(len(X), n_iter=10, test_size=0.2, random_state=0)
        gammas = np.logspace(-6, -1, 10)

        ## Grid search
        clf = GridSearchCV(estimator=svm.SVR(), cv=cv, param_grid=dict(gamma=gammas),
                           n_jobs=1, verbose=10)
        clf.fit(X, y)

        ## Plot learning curve
        title = 'Learning Curves (SVM, linear kernel, $\gamma=%.6f$)' % clf.best_estimator_.gamma
        estimator = svm.SVC(kernel='linear', gamma=clf.best_estimator_.gamma)
        plot_learning_curve(estimator, title, X, y, cv=cv)
        plt.show()

        ## show information about the training
        if enable_plot:
            outputs = clf.predict(X)
            plot_confusion_matrix(y, outputs, range(0, 10))
            print_classification_report(y, outputs)

        ## Save the model
        pickle.dump(clf, open(clf_path, 'wb'))
    return clf
if __name__ == "__main__": # get trained network fnn = NetworkReader.readFrom(TRAINED_NN_FILE) # get data file_location = STEP_DETECTION_DATA_LOCATION + "/" accel_list = file.get_sensor_list(file_location + ACCEL_STEP_DETECTION_DATA) gyro_list = file.get_sensor_list(file_location + GYRO_STEP_DETECTION_DATA) compass_list = file.get_sensor_list(file_location + COMPASS_STEP_DETECTION_DATA) imu_list = sync_accel_gyro_compass(accel_list, gyro_list, compass_list) accel_y_dict = imu_list.extract_sensor_axis_list(ACCEL, Y_AXIS) accel_y = accel_y_dict[READINGS] timestamps = accel_y_dict[TIMESTAMPS] peaks_index = extract_peaks(accel_y) step_count = 0 index = 0 # while(index < len(peaks_index)-1): start = peaks_index[index] for j in range(index+1, len(peaks_index)): end = peaks_index[j] feature = extract_features(imu_list, start, end) result = fnn.activate(feature) print index, j, result
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NON INFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
#######################################################################################################################

import matplotlib.pyplot as pyplot

from data_loader import load_numbers
from feature_extraction import extract_features

## Load the numbers
print "Loading numbers..."
X, y = load_numbers()

## Extract the features
print "Extracting features..."
features = extract_features(X)

print "Prepare data..."
separated_by_classes = [[], [], [], [], [], [], [], [], [], []]
for i, c in enumerate(y):
    separated_by_classes[c].append(X[i])

## Box plot
print "Creating boxplot..."
boxplotElements = pyplot.boxplot(separated_by_classes)
# TODO: load experiment using params
train_path = ""
test_path = ""

# loads images from given paths
train = dataset.load_dataset(train_path)
test = dataset.load_dataset(test_path)

# extracts descriptors for train and test sets
train_descriptors = {item.path: feature_extraction.extract_descriptors(item.data) for item in train}
test_descriptors = {item.path: feature_extraction.extract_descriptors(item.data) for item in test}

# creates codebook (default size=300) based on train samples
codebook = feature_extraction.create_codebook(np.concatenate(train_descriptors.values()))

# generate feature vectors for train and test based on previously calculated codebook
train_features = {key: feature_extraction.extract_features(codebook, train_descriptors[key])
                  for key in train_descriptors}
test_features = {key: feature_extraction.extract_features(codebook, test_descriptors[key])
                 for key in test_descriptors}

# TODO: create a similarity matrix using all features

# persists features, codebook and similarity matrix
pickle.dump(train_features, open("train_features.pk", "wb"))
pickle.dump(test_features, open("test_features.pk", "wb"))
pickle.dump(codebook, open("codebook.pk", "wb"))

# creates index using LSHash and train set
searcher = Searcher(train_features)

# persists index for future use
pickle.dump(searcher, open("searcher.pk", "wb"))
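# create_codebook / extract_features are not shown above; the sketch below is a
# generic bag-of-visual-words version of that idea (an assumption, not the
# project's implementation): cluster local descriptors with KMeans to build a
# codebook, then describe each image by a histogram of its descriptors'
# nearest codewords. All data here is synthetic.
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
all_descriptors = rng.rand(500, 64)           # stand-in for stacked local descriptors

codebook = KMeans(n_clusters=300, n_init=1, random_state=0).fit(all_descriptors)

def bovw_histogram(codebook, descriptors):
    # assign each descriptor to its nearest codeword and count occurrences
    words = codebook.predict(descriptors)
    hist = np.bincount(words, minlength=codebook.n_clusters).astype(float)
    return hist / hist.sum()                  # L1-normalize so image sizes are comparable

image_descriptors = rng.rand(40, 64)          # descriptors of a single image
print(bovw_histogram(codebook, image_descriptors).shape)  # (300,)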
from feature_extraction import extract_features

## Vars
force_train = True
enable_plot = False

## Load and split the data in train and test sets
print "Loading data..."
X, y = load_numbers()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Get trained classifier
print "Training classifier..."
clf = load_or_train(X_train, y_train, force_train, enable_plot)
print clf

## Compute the features of the test set and predict
print "Predicting test set..."
features = extract_features(X_test)
y_pred = clf.predict(features)
print y_test
print y_pred

## Score
f1 = f1_score(y_test, y_pred)
print "f1-score is {}".format(f1)

#if enable_plot:
print_classification_report(y_test, y_pred)
plot_confusion_matrix(y_test, y_pred, range(0, 10))
def process_train_samples(samples, max_srch_size=10, each_saved_size=1000000):
    '''
    func: Process samples, including feature extraction and downsampling.
    Note: samples sharing a srch_id that contain one positive target are treated as one
    positive sample; otherwise they are negative samples.
    max_srch_size is the maximum number of rows kept per srch_id. For example, one srch_id
    in the training set may have 20 rows, but we randomly keep only up to 10 of them,
    which acts as downsampling.
    Because the training set is large, a file is written (and can be trained on) after
    every one million processed rows.
    '''
    # the training data is unordered, so sort it first so that rows with the same
    # srch_id end up next to each other
    sorted_samples = samples.sort_values(by=['srch_id'])  # group by srch_id
    sorted_samples = sorted_samples.reset_index(drop=True)  # reset row index
    processed_samples = pd.DataFrame()
    samples_in_one_srch = pd.DataFrame()
    # the loop checks whether the next row belongs to the same srch_id as the current one
    for r_idx, sample in sorted_samples.iterrows():
        if (r_idx + 1) % 1000 == 0:
            print "processed %i sample of %i " % (r_idx + 1, sorted_samples.shape[0])
        is_next_in_same_search = True
        samples_in_one_srch = pd.concat((sample.to_frame().transpose(), samples_in_one_srch), axis=0)
        current_srch_id = sample['srch_id']
        # last row
        if (r_idx + 1) == sorted_samples.shape[0]:
            is_next_in_same_search = False
        else:
            next_srch_id = sorted_samples['srch_id'][r_idx + 1]
            if current_srch_id != next_srch_id:
                is_next_in_same_search = False
        # a complete srch_id group has been collected, so extract its features
        # (with 16 GB of RAM this took about 8 hours; this part is slow)
        if not is_next_in_same_search:
            ## if the next one is not in the same search, process the samples of this search
            # feature extraction for samples
            ext_samples_in_one_srch = extract_features(samples_in_one_srch)
            # downsample: count how many samples belong to this srch_id
            n_samples = ext_samples_in_one_srch.shape[0]
            # e.g. with max_srch_size = 10, handle groups with more than 10 rows
            if n_samples > max_srch_size:
                # if too many samples in one search, do downsampling
                if np.any(ext_samples_in_one_srch['booking_bool']):
                    # this search has a positive sample (a 1 exists in booking_bool);
                    # rows with a hotel booking are positive samples and must be kept
                    pos_samples = ext_samples_in_one_srch[ext_samples_in_one_srch['booking_bool'] == 1]
                    neg_samples = ext_samples_in_one_srch[ext_samples_in_one_srch['booking_bool'] == 0]
                    # then randomly pick from the negative samples, e.g. with 28 rows,
                    # max_srch_size = 10 and 1 positive sample, pick 9 of the remaining rows
                    selected_neg_samples = neg_samples.sample(n=max_srch_size - pos_samples.shape[0])
                    selected_samples = pd.concat((pos_samples, selected_neg_samples), axis=0)
                else:
                    # no positive sample in this search, so randomly select max_srch_size rows
                    selected_samples = ext_samples_in_one_srch.sample(n=max_srch_size)
            else:
                selected_samples = ext_samples_in_one_srch.copy()
            processed_samples = pd.concat((processed_samples, selected_samples), axis=0)
            # create new samples for the next search
            samples_in_one_srch = pd.DataFrame()
        # save the processed samples every one million rows
        if (r_idx + 1) % each_saved_size == 0:
            # save samples every each_saved_size rows
            save_file_name = 'proc_train_samples_%i.csv' % (r_idx + 1)
            save_path = get_paths()['proc_train_path']
            if not os.path.exists(save_path):
                os.mkdir(save_path)
            if np.any(np.isnan(processed_samples.values)):
                # remove nan
                processed_samples = processed_samples.fillna(value=0)
                print "remove nan."
            processed_samples.to_csv(os.path.join(save_path, save_file_name), index=None)
    # out of the loop: save all remaining processed samples
    save_file_name = 'proc_train_samples_%i.csv' % (r_idx + 1)
    save_path = get_paths()['proc_train_path']
    if np.any(np.isnan(processed_samples.values)):
        # remove nan
        processed_samples = processed_samples.fillna(value=0)
        print "remove nan."
    processed_samples.to_csv(os.path.join(save_path, save_file_name), index=None)
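# A self-contained sketch of the per-search downsampling idea above, written with a
# pandas groupby instead of the manual row-by-row scan (an alternative formulation,
# not the original author's code); the srch_id / booking_bool column names follow the
# schema used above, and the data is synthetic.
import numpy as np
import pandas as pd

def downsample_search(group, max_srch_size=3):
    if len(group) <= max_srch_size:
        return group
    pos = group[group['booking_bool'] == 1]
    neg = group[group['booking_bool'] == 0]
    if len(pos) == 0:
        return group.sample(n=max_srch_size, random_state=0)
    # keep every positive row and fill the remaining slots with random negatives
    return pd.concat((pos, neg.sample(n=max_srch_size - len(pos), random_state=0)), axis=0)

df = pd.DataFrame({
    'srch_id': [1] * 6 + [2] * 2,
    'booking_bool': [1, 0, 0, 0, 0, 0, 0, 1],
})
downsampled = df.groupby('srch_id', group_keys=False).apply(downsample_search)
print(downsampled)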