def predict(input_path, output_path, start_extract, end_extract):
    tmp_dir = 'tmp_files/'
    tmp_features = 'tmp.features'
    tmp_prob = 'tmp.prob'
    tmp_prediction = 'tmp.prediction'

    if not os.path.exists(input_path):
        print >> sys.stderr, "wav file does not exist"
        return

    length = utils.get_wav_file_length(input_path)
    feature_file = tmp_dir + tmp_features
    prob_file = tmp_dir + tmp_prob
    predict_file = tmp_dir + tmp_prediction

    # remove tmp dir if it exists, then recreate it
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.mkdir(tmp_dir)

    print '\n1) Extracting features and classifying ...'
    extract_features(input_path, feature_file, start_extract, end_extract)
    run(feature_file, prob_file)

    print '\n3) Extract Durations ...'
    post_process(prob_file, predict_file)

    print '\n4) Writing TextGrid file to %s ...' % output_path
    create_text_grid(predict_file, output_path, length, float(start_extract))
def main(argv):
    del argv  # Unused.
    if FLAGS.evaluate:
        extract_features(FLAGS)
    else:
        wandb.init(config=FLAGS, sync_tensorboard=True)
        set_up_train(FLAGS)
def main():
    """These are the main training settings. Set each before running this file."""
    if len(sys.argv) == 5:
        seq_length = int(sys.argv[1])
        class_limit = int(sys.argv[2])
        image_height = int(sys.argv[3])
        image_width = int(sys.argv[4])
    else:
        print("Usage: python train.py sequence_length class_limit image_height image_width")
        print("Example: python train.py 75 2 720 1280")
        exit(1)

    sequences_dir = os.path.join('data', 'sequences')
    if not os.path.exists(sequences_dir):
        os.mkdir(sequences_dir)

    checkpoints_dir = os.path.join('data', 'checkpoints')
    if not os.path.exists(checkpoints_dir):
        os.mkdir(checkpoints_dir)

    # model can only be 'lstm'
    model = 'lstm'
    saved_model = None       # None or weights file
    load_to_memory = False   # pre-load the sequences into memory
    batch_size = 10
    nb_epoch = 50
    data_type = 'features'
    image_shape = (image_height, image_width, 3)

    extract_features(seq_length=seq_length, class_limit=class_limit,
                     image_shape=image_shape)
    train(data_type, seq_length, model, saved_model=saved_model,
          class_limit=class_limit, image_shape=image_shape,
          load_to_memory=load_to_memory, batch_size=batch_size,
          nb_epoch=nb_epoch)
def main(argv):
    del argv  # Unused.
    if FLAGS.evaluate:
        extract_features(FLAGS)
    else:
        wandb.init(config=FLAGS, sync_tensorboard=True,
                   dir="/dfs/scratch2/prabhat8/cs236g/wandb")
        set_up_train(FLAGS)
def main():
    """Training and test settings for the algorithm."""
    model_name = 'lstm'
    saved_model = None
    cls_lmt = None
    batch_size = 32

    # The original wrapped raw_input in np.int64 before comparing to '',
    # which crashed on empty input and made the checks unreachable;
    # read the string first, then convert.
    prompt = "Enter the sequence length"
    seq_length = raw_input(prompt)
    if seq_length == '':
        seq_length = None
    else:
        seq_length = np.int64(seq_length)

    prompt = "Enter the maximum frame length"
    max_frames = raw_input(prompt)
    if max_frames == '':
        raise ValueError("Enter a valid integer for max frames")
    max_frames = np.int64(max_frames)

    prompt = "Enter the model_name from the list \n 1.LSTM \n 2.CNN_LSTM"
    model_name = raw_input(prompt)
    if model_name == '':
        model_name = None

    if model_name == 'LSTM':
        data_type = 'features'
        img_shape = None
        # Feature extraction
        extract_features(seq_length, max_frames, abspath=None)
    elif model_name == 'CNN_LSTM':
        data_type = 'images'
        img_shape = (100, 100, 3)
    else:
        raise ValueError(
            "Invalid Model Selected. Select from List given in the options")

    # Training the algorithm
    train(seq_length, model_name, saved_model, data_type, cls_lmt=cls_lmt,
          img_shape=img_shape, batch_size=batch_size, no_epoch=100)

    # Testing the algorithm
    saved_model = raw_input(
        "Enter the absolute path for the model saved after training")
    acc = test(model_name, saved_model, data_type, seq_length, cls_lmt=cls_lmt,
               img_shape=img_shape, batch_size=30)

    with open('Testfile.csv', 'a+') as f:
        op = model_name + ',' + str(seq_length) + ',' + str(max_frames) + \
             ',' + saved_model + ',' + acc
        f.write(op)
    return
def train():
    if request.method == 'POST':
        f = request.files.getlist("my_file[]")
        for i in f:
            i.save('raw/' + i.filename)
        makeinterim()
        makeprocessed()
        extract_features()
        report = train_pred()
        df = pd.DataFrame(report).transpose()
        return render_template('classification_report.html',
                               tables=[df.to_html()])
def main():
    foldername = 'data'
    X, X_test_kaggle, y, groups, lenc = load_data(foldername)
    X, X_test_kaggle = extract_features(X), extract_features(X_test_kaggle)

    classifiers = [
        LinearDiscriminantAnalysis(),
        SVC(kernel='linear', probability=True),
        SVC(kernel='rbf', probability=True),
        LogisticRegression(),
        RandomForestClassifier(n_estimators=1000, max_depth=4),
        KNeighborsClassifier(),
        ExtraTreesClassifier(n_estimators=1000, max_depth=4)
    ]
    names = [
        'LDA', 'SVC(linear)', 'SVC(rbf)', 'Logistic Regression',
        'Random Forest', 'KNeighbors', 'ExtraTrees'
    ]

    scores_by_clf = defaultdict(list)
    rs = GroupShuffleSplit(n_splits=100, test_size=0.2)
    for trindex, tsindex in rs.split(X, y, groups):
        X_train, y_train = X[trindex, :], y[trindex]
        X_test, y_test = X[tsindex, :], y[tsindex]

        print("Training set has classes: ", np.unique(y_train))
        amount = []
        for i in range(len(np.unique(y_train))):
            amount.append(len(np.argwhere(y_train == np.unique(y_train)[i])))
        print(amount)

        print("Testing set has classes: ", np.unique(y_test))
        amount = []
        for i in range(len(np.unique(y_test))):
            amount.append(len(np.argwhere(y_test == np.unique(y_test)[i])))
        print(amount)

        for i in range(len(classifiers)):
            clf = classifiers[i]
            pred, score, clf, proba = classify(clf, X_train, y_train,
                                               X_test, y_test, groups=groups)
            print(np.unique(pred))
            print(names[i], 'score: %.3f \n' % score)
            scores_by_clf[names[i]].append(score)

    for clfname in scores_by_clf.keys():
        print(clfname, np.mean(scores_by_clf[clfname]))
def main(args):
    print('===> args:\n', args)
    config_json = args.config_json
    image_dir = args.image_dir
    save_dir = args.save_dir
    image_list_file = args.image_list_file
    gpu_id = args.gpu

    extract_features(config_json, save_dir, image_list_file, image_dir,
                     gpu_id=gpu_id)
def build_features(scale_features=False):
    film_list = []
    feat_list = []
    qual_list = []
    count = 0
    with open('imsdb_ratings_processed.csv', 'rb') as csvfile:
        __ = csvfile.readline()  # skip the header row
        reader = csv.reader(csvfile)
        for row in reader:
            count += 1
            print "Film No.: " + str(count)
            #if count == 10:
            #    break
            #if get_status(row) == "Good" or get_status(row) == "Bad":
            file_name = file_base + row[0] + ".txt"
            print file_name
            scenes = format_script(file_name)
            if scenes is None:
                print "No file or formatting error"
                continue
            print "Num scenes: " + str(len(scenes))
            if len(scenes) < 5:
                print "Skipping..."
                continue

            # Segment-level features
            chunks = script_to_n_chunks(scenes)
            features = {}
            for idx, chunk in enumerate(chunks):
                # change input parameter to scenes for extract_features
                try:
                    chunk_features = extract_features(chunk, idx + 1)
                except:
                    # retry once before giving up on this chunk
                    try:
                        chunk_features = extract_features(chunk, idx + 1)
                    except:
                        print "Failed to extract features"
                        continue
                features.update(chunk_features)

            # Full-script features
            script_summary_features = get_summary_features(scenes, row[0])
            final_features = {}
            final_features.update(features)
            final_features.update(script_summary_features)

            feat_list.append(final_features)
            #if get_status(row) == "Good":
            qual_list.append(row[4])
            #else:
            #    qual_list.append(0)
            film_list.append(row[0])
    return film_list, feat_list, qual_list
def training(data, classifier, one_vs_all, apply_pca, tag_dict):
    ind_tags_phrases = individual_phrase_tags(data, tag_dict)
    ind_phrases, ind_tags, phrase_position = [], [], []
    for i in ind_tags_phrases:
        ind_phrases.append(i[0].split(',')[1])
        ind_tags.append(i[0].split(',')[0])
        phrase_position.append(i[1])
    print('train', len(ind_phrases))

    train_features = extract_features(ind_phrases, phrase_position, 'training')
    write_features_labels('./', ind_phrases, train_features, ind_tags,
                          'train_features')

    pca, scaler = '', ''
    if apply_pca == 'y':
        scaler = StandardScaler()
        scaler.fit(train_features)
        train_features = scaler.transform(train_features)
        pca = PCA(0.95)
        pca.fit(train_features)
        train_features = pca.transform(train_features)

    model = OneVsRestClassifier(RandomForestClassifier(n_estimators=100),
                                n_jobs=-1)
    model.fit(train_features, ind_tags)
    pred_tags = model.predict(train_features)
    accuracy = metrics.accuracy_score(ind_tags, pred_tags)
    return model, accuracy, pca, scaler
def evaluate_windows_of_size(self, windows, window_size):
    X = []
    for w in windows:
        window_yuv = self.cropped_frame_yuv[int(w.y1):int(w.y2),
                                            int(w.x1):int(w.x2)]
        if w.width() != window_size or w.height() != window_size:
            window_yuv = cv2.resize(window_yuv, (window_size, window_size))
        ppc = 8 if self.use_hires_classifier else 16
        X.append(extract_features(window_yuv, window_size, ppc=ppc))
    X = np.array(X)
    windows = np.array(windows)

    normalized_feature_vector = self.scaler[window_size].transform(X)
    score = self.classifier[window_size].predict(normalized_feature_vector)
    pos_window_indexes = np.where(score == 1.0)[0]
    pos_windows = windows[pos_window_indexes]
    pos_window_scores = score[pos_window_indexes]

    result = []
    self.evaluated_windows.extend(windows)
    for r, score in zip(pos_windows, pos_window_scores):
        result.append((r, score))
        if self.save_false_positives and self.is_false_positive_candidate(r):
            window_img = crop_img(self.cropped_frame, r.x1, r.y1, r.x2, r.y2)
            save_img(window_img,
                     "%s/%d/%04d-%04d" % (self.false_positive_dir_name,
                                          window_size, self.frame_count,
                                          self.false_positive_count))
            self.false_positive_count += 1
    return result
def BoltMotionObjToFeatureObj(all_bolt_data):
    """
    Pull out PCA components from all data

    For each object - pull out features and store in feature_obj
    with the same structure as all_bolt_data

    Dictionary - "tap", "slide", "slow_slide", "thermal_hold", "squeeze"
    """
    # Do PCA calculations here and store in feature class object
    all_features_obj_dict = dict()

    for motion_name in all_bolt_data:
        trial_list = all_bolt_data.get(motion_name)
        print motion_name
        feature_list = list()
        # For all objects
        for trial in trial_list:
            bolt_feature_obj = extract_features.extract_features(trial)
            feature_list.append(bolt_feature_obj)
        # Store all of the objects away
        all_features_obj_dict[motion_name] = feature_list

    return all_features_obj_dict
def compute_probability_vector(self, bolt_obj):
    if bolt_obj.state == bolt_obj.TAP:
        # Store results as they come in
        self.adjective_vectors = dict()
        self.all_motion_results = dict()

    # Store dictionary of strings
    self.state_string = {bolt_obj.DISABLED: 'disabled',
                         bolt_obj.THERMAL_HOLD: 'thermal_hold',
                         bolt_obj.SLIDE: 'slide',
                         bolt_obj.SQUEEZE: 'squeeze',
                         bolt_obj.TAP: 'tap',
                         bolt_obj.DONE: 'done',
                         bolt_obj.SLIDE_FAST: 'slide_fast',
                         bolt_obj.CENTER_GRIPPER: 'center_gripper'}
    current_motion = self.state_string[bolt_obj.state]

    # Create feature vector
    self.bolt_object = bolt_obj
    utilities.normalize_data(self.bolt_object)
    self.bolt_feature_object = extract_features.extract_features(
        self.bolt_object, self.pca_model[current_motion])

    # Compute a probability score for every adjective classifier
    for adj in self.all_classifiers:
        results, prob = utilities.compute_adjective_probability_score(
            self.all_classifiers[adj], self.bolt_feature_object,
            self.feature_list, adj, self.scaler_dict)

        # Store off adjective probabilities for ensemble
        if adj not in self.adjective_vectors:
            self.adjective_vectors[adj] = list()
        self.adjective_vectors[adj].append(prob)

        # Store classifier score based on best motion
        best_motion = self.best_motion_dict[adj][1]
        if current_motion == best_motion:
            rospy.loginfo("Best Motion is: %s" % best_motion)
            self.all_motion_results[adj] = results

    print len(self.adjective_vectors[adj])
    if len(self.adjective_vectors[adj]) == 5:
        ensembled_results = dict()
        print self.adjective_vectors
        #for adj in self.adjective_vectors:
        #    ensembled_results[adj] = self.ensemble_classifiers[adj].predict(self.adjective_vectors[adj])

        # Store off the adjectives that returned true
        adjectives_found = []
        for adj in self.all_motion_results:
            if self.all_motion_results[adj] == 1:
                adjectives_found.append(adj)

        print "Results from max classification"
        print self.all_motion_results
        print str(adjectives_found)
        self.adjectives_pub.publish(str(adjectives_found))
def BoltMotionObjToFeatureObj(all_bolt_data, electrode_pca_dict):
    """
    For each object - pull out features and store in feature_obj
    with the same structure as all_bolt_data

    Dictionary - "tap", "slide", "slow_slide", "thermal_hold", "squeeze"
    """
    # Store in feature class object
    all_features_obj_dict = dict()

    for motion_name in all_bolt_data:
        trial_list = all_bolt_data.get(motion_name)
        print motion_name
        feature_list = list()
        # For all objects
        for trial in trial_list:
            bolt_feature_obj = extract_features.extract_features(
                trial, electrode_pca_dict[motion_name])
            feature_list.append(bolt_feature_obj)
        # Store all of the objects away
        all_features_obj_dict[motion_name] = feature_list

    return all_features_obj_dict
def cascaded_retrieval(queries, ord_features, feature_dict, k_list, metric_dict):
    queries = preprocess_sketches(queries)
    results = []
    for query in queries:
        img_indices = np.arange(len(feature_dict[ord_features[0]]))
        for i, feature in enumerate(ord_features):
            sketch_features = extract_features(feature, query)
            if feature != "sift":
                image_features = np.asarray(feature_dict[feature])
                image_features = image_features[img_indices]
                sketch_features = np.array(sketch_features).reshape(1, -1)
                # print(f"Image: {np.array(image_features).shape}")
                # print(f"Sketch: {np.array(sketch_features).shape}")
            else:
                image_features = np.asarray(feature_dict[feature])
                image_features = [image_features[idx] for idx in img_indices]
                sketch_features = [sketch_features]
            distances = compute_distances(image_features, sketch_features,
                                          metric_dict[feature])
            top_results = get_top_results(k_list[i], distances)
            img_indices = img_indices[top_results[0]]
        results.append(img_indices)
    return results
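# A minimal usage sketch for cascaded_retrieval above. The feature names,
# k_list values, and the load_gallery_features()/load_sketches() helpers are
# illustrative assumptions, not part of the original code.
ord_features = ["hog", "sift"]            # coarse feature first, finer one second
feature_dict = load_gallery_features()    # hypothetical: {feature_name: per-image features}
k_list = [100, 10]                        # candidate pool shrinks at each cascade stage
metric_dict = {"hog": "euclidean", "sift": "cosine"}
matches = cascaded_retrieval(load_sketches(), ord_features, feature_dict,
                             k_list, metric_dict)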
def train_model():
    for files in file_paths:
        # Each speaker will have one features array
        features = np.asarray(())
        for filepath in files:
            print("Training: " + filepath)
            try:
                sr, audio = read(filepath)
                vector = extract_features(audio, sr)
            except Exception as e:
                print(e)
                continue
            if features.size == 0:
                features = vector
            else:
                try:
                    features = np.vstack((features, vector))
                except:
                    print("ValueError: Shape does not match")

        # Fit one GMM per speaker
        gmm = GMM(n_components=16, max_iter=200, covariance_type='diag',
                  n_init=3)
        gmm.fit(features)

        # Export the trained model
        picklefile = model + os.path.basename(filepath).split('_')[0] + ".gmm"
        with open(picklefile, 'wb') as gmm_file:
            pickle.dump(gmm, gmm_file)
        print('successfully modeled speaker:', picklefile,
              " with data points =", features.shape)
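# A minimal sketch of scoring an unknown utterance against the GMMs saved by
# train_model() above. The models_dir path is an assumption; read() and
# extract_features() are the same helpers used in train_model().
import glob
import pickle
import numpy as np

def identify_speaker(test_wav, models_dir='models/'):
    sr, audio = read(test_wav)
    vector = extract_features(audio, sr)
    best_name, best_score = None, -np.inf
    for gmm_path in glob.glob(models_dir + '*.gmm'):
        with open(gmm_path, 'rb') as f:
            gmm = pickle.load(f)
        score = gmm.score(vector)  # mean per-frame log-likelihood
        if score > best_score:
            best_name, best_score = gmm_path, score
    return best_name, best_score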
def process_file(img):
    """Process a single image: extract features and emotions.

    The process creates the user model as well.

    Args:
        img (EmotionalImage): image data
    """
    assert isinstance(img, Image)
    if str(img.name).find('json') > -1:
        return
    user = get_user(os.path.join(img.path, 'meta.json'))
    filePath = os.path.join(img.path, img.name)
    # logging.info("---------------Processing----------------", img.name)
    print("---------------Processing----------------", img.name)
    try:
        features = extract_features(filePath)
    except:
        logging.exception('Something went wrong with feature extraction')
        return
    try:
        emotions = predict_emotions(features)
    except:
        logging.exception('Something went wrong with emotion extraction')
        return
    uuid1 = uuid.uuid4()
    emImage = EmotionalImage(uuid1, img.name, img.path, features, emotions,
                             "", "", "")
    # TODO: fix that; currently adds one image at a time. Not a functional
    # issue, but it can be improved.
    user.images.append(emImage)
    user.save()
def compare_features(filepath):
    bboxes, face_features = extract_features(filepath)
    if len(bboxes) == 0:
        return []

    names, allfeatures = load_features()
    results = []
    for bbox, feature in zip(bboxes, face_features):
        # renamed from min/max to avoid shadowing the builtins
        min_dist = sys.float_info.max
        max_dist = 0.0
        nearest = None
        furthest = None
        for n, f in zip(names, allfeatures):
            dist = scipy.spatial.distance.cosine(feature, f)
            if dist < min_dist:
                nearest = n
                min_dist = dist
            if dist > max_dist:
                furthest = n
                max_dist = dist
        results.append({
            'face': bbox,
            'nearest': {'name': nearest, 'distance': min_dist},
            'furthest': {'name': furthest, 'distance': max_dist}
        })
    return results
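# A small usage sketch for compare_features; the image path is illustrative.
for match in compare_features('photos/group_shot.jpg'):
    print("face at %s -> nearest: %s (cosine distance %.3f)"
          % (match['face'], match['nearest']['name'],
             match['nearest']['distance']))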
def train(data_folder=TRAIN_FOLDER, labels_file=LABELS_FILE,
          model_prefix=MODEL_PREFIX, cdf_prediction=PREDICT_CDF,
          parameters=PARAMETERS):
    """Train ensemble."""
    random.seed(SEED)
    _, sys_lab, dia_lab, features = \
        extract_features.extract_features(data_folder, labels_file)
    assert len(sys_lab) == len(dia_lab) == features.shape[0]
    print 'total data', features.shape[0]
    print 'num features', features.shape[1]

    random.seed(SEED)
    test_prop = TEST_PROP
    test_size = int(features.shape[0] * test_prop)

    # random.shuffle returns None, so the original assignment discarded the
    # shuffled order and the data was never actually shuffled. Shuffle an
    # index list and reorder explicitly (assumes numpy-style arrays, as the
    # .shape usage above implies).
    shuffled_indices = range(features.shape[0])
    random.shuffle(shuffled_indices)
    sys_lab = sys_lab[shuffled_indices]
    dia_lab = dia_lab[shuffled_indices]
    features = features[shuffled_indices]

    test_sys_lab = sys_lab[:test_size]
    test_dia_lab = dia_lab[:test_size]
    test_features = features[:test_size]
    train_sys_lab = sys_lab[test_size:]
    train_dia_lab = dia_lab[test_size:]
    train_features = features[test_size:]
    print 'train size', train_features.shape[0]
    print 'test size', test_features.shape[0]

    systole_prefix = '%s_sys' % model_prefix
    diastole_prefix = '%s_dia' % model_prefix

    print 'training systole model'
    _train(train_features, train_sys_lab, test_features, test_sys_lab,
           model_prefix=systole_prefix, cdf_prediction=cdf_prediction,
           parameters=parameters)
    print 'training diastole model'
    _train(train_features, train_dia_lab, test_features, test_dia_lab,
           model_prefix=diastole_prefix, cdf_prediction=cdf_prediction,
           parameters=parameters)

    print 'final systole evaluation on test set'
    sys_crps = evaluate(test_features, test_sys_lab,
                        model_prefix=systole_prefix,
                        cdf_prediction=cdf_prediction)
    print 'final diastole evaluation on test set'
    dia_crps = evaluate(test_features, test_dia_lab,
                        model_prefix=diastole_prefix,
                        cdf_prediction=cdf_prediction)
    print 'test sys crps', sys_crps
    print 'test dia crps', dia_crps
    print 'overall test crps', (sys_crps + dia_crps) * 0.5
def evaluate_window(self, window):
    window_size = window.height()
    window_yuv = self.cropped_frame_yuv[int(window.y1):int(window.y2),
                                        int(window.x1):int(window.x2)]
    X = np.array(extract_features(window_yuv, window_size))
    normalized_feature_vector = self.scaler[window_size].transform(X)
    return self.classifier[window_size].predict(normalized_feature_vector)[0]
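# An illustrative sliding-window driver for evaluate_window. The Window box
# type, its constructor signature, and the detector/frame_width names are
# assumptions; only the evaluate_window call itself comes from the code above.
positives = []
for x in range(0, frame_width - 64, 16):
    w = Window(x1=x, y1=400, x2=x + 64, y2=464)  # hypothetical 64x64 candidate
    if detector.evaluate_window(w) == 1.0:
        positives.append(w)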
def define_segments(QLINK_URLS, UNKNOWN_URLS, QUOTA):
    global quota
    global clusterizer
    global df
    global classificator
    global toexist
    global standartizator

    quota = []
    y = np.empty(1000)
    threshold = 90
    nclusters = 15

    df = pd.DataFrame()
    df = df.append(exf.extract_features(QLINK_URLS), ignore_index=True)
    df = df.append(exf.extract_features(UNKNOWN_URLS), ignore_index=True)
    cnts = df.count()
    df = df.fillna(0)

    toexist = []
    for i in xrange(len(cnts)):
        if cnts[i] > threshold:
            toexist.append(df.columns[i])

    y[:500] = 1
    y[500:] = 0
    X = df[toexist].values

    # X1 = TSNE().fit_transform(X)
    # plt.scatter(X1[:, 0], X1[:, 1], c=y*8, cmap=plt.cm.get_cmap("jet", 10), s=1)
    # plt.colorbar(ticks=range(10))
    # plt.clim(-0.5, 9.5)
    # plt.show()

    standartizator = StandardScaler()
    X = standartizator.fit_transform(X)

    clusterizer = KMeans(n_clusters=nclusters)
    clusterizer.fit(X)

    #classificator = LDA(solver='lsqr').fit(X, y)
    #classificator = SVC()
    classificator = KNeighborsClassifier()
    classificator.fit(X, y)

    qlInCluster = []
    for i in xrange(nclusters):
        qlInCluster.append(sum(y[clusterizer.labels_ == i]))
        quota.append(90 * qlInCluster[i] + QUOTA / 100)
def daily_batch():
    print "**** [Step 1] Clean old data and prepare directories ****"
    try:
        shutil.rmtree('data/')
        os.makedirs('data/')
        os.makedirs('output/')
    except:
        pass

    print "**** [Step 2] Crawl news articles ****"
    grd.get_raw_data()

    print "**** [Step 3] Compute features for each news article ****"
    ef.extract_features(mode="batch")

    print "**** [Step 4] Compute the related news ****"
    print "(may take a long time)"
    fv, id_list = grn.get_feature_vectors(mode="batch")
    grn.ANN(fv, id_list)

    print "**** [Step 5] Loading data to Redis ****"
    ftr.load_data(mode="batch")

    print "DONE!"
def input_fn():
    # Load and parse the dataset
    dataset = tf.data.TFRecordDataset(filename, compression_type='GZIP')
    corpus = dataset.map(exp_TFR.decode, num_parallel_calls=8)
    corpus = corpus.shuffle(5000000)

    # Build the dictionary:
    # extract the top 60000 most common words to include in our embedding vector
    vocab_file_path = "Dataset/Vocabulary/vocabulary.txt"
    vocab_size = 60000

    # Gather together all the unique words and index them with a unique
    # integer value. Loop through every word in the dataset and assign it to
    # the unique integer word identifier. Any words not within the top 60000
    # most common words will be marked with "-1" and replaced with the "UNK"
    # token. Load the dictionary populated by keys corresponding to each
    # unique word.
    table = tf.contrib.lookup.index_table_from_file(
        vocabulary_file=vocab_file_path,
        vocab_size=vocab_size,
        key_column_index=1,
        delimiter=' ')

    # Create a reverse_table that allows us to look up a word based on its
    # unique integer identifier, rather than looking up the identifier based
    # on the word.
    # reverse_table = tf.contrib.lookup.index_to_string_table_from_file(
    #     vocabulary_file=vocab_file_path,
    #     vocab_size=vocab_size,
    #     value_column_index=1,
    #     delimiter=' ')

    # Load the ocean dictionary
    ocean_dict_file_path = "Dataset/Vocabulary/ocean_dict_filtered.txt"
    ocean_dict_size = 634  # 636 before (deleted 2 adjectives)

    # Ocean lookup table
    ocean_table = tf.contrib.lookup.index_table_from_file(
        vocabulary_file=ocean_dict_file_path,
        vocab_size=ocean_dict_size,
        key_column_index=0,
        delimiter='\t')

    # Extract labels and features and generate the dataset
    dataset = corpus.map(
        lambda ids, text: extract_features(ids, text, table, ocean_table),
        num_parallel_calls=8)

    # Delete all the sentences without an adjective
    # dataset = dataset.filter(lambda features, ocean_vector: tf.reduce_all(tf.is_finite(ocean_vector)))
    dataset = dataset.filter(lambda features, ocean_value: tf.reduce_all(
        tf.is_finite(ocean_value)))

    dataset = dataset.batch(batch_size=500)
    return dataset
def main():
    """These are the main training settings. Set each before running this file."""
    #os.environ['CUDA_VISIBLE_DEVICES'] = "0"
    start = time.time()
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
    if len(sys.argv) == 5:
        seq_length = int(sys.argv[1])
        class_limit = int(sys.argv[2])
        image_height = int(sys.argv[3])
        image_width = int(sys.argv[4])
    else:
        print("Usage: python train.py sequence_length class_limit image_height image_width")
        print("Example: python train.py 75 2 720 1280")
        exit(1)

    sequences_dir = os.path.join('data', 'sequences')
    if not os.path.exists(sequences_dir):
        os.mkdir(sequences_dir)

    checkpoints_dir = os.path.join('data', 'checkpoints')
    if not os.path.exists(checkpoints_dir):
        os.mkdir(checkpoints_dir)

    # model can only be 'lstm'
    model = 'lstm'
    saved_model = None       # None or weights file
    load_to_memory = False   # pre-load the sequences into memory
    batch_size = 16
    nb_epoch = 50
    data_type = 'features'
    image_shape = (image_height, image_width, 3)

    extract_features(seq_length=seq_length, class_limit=class_limit,
                     image_shape=image_shape)
    train(data_type, seq_length, model, saved_model=saved_model,
          class_limit=class_limit, image_shape=image_shape,
          load_to_memory=load_to_memory, batch_size=batch_size,
          nb_epoch=nb_epoch)
    print('time required {:0.3f}'.format(time.time() - start))
def main():
    # Load the data, get features, scale data
    foldername = 'data'
    X, X_test_kaggle, y, groups, lenc = load_data(foldername)
    X, X_test_kaggle = extract_features(X), extract_features(X_test_kaggle)
    print('the shape of X: ' + str(X.shape))

    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    X_test_kaggle = scaler.transform(X_test_kaggle)

    classifiers = [RandomForestClassifier(n_estimators=500, max_depth=4)]
    predictions, scores, classifiers, probabilities = classify_multi(
        classifiers, X, y, X_test_kaggle)
    print(predictions)

    # Write the Kaggle submission file
    labels = lenc.inverse_transform(predictions[:, 0].astype(int))
    with open('results/submission.csv', "w") as fp:
        fp.write("# Id,Surface\n")
        for i in range(len(labels)):
            fp.write("%d,%s\n" % (i, labels[i]))
def predict(wav_path, textgrid_path, start_extract, end_extract):
    tmp_feature_file = generate_tmp_filename('features')
    tmp_prob_file = generate_tmp_filename('prob')
    tmp_predict_file = generate_tmp_filename('prediction')

    if not os.path.exists(wav_path):
        print >> sys.stderr, "wav file %s does not exist" % wav_path
        return

    length = utils.get_wav_file_length(wav_path)

    print '\n1) Extracting features and classifying ...'
    extract_features(wav_path, tmp_feature_file, start_extract, end_extract)
    run(tmp_feature_file, tmp_prob_file)

    print '\n3) Extract Durations ...'
    post_process(tmp_prob_file, tmp_predict_file)

    print '\n4) Writing TextGrid file to %s ...' % textgrid_path
    create_text_grid(tmp_predict_file, textgrid_path, length,
                     float(start_extract))

    # remove leftovers
    os.remove(tmp_feature_file)
    os.remove(tmp_prob_file)
    os.remove(tmp_predict_file)
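# Example invocation of predict() above; the paths are illustrative, and the
# extraction window being in seconds is an assumption.
predict('recordings/utt01.wav', 'recordings/utt01.TextGrid',
        start_extract=0.0, end_extract=10.0)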
def main():
    import sys
    classifier = cv2.SVM()
    classifier.load(sys.argv[1])

    im = cv2.imread(sys.argv[2])
    im = cv2.GaussianBlur(im, (3, 3), 0)
    imshow_large(__file__, im)
    cv2.waitKey()

    unrotated = unrotate(im)

    #
    # Check if image is right way up, correct otherwise
    #
    imshow_large(__file__, unrotated)
    key = cv2.waitKey()
    if key & 0xff == ord("r"):
        unrotated = cv2.flip(cv2.flip(unrotated, 0), 1)
        imshow_large(__file__, unrotated)
        cv2.waitKey()

    binarized = binarize(unrotated)
    rois = get_rois(binarized)

    results = {}
    grayscale = cv2.cvtColor(unrotated, cv.CV_BGR2GRAY)
    for (x, y, width, height) in rois:
        roi = grayscale[y:y + height, x:x + width]
        vec = extract_features(roi)
        label = classifier.predict(vec)
        results[(x, y, width, height)] = "01234567890X"[int(label)]

    scale = SCREEN_HEIGHT / unrotated.shape[0]
    unrotated = cv2.cvtColor(grayscale, cv.CV_GRAY2BGR)
    scaled = cv2.resize(unrotated, (0, 0), fx=scale, fy=scale)
    for roi in rois:
        x = int(roi[0] * scale)
        y = int(roi[1] * scale)
        width = int(roi[2] * scale)
        height = int(roi[3] * scale)
        cv2.rectangle(scaled, (x, y), (x + width, y + height),
                      (0, 255, 0, 0), 1)
        if results[roi] == "X":
            continue
        cv2.putText(scaled, results[roi], (x, y), cv.CV_FONT_HERSHEY_SIMPLEX,
                    1, (0, 255, 0, 0))
    cv2.imshow(__file__, scaled)
    cv2.waitKey()
def test_classifier_parameters(classifier='classifier_name',
                               infile='features.npz', subcats=True,
                               norm=False):
    """
    Main function to determine the number of features to be used for
    classification and the classifier hyperparameters.

    classifier - classifier to be used, Naive Bayes or LDA
    infile - .npz file with the extracted features
    norm - Boolean parameter: whether to normalize the Naive Bayes for
           unbalanced class sizes
    """
    # extract numpy arrays, lists and dictionaries from features.npz
    extract_features(subcats=subcats)
    features_file = np.load(infile)
    features, featurenames, categoryids, categories = \
        features_file['features'], features_file['featurenames'], \
        features_file['categoryids'], features_file['categories'].item()
    labels = categoryids[0, :]
    features = features.T
    categoryids = categoryids.T

    if classifier == 'Naive Bayes':
        # if True, do the normalization for unbalanced class sizes
        if norm:
            features = normalize(features, categoryids, categories)
        classify('nb', features, labels, categories)
    else:
        classify('lda', features, labels, categories)
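# Example invocation (illustrative): tune the Naive Bayes variant with
# class-size normalization enabled.
test_classifier_parameters(classifier='Naive Bayes', infile='features.npz',
                           subcats=True, norm=True)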
def testing(data, model, pca, scaler, apply_pca):
    ind_tags_phrases = individual_phrase_tags(data)
    ind_phrases, ind_tags, phrase_position = [], [], []
    for i in ind_tags_phrases:
        ind_phrases.append(i[0].split(',')[1])
        ind_tags.append(i[0].split(',')[0])
        phrase_position.append(i[1])

    test_features = extract_features(ind_phrases, phrase_position, 'testing')
    if apply_pca == 'y':
        test_features = scaler.transform(test_features)
        test_features = pca.transform(test_features)

    pred_tags = model.predict(test_features)
    accuracy = metrics.accuracy_score(ind_tags, pred_tags)
    return accuracy
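# A minimal sketch wiring the training() and testing() functions above
# together; train_data, test_data and tag_dict are assumed inputs, and 'rf'
# is an illustrative classifier argument (training() hardcodes a random
# forest regardless).
model, train_acc, pca, scaler = training(train_data, 'rf', one_vs_all=True,
                                         apply_pca='y', tag_dict=tag_dict)
test_acc = testing(test_data, model, pca, scaler, apply_pca='y')
print('train accuracy: %.3f, test accuracy: %.3f' % (train_acc, test_acc))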
def fetch_url(url):
    global quota
    global clusterizer
    global df
    global standartizator

    d = exf.extract_features([url], toexist)[0].values()
    d = standartizator.transform([d])[0]
    cls = clusterizer.predict([d])[0]

    if classificator.predict_proba([d])[0][1] > 0.7:
        quota[cls] -= 1
        return True
    if quota[cls] > 0:
        quota[cls] -= 1
        return True
    else:
        return False
def get_test_data(self):
    # loads the features array
    file_list = os.listdir(os.path.join(self.test_path, 'video'))
    # with open(os.path.join(self.test_path, 'testing.txt')) as testing_file:
    #     lines = testing_file.readlines()
    # file_name = lines[self.num].strip()
    file_name = file_list[self.num]
    path = os.path.join(self.test_path, 'feat', file_name + '.npy')
    if os.path.exists(path):
        f = np.load(path)
    else:
        model = extract_features.model_cnn_load()
        f = extract_features.extract_features(file_name, model)
    # advance to the next file, wrapping around at the end of the list
    # (the original compared against len(file_list), which let self.num
    # run one past the last valid index)
    if self.num < len(file_list) - 1:
        self.num += 1
    else:
        self.num = 0
    return f, file_name
def calculateFoldability(seq, sec_str):
    #---------------------------#
    # Example input:
    #seq = 'GUAAGUCGGGGACCUCUUAAGAUGAGAGACUUCUGAACCGGGUCAGGAUCGGAAGAUAGCAGCCCUAAGGAAAGGCCUUUUGUGCUAAGAGUCUUCUCUGACUUAC'
    #sec_str = '..(.((((((((.(((((.(((((((((..((((.....((((....(((....)))....))))...))))....))))))).)))))))...)))))))))...'
    #---------------------------#
    real_RNA_loc = "RNASTRAND_real_feature_space.csv"
    folder_simulation_result = "/home/dhrumil/Desktop/Lab/RNAWebsite_v1/RNASTRAND_extract_feature/"
    #---------------------------#
    df_pred = extract_features.extract_features(seq, sec_str)
    clf = bulid_model(real_RNA_loc, folder_simulation_result)
    foldability = pred_foldability(df_pred, clf)
    fe, mfe, mfe_str = pred_fe(seq, sec_str)
    print('sequence:', seq)
    print('secondary structure:', sec_str)
    return foldability, fe, mfe, mfe_str
def validate(data_folder=VALIDATE_FOLDER, model_prefix=MODEL_PREFIX,
             cdf_prediction=PREDICT_CDF):
    """Use ensemble to predict CDFs on validation set."""
    study_ids, sys_lab, dia_lab, features = \
        extract_features.extract_features(data_folder)
    assert study_ids == tuple(range(501, 701))
    assert len(sys_lab) == len(dia_lab) == features.shape[0]
    print 'total data', features.shape[0]
    print 'num features', features.shape[1]

    systole_prefix = '%s_sys' % model_prefix
    diastole_prefix = '%s_dia' % model_prefix
    systole_cdfs = predict_cdfs(features, systole_prefix, cdf_prediction)
    diastole_cdfs = predict_cdfs(features, diastole_prefix, cdf_prediction)
    return systole_cdfs, diastole_cdfs
#print "\tProcessing: " + names[i*2][0]; f1 = INPUT_PATH + names[i*2][2] f2 = INPUT_PATH + names[i*2+1][2] name = names[i*2][0]; out = CHECK_PATH + name + '.fea.res'; check_f = CHECK_PATH + name + '.fea'; if name not in test_result: test_result[name] = (0,0,0, False); if os.path.exists(check_f) != True: print >> sys.stderr, "Output test file was not created"; continue; if os.path.exists(out): os.remove(out); t1 = time.time() extract_features.extract_features(f1, f2, out); t2 = time.time() if os.path.exists(out)!= True: print >> sys.stderr, "Output test file was not created"; continue; result = read_feas(out) pattern = read_feas(check_f) t = test_result[name][0] +(t2-t1); w = test_result[name][1] + compare_results(result, pattern); c = test_result[name][2] + float(len(result)) / float(len(pattern)); test_result[name] = (t, w, c , True ); count_fails=0; for name in test_result.keys():
model = model_from_json(json_string)
model.load_weights("model_weights/super_awesome_merged_model_weights.hdf5")
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Parse song
if len(sys.argv) < 2:
    print("missing parameter")
else:
    filename = sys.argv[1]
    # should get the directory to the file
    song_folder = os.path.dirname(os.path.realpath(filename))
    if os.path.isdir(filename):
        batch_thirty_seconds(song_folder)
        extract_features(song_folder)
    else:
        thirty_seconds(filename)
        print("File split. Now extracting features.")
        extract_features(song_folder)
        print("Extracted features.")

    keyword_2 = "mfcc_coefficients"
    x2 = []
    for root, dirs, files in os.walk(song_folder, topdown=False):
        for name in files:
            if re.search(keyword_2 + ".csv", name):
                song_path = os.path.join(root, name)
                song_features = genfromtxt(song_path, delimiter=",")
#
# Train with 3/4 of the sample + 1/4 of the error
#
error_count = sum([e.shape[0] for e in error])
keep_count = max(error_count * 3, npts_sampled - error_count)
if coords.shape[0] > keep_count:
    coords = coords[r.permutation(coords.shape[0])[:keep_count], :]
coords = np.vstack([coords] + error)
p = r.permutation(coords.shape[0])[:npts_sampled]
coords = coords[p, :]
coords = coords[np.lexsort(coords.transpose())]

if ("training_features" in tiffcvt.h5_file.keys() and
        (tiffcvt.h5_file["training_features"].shape[1] !=
         extract_features.n_features or
         tiffcvt.h5_file["training_features"].shape[0] != npts_sampled)):
    del tiffcvt.h5_file["training_features"]
    del tiffcvt.h5_file["training_classification"]

tf = tiffcvt.h5_file.require_dataset("training_features",
                                     (npts_sampled,
                                      extract_features.n_features),
                                     np.float32)
tc = tiffcvt.h5_file.require_dataset("training_classification",
                                     (npts_sampled, ), np.uint32)

for i in range(0, coords.shape[0], 1024):
    my_slice = slice(i, min(i + 1024, coords.shape[0]))
    tf[my_slice, :] = extract_features.extract_features(
        img, blur_img, coords[my_slice, :])
    tc[my_slice] = labels[coords[my_slice, 0],
                          coords[my_slice, 1],
                          coords[my_slice, 2]]
    print "Finished %d of %d" % (i + 1024, npts_sampled)

tiffcvt.h5_file.close()
def align_and_tune(self):
    extract_features(self.original_audio_folder,
                     self.original_features_folder, [features[0]], "jams")
    tune_wavs(self.original_audio_folder, self.tuned_audio_folder,
              self.original_features_folder)
parser.add_argument('-v', '--verbose', dest='verbose', action='store_true',
                    default=False, help='turn on verbose message output')
options = parser.parse_args()

# establish MongoDB connection
collection = myutils.get_mongodb_collection(options.hosts, options.database)

# load models for each label
models = test.load_models(collection['models'], ast.literal_eval(options.model))

cursor = myutils.get_mysql_connection(options.host, options.db).cursor()

# construct the testing set from the MediaWiki table
vectors = []
for ent in wikilove_revs.get_entries(cursor, options.start, options.end,
                                     options.window, options.limit,
                                     newest=True):
    features = extract_features.extract_features(
        {'entry': {'content': {'added': [ent.others.message], 'removed': []},
                   'comment': ''}})
    vector = myutils.map_key_dict(
        int, extract_features.extract_vector(features, options.bits))
    if ent.receiver_id != ent.sender_id:
        vectors.append(myutils.entry_t(ent, features, vector))

labels = sorted(models.keys())
vecs = [x.vector for x in vectors]
predictions = [[[] for y in xrange(0, len(labels))]
               for x in xrange(0, len(vectors))]
for (n, lname) in enumerate(labels):
    lab, _, val = liblinear.linearutil.predict([0] * len(vecs), vecs,
                                               models[lname], '-b 1')
    for (i, (pred, score)) in enumerate(zip(lab, val)):
        predictions[i][n] = score[1]  # get the confidence for the label being 'True'

print >> options.output, '<style type="text/css">.prediction{text-align: right;} td{vertical-align: top;} li{border: 1px solid; list-style: none inside; margin: 0.2em;} ul{padding: 0;} blockquote{ font: normal italic 100% serif; }</style>'
print >> options.output, '<body style="background: #EEE;">Generated at %s.' % str(datetime.now())
def realign_and_separate_and_analyze(self):
    extract_features(self.tuned_audio_folder, self.tuned_features_folder,
                     [features[0]], "jams")
    separate_channels(self.tuned_audio_folder, self.channels_audio_folder)
    copy_features_of_separated_channels(self.tuned_features_folder,
                                        self.channels_features_folder)
    extract_features(self.channels_audio_folder,
                     self.channels_features_folder, features[1:], "jams")