def data_normalize():
    # Normalize the feature vector stored in column 3 of every train/test row.
    for row in train_rows:
        row[3] = feas.normalize(row[3])
        # for i in range(len(row[3])):
        #     row[3][i] *= fea_weight[i]
    for row in test_rows:
        row[3] = feas.normalize(row[3])
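
# feas.normalize is not shown in this file; the helper below is a purely
# hypothetical sketch of what such a per-vector normalizer might look like,
# assuming simple min-max scaling into [0, 1]. The real feas module may do
# something different (e.g. z-scoring).
def normalize_minmax(vec):
    # Scale one feature vector into [0, 1]; a constant vector is returned
    # unchanged to avoid division by zero.
    lo, hi = min(vec), max(vec)
    if hi == lo:
        return list(vec)
    return [(v - lo) / float(hi - lo) for v in vec]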
def predict():
    """ Predict sleep vs. non-sleep """
    start_time = time.time()
    device_uuid = request.args.get('deviceUUID')
    date_time = request.args.get('datetime')

    # Load the trained classifier and the per-device baseline.
    model_filename = get_model_filename(device_uuid, date_time)
    with open(model_filename, 'rb') as f:
        clf = pickle.load(f)
    baseline_filename = get_baseline_filename(device_uuid, date_time)
    with open(baseline_filename, 'rb') as f:
        baseline = pickle.load(f)

    data_filename = get_data_filename(device_uuid, date_time)
    with open(data_filename, 'r') as f:
        rows = f.readlines()
    if len(rows) < config.prediction_data_size:
        return jsonify({
            "status": 1,
            "message": "Not enough data! %d" % len(rows)
        })

    # Parse the most recent prediction_data_size rows of raw sensor readings.
    raw = np.zeros((config.prediction_data_size, 3))
    for i, j in zip(range(config.prediction_data_size),
                    range(len(rows) - config.prediction_data_size, len(rows))):
        raw[i] = [int(val) for val in rows[j].strip().split(',')]

    norm = features.normalize(raw)
    temp_features = features.extract_multi_features(norm,
                                                    step=config.step_size,
                                                    x_len=config.sample_size)
    X = features.get_calibrated_features(temp_features, baseline['features'])

    # Disabled: earlier weighted-prediction path kept for reference.
    """
    json_ = request.json
    n_features = X.shape[1]
    feature_importance = np.zeros(n_features)
    if json_ and 'feature_importance' in json_:
        feature_importance[0] = json_['feature_importance']['flex']
        feature_importance[1] = json_['feature_importance']['eda']
        for i in range(2, n_features):
            feature_importance[i] = json_['feature_importance']['ecg'] / float(n_features - 2)
    else:
        feature_importance[0] = 1 / 3.
        feature_importance[1] = 1 / 3.
        for i in range(2, n_features):
            feature_importance[i] = 1 / float(3 * (n_features - 2))
    y = clf.predict(X, feature_importance)
    """

    # Outlier scores relative to the baseline recorded at training time.
    y = clf.decision_function(X) - baseline['hboss_base']
    return jsonify({
        "status": 0,
        # Cast numpy scalars to native floats so jsonify can serialize them.
        "sleep": [float(v) for v in y],
        "mean_sleep": float(np.mean(y)),
        "max_sleep": float(np.max(y)),
        "time": (time.time() - start_time)
    })
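
# Client-side sketch (separate from this server module) of how the endpoint
# above might be exercised. The route path, host/port, and parameter values
# are assumptions; they are not taken from this file.
def example_predict_request():
    import requests
    # Hypothetical URL; adjust to wherever predict() is actually routed.
    resp = requests.get('http://localhost:5000/predict',
                        params={'deviceUUID': 'device-123',
                                'datetime': '2016-01-01T00:00:00'})
    return resp.json()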
def train():
    """ Train the predictor on the data collected """
    start_time = time.time()
    device_uuid = request.args.get('deviceUUID')
    date_time = request.args.get('datetime')

    data_filename = get_data_filename(device_uuid, date_time)
    with open(data_filename, 'r') as f:
        rows = f.readlines()
    with open(config.awake_filename, 'rb') as f:
        awake_features = pickle.load(f)
    if len(rows) < config.min_train_data_size:
        return jsonify({
            "status": 1,
            "message": "Not enough training data! %d" % len(rows)
        })

    # Parse the raw sensor readings and extract calibrated features.
    raw = np.zeros((len(rows), 3))
    for i in range(len(rows)):
        raw[i] = [int(val) for val in rows[i].strip().split(',')]
    norm = features.normalize(raw)
    temp_features = features.extract_multi_features(norm,
                                                    step=config.step_size,
                                                    x_len=config.sample_size)
    baseline_features = features.get_baseline_features(temp_features)
    norm_features = features.get_calibrated_features(temp_features,
                                                     baseline_features)

    # Combine the pre-recorded awake features with the new recording, and
    # replace the second feature column with small positive noise.
    X = np.concatenate((awake_features, norm_features), axis=0)
    X[:, 1] = np.abs(np.random.normal(0, 0.01, len(X)))
    app.logger.info(
        'Training classifier using %d feature sets, each containing %d features'
        % (X.shape[0], X.shape[1]))

    clf = HBOS(contamination=0.05)
    clf.fit(X)
    model_filename = get_model_filename(device_uuid, date_time)
    with open(model_filename, 'wb') as f:
        pickle.dump(clf, f)

    # Record the minimum training score so predictions can later be reported
    # relative to it.
    pred = clf.decision_function(X)
    baseline = {'features': baseline_features, 'hboss_base': np.min(pred)}
    baseline_filename = get_baseline_filename(device_uuid, date_time)
    with open(baseline_filename, 'wb') as f:
        pickle.dump(baseline, f)

    return jsonify({"status": 0, "time": (time.time() - start_time)})
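
# Standalone sketch of the hboss_base idea above, on synthetic data: the
# minimum training outlier score is recorded so later predictions can be
# reported as offsets from it. This assumes HBOS is pyod's
# pyod.models.hbos.HBOS, which exposes fit() and decision_function() as used
# in train(); the data below merely stands in for the extracted feature sets.
def _hbos_baseline_demo():
    import numpy as np
    from pyod.models.hbos import HBOS

    rng = np.random.RandomState(0)
    X_demo = rng.normal(0, 1, size=(500, 8))

    clf_demo = HBOS(contamination=0.05)
    clf_demo.fit(X_demo)

    # decision_function returns one outlier score per sample; its training
    # minimum plays the role of baseline['hboss_base'].
    scores = clf_demo.decision_function(X_demo)
    hboss_base = np.min(scores)
    print(hboss_base, np.mean(scores - hboss_base))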
train_path = 'data/train.tsv'
submission_path = 'data/test.tsv'
n = 10000

# Cached tag and custom-content features computed in a previous step.
tags = pickle.load(open('cache/tagged.%s.pkl' % (n), 'rb'))
custom_contents = np.array(
    pickle.load(open('cache/custom_contents.%s.pkl' % (n), 'rb')))
submission_custom_contents = np.array(
    pickle.load(open('cache/s.custom_contents.pkl', 'rb')))
submission_tags = pickle.load(open('cache/s.tagged.pkl', 'rb'))

print 'Reading %s data' % (n)
data = data_io.read_data(train_path, n)
submission_data = data_io.read_data(submission_path, 10000)  # use all

contents = get_doc_contents(data[:, 2])
contents = [features.normalize(content) for content in contents]
Y = data[:, -1].astype(int)
bestwords = get_bestwords(contents, Y, 100000, n)

submission_contents = get_doc_contents(submission_data[:, 2])
submission_contents = [features.normalize(submission_content)
                       for submission_content in submission_contents]
X_submission_ids = submission_data[:, 1]

# TF-IDF over unigrams and bigrams, restricted to the selected vocabulary.
v = TfidfVectorizer(min_df=2, binary=True, norm='l2', smooth_idf=True,
                    sublinear_tf=True, strip_accents='unicode',
                    vocabulary=bestwords, ngram_range=(1, 2))
X = v.fit_transform(contents)
X_submission = v.transform(submission_contents)

del data
del submission_data
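
# The effect of passing vocabulary=bestwords above is easiest to see on a toy
# corpus. The sketch below is illustrative only: the word list is hand-picked
# and merely stands in for the chi-squared bestwords selection, and it assumes
# the standard scikit-learn TfidfVectorizer. Note that min_df is ignored when
# a fixed vocabulary is supplied.
def _tfidf_vocabulary_demo():
    from sklearn.feature_extraction.text import TfidfVectorizer

    docs = ["the quick brown fox", "the lazy brown dog", "quick quick brown fox"]
    vocab = ["quick", "brown", "fox", "quick brown"]   # stand-in for bestwords

    v_demo = TfidfVectorizer(binary=True, norm='l2', smooth_idf=True,
                             sublinear_tf=True, strip_accents='unicode',
                             vocabulary=vocab, ngram_range=(1, 2))
    X_demo = v_demo.fit_transform(docs)
    print(X_demo.shape)      # (3, 4): one column per vocabulary entry
    print(X_demo.toarray())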
def train(num_labels=None, gridcv=False, randomcv=False, kbest=None, rfecv=False):
    # Load the track library. Collect metadata labels. Generate a target
    # matrix. Load features for each track in the target matrix.
    libtracks = library.tracks()
    labels = collect_labels(libtracks, num_labels)
    tracklist, target = generate_target(labels)
    data = Dataset(features.normalize(features.matrix(tracklist)), target)
    feat_names = features.names()

    if kbest:
        reduce_kbest(data, feat_names, kbest)
    if rfecv:
        reduce_rfecv(data, feat_names)

    train, test = split_dataset(data, test_size=0.4, random_state=0)

    # A random forest should be able to handle the excessive dimensionality
    # of our dataset relative to the number of samples.
    clf = RandomForestClassifier(n_estimators=120, n_jobs=-1, verbose=1)

    if randomcv:
        print("random parameter search...")
        randomsearch(
            clf, train, 20, {
                "max_depth": [3, None],
                "max_features": scipy.stats.randint(50, 100),
                "min_samples_split": scipy.stats.randint(2, 11),
                "min_samples_leaf": scipy.stats.randint(1, 11),
                "bootstrap": [True, False],
                "criterion": ["gini", "entropy"]
            })
    if gridcv:
        print("grid parameter search...")
        gridsearch(
            clf, train, {
                "max_depth": [3, None],
                "max_features": [50, 75, 100],
                "min_samples_split": [2, 3, 10],
                "min_samples_leaf": [1, 3, 10],
                "bootstrap": [True, False],
                "criterion": ["gini", "entropy"]
            })

    print("training classifier...")
    clf.fit(*train)
    mean_importance = clf.feature_importances_.mean()

    # Measure prediction accuracy for the original training run.
    pred_target = clf.predict(test.input)
    orig_score = accuracy_score(test.target, pred_target)
    print("accuracy score with %d features: %.2f%%"
          % (len(feat_names), orig_score * 100.0))

    # Reduce the feature set.
    print("selecting best features...")
    sfm = SelectFromModel(clf, threshold='1.5*mean')
    sfm.fit(*train)

    # Print the names of the most important features.
    feature_subset = sfm.get_support(indices=True)
    for i in feature_subset:
        importance = clf.feature_importances_[i] / mean_importance
        print("  %.1f: '%s'" % (importance, feat_names[i]))

    # Make a new training set with just the useful features.
    print("preparing new training subset...")
    slim_train = transform_input(sfm, train)
    slim_test = transform_input(sfm, test)
    feat_names = [feat_names[i] for i in feature_subset]

    # Train a new classifier using the reduced feature set.
    print("training subset classifier...")
    clf_slim = RandomForestClassifier(n_estimators=120, n_jobs=-1, verbose=1)
    clf_slim.fit(*slim_train)

    # Measure accuracy of the retrained model.
    pred_slim = clf_slim.predict(slim_test.input)
    slim_score = accuracy_score(slim_test.target, pred_slim)
    print("subset accuracy with %d features: %.2f%%"
          % (len(feature_subset), slim_score * 100.0))
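
# Standalone sketch of the SelectFromModel pruning step above on synthetic
# data, assuming the standard scikit-learn API: the '1.5*mean' threshold keeps
# only the columns whose importance exceeds 1.5x the mean importance. The
# dataset shape and estimator settings here are illustrative, not the
# project's.
def _select_from_model_demo():
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import SelectFromModel

    X, y = make_classification(n_samples=200, n_features=50, n_informative=5,
                               random_state=0)
    forest = RandomForestClassifier(n_estimators=120, n_jobs=-1, random_state=0)
    forest.fit(X, y)

    # Refit on the same data inside SelectFromModel and keep the strongest
    # columns.
    sfm = SelectFromModel(forest, threshold='1.5*mean')
    sfm.fit(X, y)
    X_slim = sfm.transform(X)
    print("%s -> %s" % (X.shape, X_slim.shape))
    print(sfm.get_support(indices=True))   # indices of the retained columns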
        category_ = get_category(label)
        if category_ is None:
            return None
        else:
            categories.add(category_)
    return sorted(list(categories))


# Convert the instrument labels, and keep only the segments whose instruments
# are all covered by the taxonomy.
for i, feature in enumerate(features_):
    categories = get_categories(labels_[i])
    if categories is not None:
        features.append(feature)
        labels.append(categories)

features = np.array(features)
features = normalize(features)
labels = np.array(labels)

# Result summary
sum_scores = dict()

# (C, sigma)
svm_params = (2, 1)

selected_feats = {
    'Bass drum': [('obsir', 3), ('obsir', 2), ('mfcc', 0), ('mfcc', 10),
                  ('obsir', 4), ('obsir', 1), ('temporal_shape', 0),
                  ('temporal_shape', 3), ('obsir', 5), ('mfcc', 12),
                  ('spectral_shape', 1), ('obsir', 7), ('mfcc', 7),
                  ('spread', 0), ('spectral_shape', 2), ('mfcc', 1)],
    'Snare drum': [('obsir', 2), ('mfcc', 2), ('spectral_shape', 3),
                   ('spread', 0), ('mfcc', 4), ('lpc', 3), ('lpc', 5),
                   ('obsir', 3), ('flatness', 0), ('spectral_shape', 0),
                   ('mfcc', 0), ('lpc', 0), ('temporal_shape', 2),
                   ('obsir', 7), ('zcr', 0), ('obsir', 4)],
    'Hi-hat': [('lpc', 0), ('temporal_shape', 2), ('mfcc', 4), ('obsir', 8),
               ('lpc', 5), ('zcr', 0), ('lpc', 1), ('obsir', 2),
               ('temporal_shape', 1), ('lpc', 3), ('lpc', 2),
               ('spectral_shape', 3), ('mfcc', 9), ('mfcc', 10),
               ('mfcc', 11), ('energy', 0)],
}

number_feats = {