Example #1
def data_normalize():
    for row in train_rows:
        row[3] = feas.normalize(row[3])
        #for i in range(len(row[3])):
        #    row[3][i] *= fea_weight[i]
    for row in test_rows:
        row[3] = feas.normalize(row[3])
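The feas module is not included in the snippet, so the sketch below is only an assumption: a per-row min-max normalize plus the commented-out fea_weight scaling. Both normalize and fea_weight here are hypothetical stand-ins.

import numpy as np

# Hypothetical stand-in for feas.normalize: rescale a feature vector to [0, 1].
def normalize(vec):
    vec = np.asarray(vec, dtype=float)
    span = vec.max() - vec.min()
    return (vec - vec.min()) / span if span else np.zeros_like(vec)

# Optional per-feature weighting, mirroring the commented-out loop above.
fea_weight = np.array([1.0, 0.5, 2.0, 1.0])  # placeholder weights
row3 = normalize([1, 5, 3, 9]) * fea_weight
print(row3)  # [0.   0.25 0.5  1.  ]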
Example #3
def predict():
    """ Predict sleep vs. non-sleep """
    start_time = time.time()

    device_uuid, date_time = request.args.get('deviceUUID'), request.args.get(
        'datetime')
    model_filename = get_model_filename(device_uuid, date_time)
    with open(model_filename, 'rb') as f:
        clf = pickle.load(f)

    baseline_filename = get_baseline_filename(device_uuid, date_time)
    with open(baseline_filename, 'rb') as f:
        baseline = pickle.load(f)

    data_filename = get_data_filename(device_uuid, date_time)
    with open(data_filename, 'r') as f:
        rows = f.readlines()
    if len(rows) < config.prediction_data_size:
        return jsonify({
            "status": 1,
            "message": "Not enough data! %d" % len(rows)
        })
    raw = np.zeros((config.prediction_data_size, 3))
    for i, j in zip(range(config.prediction_data_size),
                    range(len(rows) - config.prediction_data_size, len(rows))):
        raw[i] = [int(val) for val in rows[j].strip().split(',')]
    norm = features.normalize(raw)
    temp_features = features.extract_multi_features(norm,
                                                    step=config.step_size,
                                                    x_len=config.sample_size)
    X = features.get_calibrated_features(temp_features, baseline['features'])
    """
    json_ = request.json
    n_features = X.shape[1]
    feature_importance = np.zeros(n_features)
    if json_ and 'feature_importance' in json_:
        feature_importance[0] = json_['feature_importance']['flex']
        feature_importance[1] = json_['feature_importance']['eda']
        for i in range(2, n_features):
            feature_importance[i] = json_['feature_importance']['ecg'] / float(n_features - 2)
    else:
        feature_importance[0] = 1 / 3.
        feature_importance[1] = 1 / 3.
        for i in range(2, n_features):
            feature_importance[i] = 1 / float(3 * (n_features - 2))
    y = clf.predict(X, feature_importance)
    """

    y = clf.decision_function(X) - baseline['hboss_base']

    return jsonify({
        "status": 0,
        "sleep": list(y),
        "mean_sleep": np.mean(y),
        "max_sleep": np.max(y),
        "time": (time.time() - start_time)
    })
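For orientation, a hypothetical client call to this Flask view; the route name, host, and port are not shown in the snippet, so '/predict' and localhost:5000 are assumptions.

import requests

# Route, host, and port are assumed, not taken from the snippet.
resp = requests.get(
    'http://localhost:5000/predict',
    params={'deviceUUID': 'demo-device', 'datetime': '2024-01-01'},
)
print(resp.json())  # expected keys: status, sleep, mean_sleep, max_sleep, time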
Example #4
def train():
    """ Train the predictor on the data collected """
    start_time = time.time()
    device_uuid, date_time = request.args.get('deviceUUID'), request.args.get(
        'datetime')
    data_filename = get_data_filename(device_uuid, date_time)

    with open(data_filename, 'r') as f:
        rows = f.readlines()
    with open(config.awake_filename, 'rb') as f:
        awake_features = pickle.load(f)
    if len(rows) < config.min_train_data_size:
        return jsonify({
            "status": 1,
            "message": "Not enough training data! %d" % len(rows)
        })
    raw = np.zeros((len(rows), 3))
    for i in range(len(rows)):
        raw[i] = [int(val) for val in rows[i].strip().split(',')]
    norm = features.normalize(raw)
    temp_features = features.extract_multi_features(norm,
                                                    step=config.step_size,
                                                    x_len=config.sample_size)
    baseline_features = features.get_baseline_features(temp_features)
    norm_features = features.get_calibrated_features(temp_features,
                                                     baseline_features)
    X = np.concatenate((awake_features, norm_features), axis=0)
    X[:, 1] = np.abs(np.random.normal(0, 0.01, len(X)))
    app.logger.info(
        'Training classifier using %d feature sets, each containing %d features'
        % (X.shape[0], X.shape[1]))
    clf = HBOS(contamination=0.05)
    clf.fit(X)

    model_filename = get_model_filename(device_uuid, date_time)
    with open(model_filename, 'wb') as f:
        pickle.dump(clf, f)

    pred = clf.decision_function(X)
    baseline = {'features': baseline_features, 'hboss_base': np.min(pred)}

    baseline_filename = get_baseline_filename(device_uuid, date_time)
    with open(baseline_filename, 'wb') as f:
        pickle.dump(baseline, f)

    return jsonify({"status": 0, "time": (time.time() - start_time)})
Example #5
    train_path = 'data/train.tsv'
    submission_path = 'data/test.tsv'
    
    n = 10000
    tags = pickle.load(open('cache/tagged.%s.pkl' % n, 'rb'))
    custom_contents = np.array(pickle.load(open('cache/custom_contents.%s.pkl' % n, 'rb')))
    submission_custom_contents = np.array(pickle.load(open('cache/s.custom_contents.pkl', 'rb')))
    submission_tags = pickle.load(open('cache/s.tagged.pkl', 'rb'))

    print('Reading %s data' % n)
    data = data_io.read_data(train_path, n)
    submission_data = data_io.read_data(submission_path, 10000)  # use all
    
    contents = get_doc_contents(data[:, 2])
    contents = [ features.normalize(content) for content in contents ]
    
    Y = data[:, -1].astype(int)
    
    bestwords = get_bestwords(contents, Y, 100000, n)
    
    submission_contents = get_doc_contents(submission_data[:, 2])
    submission_contents = [ features.normalize(submission_content) for submission_content in submission_contents ]
    X_submission_ids = submission_data[:, 1]
    
    v = TfidfVectorizer(min_df = 2, binary = True, norm = 'l2', smooth_idf = True, sublinear_tf = True, strip_accents = 'unicode', vocabulary = bestwords, ngram_range = (1,2))
    X = v.fit_transform(contents)
    X_submission = v.transform(submission_contents)
    
    del data
    del submission_data
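A small self-contained illustration of the vectorizer configuration above: when an explicit vocabulary is passed, TfidfVectorizer only scores those terms, while ngram_range still controls which n-grams are extracted. The documents and vocabulary below are made up for the demo.

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['the cat sat on the mat', 'dogs and cats are friends']
bestwords = ['cat', 'mat', 'dogs', 'cats', 'cat sat']  # toy vocabulary

v = TfidfVectorizer(binary=True, norm='l2', smooth_idf=True,
                    sublinear_tf=True, strip_accents='unicode',
                    vocabulary=bestwords, ngram_range=(1, 2))
X = v.fit_transform(docs)
print(X.shape)  # (2, 5): one column per vocabulary entry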
Example #6
def train(num_labels=None,
          gridcv=False,
          randomcv=False,
          kbest=None,
          rfecv=False):
    # Load the track library. Collect metadata labels. Generate a target
    # matrix. Load features for each track in the target matrix.
    libtracks = library.tracks()
    labels = collect_labels(libtracks, num_labels)
    tracklist, target = generate_target(labels)
    data = Dataset(features.normalize(features.matrix(tracklist)), target)
    feat_names = features.names()

    if kbest:
        reduce_kbest(data, feat_names, kbest)

    if rfecv:
        reduce_rfecv(data, feat_names)

    train, test = split_dataset(data, test_size=0.4, random_state=0)
    # A random forest should be able to handle the excessive dimensionality
    # of our dataset relative to the number of samples.
    clf = RandomForestClassifier(n_estimators=120, n_jobs=-1, verbose=1)

    if randomcv:
        print "random parameter search..."
        randomsearch(
            clf, train, 20, {
                "max_depth": [3, None],
                "max_features": scipy.stats.randint(50, 100),
                "min_samples_split": scipy.stats.randint(2, 11),
                "min_samples_leaf": scipy.stats.randint(1, 11),
                "bootstrap": [True, False],
                "criterion": ["gini", "entropy"]
            })

    if gridcv:
        print "grid parameter search..."
        gridsearch(
            clf, train, {
                "max_depth": [3, None],
                "max_features": [50, 75, 100],
                "min_samples_split": [2, 3, 10],
                "min_samples_leaf": [1, 3, 10],
                "bootstrap": [True, False],
                "criterion": ["gini", "entropy"]
            })

    print("training classifier...")
    clf.fit(*train)
    mean_importance = clf.feature_importances_.mean()
    # Measure prediction accuracy for the original training run.
    pred_target = clf.predict(test.input)
    orig_score = accuracy_score(test.target, pred_target)
    print("accuracy score with %d features: %.2f%%" %
          (len(feat_names), orig_score * 100.0))

    # Reduce the feature set.
    print("selecting best features...")
    sfm = SelectFromModel(clf, threshold='1.5*mean')
    sfm.fit(*train)
    # Print the names of the most important features
    feature_subset = sfm.get_support(indices=True)
    for i in feature_subset:
        importance = clf.feature_importances_[i] / mean_importance
        print "    %.1f: '%s'" % (importance, feat_names[i])

    # make a new training set with just the useful features.
    print("preparing new training subset...")
    slim_train = transform_input(sfm, train)
    slim_test = transform_input(sfm, test)
    feat_names = [feat_names[i] for i in feature_subset]

    # train a new classifier using the reduced feature set.
    print("training subset classifier...")
    clf_slim = RandomForestClassifier(n_estimators=120, n_jobs=-1, verbose=1)
    clf_slim.fit(*slim_train)

    # measure accuracy of the retrained models
    pred_slim = clf_slim.predict(slim_test.input)
    slim_score = accuracy_score(slim_test.target, pred_slim)
    print("subset accuracy with %d features: %.2f%%" %
          (len(feature_subset), slim_score * 100.0))
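A toy version of the SelectFromModel reduction step above, on synthetic data where only the first two features carry signal.

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 20))
y = (X[:, 0] + X[:, 1] > 0).astype(int)  # only the first two features matter

clf = RandomForestClassifier(n_estimators=120, n_jobs=-1)
clf.fit(X, y)

# Keep features whose importance exceeds 1.5x the mean importance.
sfm = SelectFromModel(clf, threshold='1.5*mean')
sfm.fit(X, y)
print(sfm.get_support(indices=True), sfm.transform(X).shape)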
Example #7
def get_categories(labels):
    categories = set()
    for label in labels:
        category_ = get_category(label)
        if category_ is None:
            return None
        else:
            categories.add(category_)
    return sorted(list(categories))

# Convert the instrument labels, and keep only the segments whose instruments are all covered by the taxonomy
for i, feature in enumerate(features_):
    categories = get_categories(labels_[i])
    if categories is not None:
        features.append(feature)
        labels.append(categories)

features = np.array(features)
features = normalize(features)
labels = np.array(labels)

# Result summary
sum_scores = dict()

# (C, sigma)
svm_params = (2, 1)

selected_feats = {
        'Bass drum': [('obsir', 3), ('obsir', 2), ('mfcc', 0), ('mfcc', 10), ('obsir', 4), ('obsir', 1), ('temporal_shape', 0), ('temporal_shape', 3), ('obsir', 5), ('mfcc', 12), ('spectral_shape', 1), ('obsir', 7), ('mfcc', 7), ('spread', 0), ('spectral_shape', 2), ('mfcc', 1)],
        'Snare drum': [('obsir', 2), ('mfcc', 2), ('spectral_shape', 3), ('spread', 0), ('mfcc', 4), ('lpc', 3), ('lpc', 5), ('obsir', 3), ('flatness', 0), ('spectral_shape', 0), ('mfcc', 0), ('lpc', 0), ('temporal_shape', 2), ('obsir', 7), ('zcr', 0), ('obsir', 4)],
        'Hi-hat': [('lpc', 0), ('temporal_shape', 2), ('mfcc', 4), ('obsir', 8), ('lpc', 5), ('zcr', 0), ('lpc', 1), ('obsir', 2), ('temporal_shape', 1), ('lpc', 3), ('lpc', 2), ('spectral_shape', 3), ('mfcc', 9), ('mfcc', 10), ('mfcc', 11), ('energy', 0)],
    }

number_feats = {