def tags_report(feats, num=15):
    # Compute a score for each tag and pick the ones that meet some threshold.
    tags = library.tags()
    meansquare = sum(len(v)**2 for v in tags.values()) / len(tags)
    significance = int(meansquare**0.5)
    tags = [(k, v) for k, v in tags.items() if len(v) > significance]
    # Compute the mean and standard deviation for each feature.
    # For each tag, compute the mean for each track associated with that tag.
    # Select features whose tag mean is more distant from the library mean
    # than the standard deviation.
    lib_mean = feats.mean(axis=0)
    lib_std = feats.std(axis=0)
    threshold = lib_std * 1.5
    # Get the index for each track.
    track_map = dict()
    for i, t in enumerate(library.tracks()):
        track_map[t.hash] = i
    # For each tag, make a mask with the indexes of its tracks.
    names = features.names()
    for tag, vals in tags:
        print("tag %s is associated with %d tracks" % (tag, len(vals)))
        indexes = np.array([track_map[t.hash] for t in vals])
        tag_mean = feats[indexes, :].mean(axis=0)
        outliers = np.argwhere(np.absolute(tag_mean - lib_mean) > threshold)
        for i in outliers[..., 0]:
            print("  %s local mean=%.2f; library mean=%.2f" %
                  (names[i], tag_mean[i], lib_mean[i]))
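# A hedged aside, not part of the original module: the "significance" cutoff
# above is the quadratic mean (RMS) of the per-tag track counts, which weights
# large tags more heavily than a plain average would. A tiny standalone sketch
# of that arithmetic, using made-up tag sizes:
def _rms_threshold_demo():
    sizes = [2, 3, 50, 4, 41]                  # hypothetical tracks-per-tag counts
    rms = (sum(n**2 for n in sizes) / len(sizes))**0.5
    avg = sum(sizes) / len(sizes)
    # RMS (~29) sits well above the arithmetic mean (20), so only the two
    # largest tags would clear the significance threshold.
    print("mean=%.1f rms=%.1f" % (avg, rms))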
def correlation_report(feats, num=20):
    # Plot the full correlation matrix.
    R = np.corrcoef(feats, rowvar=False)
    fig = plt.figure(1, figsize=(1280 / 64, 1280 / 64), dpi=96)
    plt.matshow(R)
    plt.gca().set_aspect(1.)
    plt.gca().axis('off')
    plt.savefig("correlation.png", dpi=96, bbox_inches='tight')
    # We only need half of this matrix, because it is symmetrical; blank out
    # the diagonal and lower triangle so they don't show up in the rankings.
    R[np.tril_indices_from(R)] = np.nan
    # We only care about the magnitude of correlation, not its direction.
    flatR = R.ravel()
    np.absolute(flatR, out=flatR, where=np.isfinite(flatR))
    ordering = np.argsort(flatR)
    ordering = np.compress(np.isfinite(flatR[ordering]), ordering)
    names = features.names()
    print("top %d most highly correlated variables" % num)
    for flat in ordering[::-1][:num]:
        pair = np.unravel_index(flat, R.shape)
        coeff = R[pair]
        print("  %s . %s: %s" % (names[pair[0]], names[pair[1]], ns(coeff)))
    print("bottom %d least highly correlated variables" % num)
    for flat in ordering[:num]:
        pair = np.unravel_index(flat, R.shape)
        coeff = R[pair]
        print("  %s . %s: %s" % (names[pair[0]], names[pair[1]], ns(coeff)))
def mean_stdev_limits_report(feats, *args, **kwargs):
    print("mean, stdev, and limits for each feature")
    names = features.names()
    for i in np.arange(feats.shape[-1]):
        feat = feats[:, i]
        minv, maxv = feat.min(), feat.max()
        meanv, stdv = feat.mean(), feat.std()
        print("%s: (%s .. %s); mean=%s, stdev=%s" %
              (names[i], ns(minv), ns(maxv), ns(meanv), ns(stdv)))
def normaltest_report(feats, num=20):
    # To what degree does each feature represent a normal distribution?
    numfeats = feats.shape[-1]
    statistic = np.zeros(numfeats)
    pvalue = np.zeros(numfeats)
    for i in np.arange(numfeats):
        s, p = scipy.stats.normaltest(feats[:, i])
        statistic[i] = s
        pvalue[i] = p
        print("  %s s=%s, p=%s" % (features.names()[i], ns(s), ns(p)))
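# Not part of the original module: a sketch of how the normaltest output could
# be summarized. scipy.stats.normaltest is the D'Agostino-Pearson test, which
# combines skew and kurtosis; a small p-value means the feature is unlikely to
# be normally distributed. Assumes the `pvalue` array and feature names built
# in the report above.
def _summarize_normaltest(pvalue, names, alpha=0.05):
    # Collect the features whose distributions reject normality at the
    # chosen (illustrative) significance level.
    non_normal = [names[i] for i in np.flatnonzero(pvalue < alpha)]
    print("%d of %d features reject normality at p<%.2f" %
          (len(non_normal), len(pvalue), alpha))
    return non_normal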
def kurtosis_report(feats, num=20):
    # Which are the most and the least gaussian features present?
    mean = feats.mean(axis=0)
    var = feats.var(axis=0)
    diffmean = feats - mean
    indexes = np.arange(feats.shape[-1])
    # Skip features that carry no information.
    usable = (mean != 0) & (var != 0)
    mean = np.compress(usable, mean)
    var = np.compress(usable, var)
    diffmean = np.compress(usable, diffmean, axis=1)
    indexes = np.compress(usable, indexes)
    # Excess kurtosis: fourth central moment over squared variance, minus 3.
    kurt = np.sum(diffmean**4, axis=0) / (feats.shape[0] * var**2) - 3.0
    ordering = np.argsort(kurt)
    print("top %d most gaussian features" % num)
    names = features.names()
    for i in ordering[::-1][:num]:
        print("  %s (%s)" % (names[indexes[i]], ns(kurt[i])))
    print("bottom %d least gaussian features" % num)
    for i in ordering[:num]:
        print("  %s (%s)" % (names[indexes[i]], ns(kurt[i])))
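# An illustrative cross-check, not part of the original module: scipy computes
# the same excess kurtosis directly, so the hand-rolled formula above can be
# verified against scipy.stats.kurtosis on synthetic data.
def _kurtosis_sanity_check():
    import scipy.stats
    rng = np.random.RandomState(0)
    x = rng.normal(size=(1000, 4))             # gaussian columns: excess kurtosis ~ 0
    diff = x - x.mean(axis=0)
    manual = np.sum(diff**4, axis=0) / (x.shape[0] * x.var(axis=0)**2) - 3.0
    # scipy's default (Fisher, biased moments) matches the formula above.
    assert np.allclose(manual, scipy.stats.kurtosis(x, axis=0, fisher=True))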
def scaled_mean_stdev_report(feats, *args, **kwargs):
    print("mean, stdev for each feature after minmax and power scaling")
    names = features.names()
    # Scale the limits so that all values fall within 0..1 for each feature.
    scaled = feats.copy()
    scaled -= scaled.min(axis=0)
    maxv = scaled.max(axis=0)
    nonzero = maxv.nonzero()[0]
    scaled[:, nonzero] /= maxv[nonzero]
    # Compute the linear average, then get the logarithm in that base of the
    # value 0.5. We will correct for distribution nonlinearity by raising every
    # scaled value to this power.
    meanv = scaled.mean(axis=0)
    powers = np.ones_like(meanv)
    powers[meanv.nonzero()] = np.log(0.5) / np.log(meanv[meanv.nonzero()])
    curved = scaled**powers
    # Print out a little report of what we found.
    for i in np.arange(feats.shape[-1]):
        lmean, lstd = curved[:, i].mean(), curved[:, i].std()
        print("%s: %s**%s = %s, dev=%s" %
              (names[i], ns(meanv[i]), ns(powers[i]), ns(lmean), ns(lstd)))
    # Plot the scaled and curved feature matrices.
    figsize = (feats.shape[0] / 96, 2 * feats.shape[1] / 96)
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=figsize)
    axes[0].matshow(scaled, cmap='gray')
    axes[0].axis('off')
    axes[0].set_aspect(1.0)
    axes[1].matshow(curved, cmap='gray')
    axes[1].axis('off')
    axes[1].set_aspect(1.0)
    plt.savefig("scaled_featmatrix.png", dpi=96, bbox_inches='tight')
    # Plot histograms of the scaled and curved features.
    hist_bins = 16
    figsize = (4, feats.shape[1] / 96)
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=figsize)
    histogram = np.zeros((feats.shape[1], hist_bins), dtype=float)
    for i in np.arange(feats.shape[1]):
        hist, edges = np.histogram(scaled[:, i], bins=hist_bins,
                                   range=(0, 1), density=True)
        histogram[i] = hist / hist.max()
    histogram = np.repeat(histogram, 128 // hist_bins, axis=1)
    histogram = np.pad(histogram, (8, 8), 'constant', constant_values=(0.5, 0.5))
    axes[0].matshow(histogram, cmap='gray')
    axes[0].axis('off')
    axes[0].set_aspect(1.0)
    histogram = np.zeros((feats.shape[1], hist_bins), dtype=float)
    for i in np.arange(feats.shape[1]):
        hist, edges = np.histogram(curved[:, i], bins=hist_bins,
                                   range=(0, 1), density=True)
        histogram[i] = hist / hist.max()
    histogram = np.repeat(histogram, 128 // hist_bins, axis=1)
    histogram = np.pad(histogram, (8, 8), 'constant', constant_values=(0.5, 0.5))
    axes[1].matshow(histogram, cmap='gray')
    axes[1].axis('off')
    axes[1].set_aspect(1.0)
    plt.savefig("scaled_featdist.png", dpi=96, bbox_inches='tight')
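# A small worked example, not part of the original module: the power chosen
# above, p = log(0.5) / log(m), is exactly the exponent that maps a feature's
# scaled mean m onto 0.5, pulling skewed 0..1 distributions toward the middle
# of the range. With a hypothetical scaled mean of 0.1:
def _power_curve_demo():
    m = 0.1
    p = np.log(0.5) / np.log(m)                # ~0.301
    assert abs(m**p - 0.5) < 1e-12             # the mean lands on 0.5 by construction
    print("exponent %.3f maps mean %.2f to %.2f" % (p, m, m**p))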
def print_feat(i, feats):
    # Print one feature's mean, deviation, and relative spread as a percentage.
    name = features.names()[i]
    avg = feats[:, i].mean()
    dev = feats[:, i].std()
    scale = (dev / np.abs(avg)) * 100.0
    print("  %s (%s . %s, %.2f%%)" % (name, ns(avg), ns(dev), scale))
def train(num_labels=None, gridcv=False, randomcv=False, kbest=None, rfecv=False):
    # Load the track library. Collect metadata labels. Generate a target
    # matrix. Load features for each track in the target matrix.
    libtracks = library.tracks()
    labels = collect_labels(libtracks, num_labels)
    tracklist, target = generate_target(labels)
    data = Dataset(features.normalize(features.matrix(tracklist)), target)
    feat_names = features.names()
    if kbest:
        reduce_kbest(data, feat_names, kbest)
    if rfecv:
        reduce_rfecv(data, feat_names)
    train, test = split_dataset(data, test_size=0.4, random_state=0)
    # A random forest should be able to handle the excessive dimensionality
    # of our dataset relative to the number of samples.
    clf = RandomForestClassifier(n_estimators=120, n_jobs=-1, verbose=1)
    if randomcv:
        print("random parameter search...")
        randomsearch(clf, train, 20, {
            "max_depth": [3, None],
            "max_features": scipy.stats.randint(50, 100),
            "min_samples_split": scipy.stats.randint(2, 11),
            "min_samples_leaf": scipy.stats.randint(1, 11),
            "bootstrap": [True, False],
            "criterion": ["gini", "entropy"]
        })
    if gridcv:
        print("grid parameter search...")
        gridsearch(clf, train, {
            "max_depth": [3, None],
            "max_features": [50, 75, 100],
            "min_samples_split": [2, 3, 10],
            "min_samples_leaf": [1, 3, 10],
            "bootstrap": [True, False],
            "criterion": ["gini", "entropy"]
        })
    print("training classifier...")
    clf.fit(*train)
    mean_importance = clf.feature_importances_.mean()
    # Measure prediction accuracy for the original training run.
    pred_target = clf.predict(test.input)
    orig_score = accuracy_score(test.target, pred_target)
    print("accuracy score with %d features: %.2f%%" %
          (len(feat_names), orig_score * 100.0))
    # Reduce the feature set.
    print("selecting best features...")
    sfm = SelectFromModel(clf, threshold='1.5*mean')
    sfm.fit(*train)
    # Print the names of the most important features.
    feature_subset = sfm.get_support(indices=True)
    for i in feature_subset:
        importance = clf.feature_importances_[i] / mean_importance
        print("  %.1f: '%s'" % (importance, feat_names[i]))
    # Make a new training set with just the useful features.
    print("preparing new training subset...")
    slim_train = transform_input(sfm, train)
    slim_test = transform_input(sfm, test)
    feat_names = [feat_names[i] for i in feature_subset]
    # Train a new classifier using the reduced feature set.
    print("training subset classifier...")
    clf_slim = RandomForestClassifier(n_estimators=120, n_jobs=-1, verbose=1)
    clf_slim.fit(*slim_train)
    # Measure accuracy of the retrained model.
    pred_slim = clf_slim.predict(slim_test.input)
    slim_score = accuracy_score(slim_test.target, pred_slim)
    print("subset accuracy with %d features: %.2f%%" %
          (len(feature_subset), slim_score * 100.0))
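# A minimal, self-contained sketch of the same "train, prune, retrain" pattern
# used in train() above, run on synthetic scikit-learn data rather than the
# track library; the dataset and parameter values here are illustrative only.
def _selectfrommodel_sketch():
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import SelectFromModel
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=400, n_features=60, n_informative=10,
                               random_state=0)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.4, random_state=0)
    # First pass: fit on everything and score the full feature set.
    clf = RandomForestClassifier(n_estimators=120, random_state=0).fit(X_tr, y_tr)
    print("full model: %.2f" % accuracy_score(y_te, clf.predict(X_te)))
    # Keep only features whose importance is at least 1.5x the mean importance.
    sfm = SelectFromModel(clf, threshold='1.5*mean', prefit=True)
    X_tr_slim, X_te_slim = sfm.transform(X_tr), sfm.transform(X_te)
    # Second pass: retrain on the pruned feature set and compare accuracy.
    slim = RandomForestClassifier(n_estimators=120, random_state=0).fit(X_tr_slim, y_tr)
    print("pruned model (%d features): %.2f" %
          (X_tr_slim.shape[1], accuracy_score(y_te, slim.predict(X_te_slim))))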