import math

import numpy
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE

import data

# testModel, selectFeatures, predictEffort, model_wouldtake_svm, pearson,
# linearRegression, MIC, stability and randomForest are assumed to be
# defined elsewhere in this module.


def testWouldTake(predictor):
    X, Y, W, headings = data.loadFeaturesAndLabels(data.DV_Type.WOULD_TAKE)

    # Binarize the 0-19 would-take rating and remove ambiguous rows:
    # ratings in [threshold, 20 - threshold) count as ambiguous.
    print('removing ambiguous would take ratings')
    threshold = 8
    deleted = 0
    yes = 0
    no = 0
    # enumerate() keeps iterating over the original Y (numpy.delete rebinds
    # Y to a new array); `index` maps each original row to its position in
    # the shrinking arrays.
    for i, row in enumerate(Y):
        index = i - deleted
        if threshold <= row[0] < 20 - threshold:
            X = numpy.delete(X, index, 0)
            Y = numpy.delete(Y, index, 0)
            deleted += 1
        elif row[0] < threshold:
            Y[index, 0] = 0
            no += 1
        elif row[0] >= 20 - threshold:
            Y[index, 0] = 1
            yes += 1

    print('removed - ambiguous:', deleted)
    print('remaining:', X.shape[0],
          'yes:', yes / X.shape[0],
          'no:', no / X.shape[0])
    # testModel(X, Y, None, headings, model_wouldtake_svm, predictWouldTakeSVM, True)
    testModel(X, Y, None, headings, model_wouldtake_svm, predictor, True, None)
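
# A minimal, self-contained sketch of the same binarization done with
# boolean masks instead of row-by-row deletion; the `ratings` array below
# is hypothetical and exists only to illustrate the thresholding above.
def _binarizeWouldTakeDemo():
    threshold = 8
    ratings = numpy.array([[3], [9], [11], [15], [19]], dtype=float)
    ambiguous = (ratings[:, 0] >= threshold) & (ratings[:, 0] < 20 - threshold)
    kept = ratings[~ambiguous]
    labels = (kept[:, 0] >= 20 - threshold).astype(int)  # 1 = yes, 0 = no
    print('kept:', kept.flatten(), 'labels:', labels)  # kept: [3. 15. 19.]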
def testModelEffort(model):
    X, Y, W, headings = data.loadFeaturesAndLabels(data.DV_Type.EFFORT_TRIMMED)

    # calculate weights from standard deviation
    weights = numpy.zeros((W.shape[0], 1), dtype=float)
    for i, sd in enumerate(W):
        weights[i, 0] = 1 - (sd / 30)

    if model is None:
        model = headings
    testModel(X, Y, weights, headings, model, predictEffort, False, W)
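
# Quick sanity check of the SD-to-weight mapping used above (1 - sd / 30):
# an SD of 0 yields full weight 1.0 and an SD of 30 yields 0.0. The sample
# SD values are made up for illustration.
def _effortWeightDemo():
    sds = numpy.array([0.0, 7.5, 15.0, 30.0])
    print(1 - (sds / 30))  # [1.   0.75 0.5  0.  ]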
def calcFeatureTable(dv_type):
    features, labels, weights, headings = data.loadFeaturesAndLabels(dv_type)

    # normalize features column-wise
    for i in range(0, features.shape[1]):
        features[:, i:i + 1] = preprocessing.normalize(features[:, i:i + 1],
                                                       axis=0)

    # add feature names to the first column of the table
    table = [['Features']]
    for heading in headings:
        table.append([heading])

    methods = [['Lin. Cor.', lambda X, Y: pearson(X, Y)],
               ['Lin. Reg.', lambda X, Y: linearRegression(X, Y, 0)],
               ['Lasso', lambda X, Y: linearRegression(X, Y, 1)],
               ['Ridge', lambda X, Y: linearRegression(X, Y, 2)],
               ['MIC', lambda X, Y: MIC(X, Y)],
               ['Stability', lambda X, Y: stability(X, Y)],
               ['Random Forest', lambda X, Y: randomForest(X, Y)]]
    for method in methods:
        print('applying', method[0])
        coefs = method[1](features, labels)

        # format coefs: some methods return the scores as a nested row
        # vector, so unwrap it
        vals = coefs
        try:
            if coefs.shape[1] == len(features[0]):
                vals = coefs[0]
        except (AttributeError, IndexError):
            pass

        # append method name to the header row
        table[0].append(method[0])
        # append the feature scores, truncated to three decimals
        for i, c in enumerate(vals):
            if math.isnan(c):
                c = 0
            table[1 + i].append(math.floor(c * 1000) / 1000)
    return table
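
# Hypothetical usage sketch: pretty-print the table returned by
# calcFeatureTable. The third-party `tabulate` package is an assumption
# here; plain print() of the rows works just as well.
def _printFeatureTableDemo(dv_type):
    table = calcFeatureTable(dv_type)
    try:
        from tabulate import tabulate
        print(tabulate(table[1:], headers=table[0]))
    except ImportError:
        for row in table:
            print(row)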
def RFE_effortRandomForest(numFeatures, formatted=False):
    X, Y, W, headings = data.loadFeaturesAndLabels(data.DV_Type.EFFORT_TRIMMED)
    Y = Y.flatten()

    # normalize features column-wise
    for i in range(0, X.shape[1]):
        X[:, i:i + 1] = preprocessing.normalize(X[:, i:i + 1], axis=0)

    clf = RandomForestRegressor()
    # recursively eliminate features until numFeatures remain; ranking_ is 1
    # for every kept feature and grows the earlier a feature was eliminated
    rfe = RFE(clf, n_features_to_select=numFeatures)
    rfe.fit(X, Y)

    # print features sorted by rank
    for i, val in enumerate(
            sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), headings))):
        if formatted:
            # emit quoted names in blocks of ten, ready to paste into a list
            if i % 10 == 0:
                print(i)
            print(" '" + val[1] + "',")
        else:
            print(i, val)
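
# A standalone RFE sketch on synthetic data, mirroring the call above. The
# regression problem is fabricated so that only features 0 and 3 carry
# signal; with real data the selected indices will of course differ.
def _rfeDemo():
    rng = numpy.random.RandomState(0)
    X = rng.rand(100, 8)
    y = 3 * X[:, 0] + X[:, 3] + 0.1 * rng.rand(100)
    rfe = RFE(RandomForestRegressor(n_estimators=20, random_state=0),
              n_features_to_select=2)
    rfe.fit(X, y)
    print('selected feature indices:', numpy.where(rfe.support_)[0])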
def RandomFeatureRanking(use_zweights=False):
    # NOTE: use_zweights is currently unused.
    dv = data.DV_Type.WORKLOAD_EFFORT
    X, Y, W, headings = data.loadFeaturesAndLabels(dv)

    # normalize features column-wise
    for i in range(0, X.shape[1]):
        X[:, i:i + 1] = preprocessing.normalize(X[:, i:i + 1], axis=0)

    # calculate weights from standard deviation
    weights = numpy.zeros((W.shape[0], 1), dtype=float)
    for i, sd in enumerate(W):
        weights[i, 0] = 1 - (sd / 30)

    # each row of `randoms` stores the mean error followed by the indices
    # of the three randomly drawn features that produced it
    randoms = numpy.zeros((1000, 4), dtype=float)
    for r in range(0, randoms.shape[0]):
        print("progress:", r / randoms.shape[0])
        test = numpy.arange(X.shape[1])
        numpy.random.shuffle(test)
        test = test[:randoms.shape[1] - 1]
        selected = [headings[i] for i in test]

        # filter out features based on prediction model
        X_, headings_ = selectFeatures(X, headings, selected)

        numIterations = 10
        meanMeanError = 0
        for i in range(0, numIterations):
            # use indices to split matrices randomly but in order
            split = 0.8
            X1, X2, indices = data.splitMatrixRandomly(X_, split)
            Y1, Y2, indices = data.splitMatrixRandomly(Y, split, indices)
            w1, w2, indices = data.splitMatrixRandomly(weights, split, indices)

            # flatten data
            Y1 = Y1.flatten()
            Y2 = Y2.flatten()
            w1 = w1.flatten()
            w2 = w2.flatten()

            rf = RandomForestRegressor()
            rf.fit(X1, Y1)
            pred = rf.predict(X2)

            # accumulate the mean absolute error on the held-out split
            diff = 0
            for p, val in enumerate(pred):
                diff += abs(val - Y2[p])
            # print(diff / Y2.shape[0])
            meanMeanError += diff / Y2.shape[0]

            if False:  # flip to True to plot predictions for each fold
                plt.xlim(0, max(Y2))
                plt.ylim(0, max(Y2))
                plt.plot(Y2, Y2, 'bo')
                plt.plot(Y2, pred, 'ro')
                plt.ylabel(str(dv))
                plt.show()

        randoms[r, 0] = meanMeanError / numIterations
        randoms[r, 1:] = test

    # sort the random subsets by their mean error, best first
    randoms = randoms[numpy.argsort(randoms[:, 0])]

    # collect the feature indices from the best-scoring rows
    test = []
    for r, row in enumerate(randoms):
        if len(test) > 10:
            break
        for l, val in enumerate(row):
            if l > 0:
                test.append(val)

    # map indices back to headings, avoiding duplicates
    selected = []
    for i in test:
        h = headings[int(i)]
        if h not in selected:
            selected.append(h)
            print("'" + h + "',")
    print("used top features #:", len(selected))

    # filter out features based on prediction model
    X_, headings_ = selectFeatures(X, headings, selected)

    numIterations = 10
    meanMeanError = 0
    for i in range(0, numIterations):
        # use indices to split matrices randomly but in order
        split = 0.6
        X1, X2, indices = data.splitMatrixRandomly(X_, split)
        Y1, Y2, indices = data.splitMatrixRandomly(Y, split, indices)
        w1, w2, indices = data.splitMatrixRandomly(weights, split, indices)

        # flatten data
        Y1 = Y1.flatten()
        Y2 = Y2.flatten()
        w1 = w1.flatten()
        w2 = w2.flatten()

        # the final evaluation weights samples by rating confidence
        rf = RandomForestRegressor()
        rf.fit(X1, Y1, w1)
        pred = rf.predict(X2)

        diff = 0
        for p, val in enumerate(pred):
            diff += abs(val - Y2[p])
        meanMeanError += diff / Y2.shape[0]
    print(meanMeanError / numIterations)

    plt.xlim(0, max(Y2))
    plt.ylim(0, max(Y2))
    plt.plot(Y2, Y2, 'bo')
    plt.plot(Y2, pred, 'ro')
    plt.ylabel(str(dv))
    plt.show()
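
# Sketch of the weighted fit used in the final evaluation above:
# scikit-learn's RandomForestRegressor.fit accepts sample_weight, so rows
# with a high rating SD pull the trees less. The synthetic data here is
# illustrative only.
def _weightedFitDemo():
    rng = numpy.random.RandomState(0)
    X = rng.rand(50, 3)
    y = X.sum(axis=1)
    w = 1 - rng.rand(50) * 0.5  # weights in (0.5, 1], mimicking 1 - sd / 30
    rf = RandomForestRegressor(n_estimators=10, random_state=0)
    rf.fit(X, y, sample_weight=w)
    print('training R^2:', rf.score(X, y))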