def runCrossValidation(runSMOTE=True, runIQR=True):
    """Load the training data, optionally preprocess it, and cross-validate.

    The metrics and the bug labels are read from two semicolon-separated
    CSV files under ``data/`` and both are indexed by their ``classid``
    column before any preprocessing runs.

    Args:
        runSMOTE: When true, resample the data with SMOTE (ratio=3,
            kind='regular') before cross-validation. The resampled frames
            get a fresh default index; the ``classid`` index is not kept.
        runIQR: When true, call ``interquantileRange`` on the feature frame
            with ``perFeature=False``.

    Returns:
        Whatever ``crossvalidate`` returns for the feature matrix and the
        flattened label vector.
    """
    datasetFile = 'data/source-code-metrics_train.csv'
    labelsFile = 'data/bugs_train.csv'

    # Separate at semicolon instead of comma. `sep` is passed by keyword
    # because recent pandas releases reject it as a positional argument.
    data = pd.read_csv(datasetFile, sep=';')
    labels = pd.read_csv(labelsFile, sep=';')
    data.set_index('classid', inplace=True)
    labels.set_index('classid', inplace=True)

    if runSMOTE:
        # Section: SMOTE for class balance
        from unbalanced_dataset import SMOTE  # , TomekLinks

        columns = list(data)
        smote = SMOTE(ratio=3, verbose=False, kind='regular')
        # `.values` replaces the removed `DataFrame.as_matrix()`.
        smox, smoy = smote.fit_transform(data.values, labels.values.ravel())
        data = pd.DataFrame(smox, columns=columns)
        labels = pd.DataFrame(smoy, columns=['bugs'])

    if runIQR:
        # Section: outlier detection
        from myOutlierDetection import interquantileRange

        # NOTE(review): the return value is ignored, so this presumably
        # modifies `data` in place -- confirm. If it drops rows, `labels`
        # is not filtered to match.
        interquantileRange(data, perFeature=False)

    return crossvalidate(data.values, labels.values.ravel())
def averageTrainTest(numRuns=500):
    """Average the two values returned by ``trainandtest`` over many runs.

    The training data is loaded from the semicolon-separated CSV files
    under ``data/``, indexed by ``classid``, resampled with SMOTE
    (ratio=3, kind='regular') and passed through ``interquantileRange``.
    ``trainandtest`` is then called ``numRuns`` times on the same
    preprocessed frames.

    Args:
        numRuns: Number of ``trainandtest`` repetitions to average over.
            Defaults to 500, the value that used to be hard-coded.

    Returns:
        A 2-tuple: the mean of element 0 and the mean of element 1 of the
        ``trainandtest`` results.

    Raises:
        ValueError: If ``numRuns`` is smaller than 1.
    """
    if numRuns < 1:
        # Guard against dividing by zero when averaging below.
        raise ValueError('numRuns must be at least 1')

    datasetFile = 'data/source-code-metrics_train.csv'
    labelsFile = 'data/bugs_train.csv'

    # Separate at semicolon instead of comma. `sep` is passed by keyword
    # because recent pandas releases reject it as a positional argument.
    data = pd.read_csv(datasetFile, sep=';')
    labels = pd.read_csv(labelsFile, sep=';')
    data.set_index('classid', inplace=True)
    labels.set_index('classid', inplace=True)

    # Section: SMOTE for class balance
    from unbalanced_dataset import SMOTE  # , TomekLinks

    columns = list(data)
    smote = SMOTE(ratio=3, verbose=False, kind='regular')
    # `.values` replaces the removed `DataFrame.as_matrix()`.
    smox, smoy = smote.fit_transform(data.values, labels.values.ravel())
    # The resampled frames get a fresh default index (classid is not kept).
    data = pd.DataFrame(smox, columns=columns)
    labels = pd.DataFrame(smoy, columns=['bugs'])

    # Section: outlier detection
    from myOutlierDetection import interquantileRange

    # NOTE(review): the return value is ignored, so this presumably
    # modifies `data` in place -- confirm. If it drops rows, `labels` is
    # not filtered to match.
    interquantileRange(data, perFeature=False)

    # Keep the results in their own name instead of rebinding `data`.
    results = [trainandtest(data, labels) for _ in range(numRuns)]
    firstMean = sum(result[0] for result in results) / len(results)
    secondMean = sum(result[1] for result in results) / len(results)
    return (firstMean, secondMean)