Exemple #1
0
def runCrossValidation(runSMOTE = True, runIQR = True):
    """Load the training data, optionally preprocess it, and cross-validate.

    The feature table and the label table are read from fixed CSV paths and
    indexed by their 'classid' column. Depending on the flags, the data is
    oversampled with SMOTE and/or passed through ``interquantileRange``
    before being handed to ``crossvalidate``.

    Args:
        runSMOTE: when true, resample features and labels with SMOTE.
        runIQR: when true, call ``interquantileRange`` on the feature frame.

    Returns:
        Whatever ``crossvalidate`` returns for the feature matrix and the
        flattened label vector.
    """
    datasetFile = 'data/source-code-metrics_train.csv'
    labelsFile = 'data/bugs_train.csv'
    # Separate at semicolon instead of comma. The separator is passed by
    # keyword: pandas >= 2.0 rejects it as a positional argument.
    data = pd.read_csv(datasetFile, sep=';')
    labels = pd.read_csv(labelsFile, sep=';')
    data.set_index('classid', inplace=True)
    labels.set_index('classid', inplace=True)

    if runSMOTE:
        # Section: SMOTE for class balance
        from unbalanced_dataset import SMOTE  # , TomekLinks

        columns = list(data)
        smote = SMOTE(ratio=3, verbose=False, kind='regular')
        # `.values` replaces DataFrame.as_matrix(), removed in pandas 1.0.
        smox, smoy = smote.fit_transform(data.values, labels.values.ravel())
        # The frames are rebuilt from the resampled arrays, so the 'classid'
        # index is replaced by a default integer index from here on.
        data = pd.DataFrame(smox, columns=columns)
        labels = pd.DataFrame(smoy, columns=['bugs'])

    if runIQR:
        # Section: outlier detection
        from myOutlierDetection import interquantileRange
        # NOTE(review): the return value is discarded, so this presumably
        # modifies `data` in place -- confirm. If it drops rows, `labels`
        # is not filtered alongside it here.
        interquantileRange(data, perFeature=False)

    return crossvalidate(data.values, labels.values.ravel())
Exemple #2
0
def averageTrainTest(numRuns = 500):
    """Average the first two outputs of ``trainandtest`` over repeated runs.

    The feature table and the label table are read from fixed CSV paths,
    indexed by 'classid', resampled with SMOTE and passed through
    ``interquantileRange``. ``trainandtest`` is then called ``numRuns``
    times on the same prepared data.

    Args:
        numRuns: number of ``trainandtest`` repetitions to average over
            (defaults to 500).

    Returns:
        A 2-tuple: the mean of element 0 and the mean of element 1 of the
        ``trainandtest`` results.

    Raises:
        ValueError: if ``numRuns`` is less than 1.
    """
    if numRuns < 1:
        raise ValueError('numRuns must be at least 1')

    datasetFile = 'data/source-code-metrics_train.csv'
    labelsFile = 'data/bugs_train.csv'
    # Separate at semicolon instead of comma. The separator is passed by
    # keyword: pandas >= 2.0 rejects it as a positional argument.
    data = pd.read_csv(datasetFile, sep=';')
    labels = pd.read_csv(labelsFile, sep=';')
    data.set_index('classid', inplace=True)
    labels.set_index('classid', inplace=True)

    # Section: SMOTE for class balance
    from unbalanced_dataset import SMOTE  # , TomekLinks

    columns = list(data)
    smote = SMOTE(ratio=3, verbose=False, kind='regular')
    # `.values` replaces DataFrame.as_matrix(), removed in pandas 1.0.
    smox, smoy = smote.fit_transform(data.values, labels.values.ravel())
    # The frames are rebuilt from the resampled arrays, so the 'classid'
    # index is replaced by a default integer index from here on.
    data = pd.DataFrame(smox, columns=columns)
    labels = pd.DataFrame(smoy, columns=['bugs'])

    # Section: outlier detection
    from myOutlierDetection import interquantileRange
    # NOTE(review): the return value is discarded, so this presumably
    # modifies `data` in place -- confirm. If it drops rows, `labels`
    # is not filtered alongside it here.
    interquantileRange(data, perFeature=False)

    # Keep the run results under their own name instead of rebinding `data`.
    results = [trainandtest(data, labels) for _ in range(numRuns)]
    meanFirst = sum([run[0] for run in results]) / len(results)
    meanSecond = sum([run[1] for run in results]) / len(results)
    return (meanFirst, meanSecond)