def classifyingAndSavingRadiusModels(country, thermalDataQueryFields, dbData):
    """Fit a radius-neighbours classifier on ``dbData`` and persist the results.

    The records in ``dbData`` are flattened into a DataFrame, two numeric
    feature columns and one label column are selected, the data is split
    80/20 into train and test parts, the features are standardised and a
    ``RadiusNeighborsClassifier`` is fitted and evaluated on the test part.
    Progress and evaluation results are printed to stdout.

    Three objects are handed to ``saveFile(country, filename, obj)``, in this
    order: the flattened DataFrame, the fitted scaler and the fitted
    classifier.

    Args:
        country: String used in the artefact filenames and passed on to
            ``saveFile``.
        thermalDataQueryFields: Dotted field path of the thermal-data value;
            dots are replaced by underscores to obtain the column name.
        dbData: Nested records accepted by ``pd.json_normalize``.

    Returns:
        The string ``"finished"``.
    """

    def artifactName(kind):
        # Every persisted artefact follows the same naming pattern.
        return "serialized_radius_" + kind + "_" + country + ".pck"

    # https://hackersandslackers.com/json-into-pandas-dataframes/
    # json_normalize joins nested keys with '.' by default; the data contains
    # float numbers, so '_' is used as the column separator instead.
    frame = pd.json_normalize(dbData, sep="_")
    print(frame.describe().T)
    print(frame.head())

    print("nr. of elements considered: ")
    print(len(frame))

    # The full frame is stored for later use by the api.
    saveFile(country, artifactName("dataframe"), frame)

    # Keep only the two feature columns. The cast to uint32 was introduced
    # because of memory issues.
    # TODO: does this work
    featureColumns = [
        'ratedDwelling_spatialData_totalFloorArea_value',
        thermalDataQueryFields.replace('.', '_'),
    ]
    features = pd.DataFrame(frame, columns=featureColumns).astype(np.uint32)
    print(features.head())

    ratings = pd.DataFrame(frame, columns=['awardedRating_ratingLevel'])
    print(ratings.head())

    # Labels are collapsed into a one-dimensional array before splitting.
    X_train, X_test, y_train, y_test = train_test_split(
        features, ratings.values.ravel(), test_size=0.20)
    print("split done")

    # The scaler is fitted on the training part only and then applied to
    # both parts.
    scaler = StandardScaler().fit(X_train)
    X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

    # The fitted scaler is stored for later use.
    saveFile(country, artifactName("scaler"), scaler)

    # 'Z' is the label given to test samples that have no training
    # neighbour within the radius.
    model = RadiusNeighborsClassifier(radius=5,
                                      metric='euclidean',
                                      weights='distance',
                                      outlier_label='Z')
    predictions = model.fit(X_train, y_train).predict(X_test)

    print("radius prediction")
    print(predictions)

    print("Accuracy radius classifier")
    print(confusion_matrix(y_test, predictions))
    # y_test is the ground truth
    print(classification_report(y_test, predictions))

    saveFile(country, artifactName("classifier"), model)
    return "finished"
# Example no. 2
# 0
#    here, 'persisted_models', country, filename)
#y_train = joblib.load(filepath)

#filename = "serialized_y_test_" + country + ".pck"
# filepath = os.path.join(
#    here, 'persisted_models', country, filename)
#y_test = joblib.load(filepath)

#print("loading data finished")

# Radius-neighbours classification: fit on the training split, predict the
# test split and print the evaluation.
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.RadiusNeighborsClassifier.html
# NOTE(review): X_train, X_test, y_train, y_test and country are not defined
# in this excerpt - presumably they are set up earlier in the script; confirm.

classifier_radius = RadiusNeighborsClassifier(
    radius=5, metric='euclidean', weights='distance')
# 'Z' is the label given to samples that have no training neighbour within
# the radius.
classifier_radius.set_params(outlier_label='Z')
classifier_radius.fit(X_train, y_train)
y_pred_radius = classifier_radius.predict(X_test)

print("radius prediction")
print(y_pred_radius)

# y_test is passed as the ground truth, y_pred_radius as the prediction.
print("Accuracy radius classifier")
print(confusion_matrix(y_test, y_pred_radius))
print(classification_report(y_test, y_pred_radius))

# File name under which the fitted classifier is to be stored.
# NOTE(review): the actual write happens below this excerpt, presumably via
# joblib - confirm.
filenameRadiusClassifier = "serialized_radius_classifier_" + country + ".pck"

here = os.path.dirname(os.path.abspath(__file__))
filepathRadiusClassifier = os.path.join(