# Example no. 1
# 0
def main():
    """End-to-end train / evaluate / submit script for the West Nile Virus task.

    Reads the mosquito, weather and spray CSVs from ../input/, builds
    feature vectors, grid-searches a classifier with ROC-AUC scoring,
    optionally plots per-fold ROC curves, and writes test-set
    probabilities in chunks to ../output/out.csv.
    """
    # Run-stage switches for this execution.
    create_submission = True
    train_regression = False
    plot_roc = True

    print 'Reading data...',
    mosquitos_train_data, weather_data, spray_data = read_mosquitos_data('../input/')
    print 'Done'

    # NOTE(review): presumably merges the duplicated rows the raw train file
    # emits when a trap catches more than 50 mosquitoes -- confirm against
    # compact_train_data.
    mosquitos_train_data_dropped = compact_train_data(mosquitos_train_data)

    print 'Preprocessing weather data...',
    weather_data = pre_process_weather(weather_data)
    print 'Done'

    # Construct feature vector from the train data:
    if train_regression:
        # Optionally retrain the mosquito-count regressor whose output is
        # (presumably) used as a feature -- see train_regressor.
        print 'Training mosquitoes predictor regressor...' + bcolors.WARNING
        train_regressor()
        print bcolors.ENDC + 'Done'
    else:
        print bcolors.OKBLUE + 'Skipped training regressor (assuming already done in the past)' + bcolors.ENDC
    print 'Constructing feature vectors...',
    features = create_feature_vector(mosquitos_train_data_dropped, weather_data, spray_data, verbose=1)
    labels = get_labels(mosquitos_train_data_dropped)
    print 'Done'
    # Train data:
    # num_of_folds only feeds the commented-out StratifiedKFold alternative
    # below; the active split comes from the project-specific get_folds.
    num_of_folds = 4
    #kfold = StratifiedKFold(labels, n_folds=num_of_folds, shuffle=True)
    kfold = get_folds(mosquitos_train_data_dropped)
    estimator = create_pipeline()
    # Grid for the pipeline's final estimator; the parameter names
    # (n_estimators / min_samples_split) suggest a tree ensemble.
    param_grid = {'n_estimators': [500, 750, 1200, 2000],
                  'min_samples_split': [30, 40, 50]}
    # param_grid = {'n_estimators': [50, 100, 250, 500, 1000], 'max_depth': [6, 4], 'learning_rate': [0.01, 0.05, 0.1, 0.5], 'max_features': [1.0, 0.5]}
    # param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [0.1, 1, 10]}
    # features = scale(features)
    # Disabled class-balancing sample weights (kept for reference):
    # true_ratio = np.sum(labels == 1)/float(len(labels))
    # false_ratio = np.sum(labels == 0)/float(len(labels))
    # samples_weights = np.zeros(len(labels))
    # samples_weights[labels == 1] = false_ratio
    # samples_weights[labels == 0] = true_ratio

    grid_search_cv = GridSearchCV(estimator, param_grid, scoring='roc_auc', n_jobs=8, cv=kfold, iid=False, verbose=1)  #,
                                  # fit_params={'sample_weight': samples_weights}) #todo: decide about iid parameter...
    print 'Training on train data...',
    print bcolors.WARNING
    grid_search_cv.fit(features, labels)
    print bcolors.ENDC
    print 'Done'
    print bcolors.HEADER + bcolors.UNDERLINE + '\nClassifier scores:' + bcolors.ENDC
    print_cv_conclusions(grid_search_cv, features, labels)

    # best_estimator_ is already fit on the full training set by
    # GridSearchCV (refit defaults to True).
    estimator = grid_search_cv.best_estimator_

    if plot_roc:
        # One ROC curve per CV fold.  Each .fit() below overwrites the
        # estimator's previous fit, hence the full refit after the loop.
        for i, (train, test) in enumerate(kfold):
            print 'Fitting fold number', i
            probas = estimator.fit(features[train], labels[train]).predict_proba(features[test])
            # Compute ROC curve and area under the curve for this fold.
            fpr, tpr, thresholds = roc_curve(labels[test], probas[:, 1])
            # mean_tpr += interp(mean_fpr, fpr, tpr)
            # mean_tpr[0] = 0.0
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

        # Diagonal reference line = random classifier.
        plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

        # mean_tpr /= len(cv)
        # mean_tpr[-1] = 1.0
        # mean_auc = auc(mean_fpr, mean_tpr)
        # plt.plot(mean_fpr, mean_tpr, 'k--',
        #   #        label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic')
        plt.legend(loc="lower right")
        plt.show()

        print 'Refitting model to the entire train data'
        estimator.fit(features, labels)  # refit estimator to the entire data set

    if create_submission:
        print bcolors.HEADER + bcolors.UNDERLINE + '\nCreating submission file' + bcolors.ENDC
        mosquitoes_test_data = pd.read_csv('../input/test.csv')
        # NOTE(review): despite the name, this mask selects the *rare*
        # species (everything other than the three Culex types the model was
        # trained on); their WNV probability is forced to zero below.
        relevant_mosquitoes = (mosquitoes_test_data['Species'] != 'CULEX PIPIENS') & \
                              (mosquitoes_test_data['Species'] != 'CULEX RESTUANS') & \
                              (mosquitoes_test_data['Species'] != 'CULEX PIPIENS/RESTUANS')
        relevant_mosquitoes = np.nonzero(relevant_mosquitoes)[0]
        print 'Extracting test features...',
        test_features = create_feature_vector(mosquitoes_test_data, weather_data, spray_data)
        print 'Done'
        print 'Predicting probabilities...'
        end_pointer = 0
        start_pointer = 0
        m = test_features.shape[0]
        # Write the header row once; chunk rows are appended in the loop.
        output_dataframe = pd.DataFrame(columns=['Id', 'WnvPresent'])
        output_dataframe.to_csv('../output/out.csv', index=False)
        # Predict in chunks of 10000 rows to bound memory usage.
        while end_pointer < m:
            end_pointer = min(m, start_pointer + 10000)
            probabilities = np.array(estimator.predict_proba(test_features[start_pointer:end_pointer, :]))[:, 1]

            # Force zero probability for rare mosquitoes types
            indexes = relevant_mosquitoes[(relevant_mosquitoes >= start_pointer) & (relevant_mosquitoes < end_pointer)]
            probabilities[indexes - start_pointer] = 0

            # NOTE(review): assumes submission Ids are exactly 1..m in file
            # order -- confirm test.csv is sorted by Id.
            ids = np.arange(start_pointer+1, end_pointer+1)
            output_dataframe = pd.DataFrame(np.column_stack((ids, probabilities)), columns=['Id', 'WnvPresent'])
            output_dataframe[['Id']] = output_dataframe[['Id']].astype(int)
            start_pointer = end_pointer

            # Write to file:
            output_dataframe.to_csv('../output/out.csv', index=False, mode='a', header=False)
            print 'Finished ', end_pointer, ' items out of ', m
        print bcolors.BOLD + bcolors.OKBLUE + 'Submission file ready :)' + bcolors.ENDC
def separated_gaussian_model():
    """Exploratory model: fit mosquito counts as separated temporal and
    spatial components.

    Temporal part: weekly-aggregated counts are regressed on
    weekly-averaged weather via degree-2 polynomial ridge regression.
    Spatial part: per-trap, yearly-normalized counts are fit by a sum of
    2-D Gaussians whose packed parameters are optimized with BFGS.
    Both fitted models are pickled under ../output/ and diagnostic plots
    are shown.
    """
    # Arranging the data.
    # todo: write this in a prettier way (using groupby etc.) and put it in a function
    mosquitoes_data, weather_data, spray_data = read_mosquitos_data("../input/")
    weather_data = pre_process_weather(weather_data)

    # Extract temporal data: parse 'YYYY-MM-DD' date strings into columns.
    dates = np.array([x.split("-") for x in mosquitoes_data["Date"]])
    years = np.array(dates[:, 0], dtype=np.int)
    unique_years = np.unique(years)
    months = np.array(dates[:, 1], dtype=np.int)
    days = np.array(dates[:, 2], dtype=np.int)
    # Season months treated as 31 days except June and September (30).
    days_in_month = 31 * np.ones(months.shape, dtype=np.int)
    days_in_month[(months == 6) + (months == 9)] = 30
    # Approximate week index counted from the start of May (month 5);
    # integer numpy division truncates to whole weeks (Python 2 semantics).
    # NOTE(review): the month offset uses the *current* month's length for
    # all elapsed months, so the numbering is coarse -- confirm intended.
    week_num = (days + days_in_month * (months - 5)) / 7
    detailed_temporal_data = mosquitoes_data[["NumMosquitos", "WnvPresent"]]
    # NOTE(review): assigning into a column-sliced frame relies on pandas
    # chained-assignment behavior (SettingWithCopy) -- verify it sticks.
    detailed_temporal_data["WeekNum"] = pd.Series(week_num, index=mosquitoes_data.index)
    detailed_temporal_data["Year"] = pd.Series(years, index=mosquitoes_data.index)
    temporal_data = detailed_temporal_data.groupby(["Year", "WeekNum"], as_index=False).sum()

    # Extract weather data: same week numbering as above, applied to the
    # weather table's dates.
    weather_dates = np.array([x.split("-") for x in weather_data["Date"]])
    weather_years = np.array(weather_dates[:, 0], dtype=np.int)
    weather_months = np.array(weather_dates[:, 1], dtype=np.int)
    weather_days = np.array(weather_dates[:, 2], dtype=np.int)
    weather_days_in_month = 31 * np.ones(weather_months.shape, dtype=np.int)
    weather_days_in_month[(weather_months == 6) + (weather_months == 9)] = 30
    weather_week_num = (weather_days + weather_days_in_month * (weather_months - 5)) / 7
    weather_data["WeekNum"] = pd.Series(weather_week_num, index=weather_data.index)
    weather_data["Year"] = pd.Series(weather_years, index=weather_data.index)
    relevant_weather_data = weather_data.groupby(["Year", "WeekNum"], as_index=False).sum()
    # NOTE(review): two stations (ids 1 and 2) report each day, so the
    # summed 'Station' column is 3 per day; /3 then *2 turns it into the
    # number of station-records in the week, used to average the summed
    # weather measures below -- confirm both stations always report.
    relevant_weather_data["Station"] /= 3
    relevant_weather_data["Station"] *= 2
    relevant_weather_data[["Tmax", "Tmin", "Depart", "DewPoint", "WetBulb", "PrecipTotal"]] = relevant_weather_data[
        ["Tmax", "Tmin", "Depart", "DewPoint", "WetBulb", "PrecipTotal"]
    ].div(relevant_weather_data["Station"], axis="index")
    # NOTE(review): aligns rows of two independent groupby results by
    # positional index; assumes both contain the same (Year, WeekNum) pairs
    # in the same order -- confirm.
    temporal_data[["Tmax", "Tmin", "Depart", "DewPoint", "WetBulb", "PrecipTotal"]] = relevant_weather_data[
        ["Tmax", "Tmin", "Depart", "DewPoint", "WetBulb", "PrecipTotal"]
    ]
    # print temporal_data
    # print temporal_data[['NumMosquitos', 'PrecipTotal']]

    # Per-year diagnostic plots: weekly mosquito counts (top row) vs weekly
    # precipitation (bottom row).  The 2x4 grid assumes 4 unique years.
    plt.figure()
    for i, y in enumerate(unique_years):
        plt.title("Num of mosquitoes and total percip")
        plt.subplot(2, 4, i + 1)
        year = np.nonzero(np.array(temporal_data["Year"] == y))[0]
        plt.plot(temporal_data["WeekNum"][year], temporal_data["NumMosquitos"][year])
        plt.ylim([0, 12000])
        plt.xlim([0, 25])
        plt.subplot(2, 4, i + 5)
        plt.plot(temporal_data["WeekNum"][year], temporal_data["PrecipTotal"][year])
        plt.ylim([0, 1])
        plt.xlim([0, 25])
    # plt.show()

    # Extract geo data: numeric part of the trap id ('Txxx' -> xxx).
    mosquitoes_data["Trap"] = np.array([x[1:4] for x in mosquitoes_data["Trap"]], dtype=np.float)
    mosquitoes_data["Year"] = pd.Series(years, index=mosquitoes_data.index)
    mosquitoes_yearly_sum = (
        mosquitoes_data[["NumMosquitos", "WnvPresent", "Year"]].groupby(["Year"], as_index=False).sum()
    )
    # Normalize each row's count by its year's total, so the spatial fit is
    # on per-year fractions rather than raw counts.
    sum_column = None
    for i, year in enumerate(unique_years):
        if sum_column is None:
            sum_column = (mosquitoes_data["Year"] == year) * mosquitoes_yearly_sum["NumMosquitos"][i]
        else:
            sum_column += (mosquitoes_data["Year"] == year) * mosquitoes_yearly_sum["NumMosquitos"][i]
    mosquitoes_data["NumMosquitos"] = mosquitoes_data["NumMosquitos"].div(sum_column)
    spatial_data = (
        mosquitoes_data[["Trap", "Longitude", "Latitude", "NumMosquitos", "WnvPresent"]]
        .groupby(["Trap", "Longitude", "Latitude"], as_index=False)
        .sum()
    )

    plt.figure()
    plt.title("Num mosquitoes VS WNV Presence")
    num_mosquitoes = np.array(spatial_data[["NumMosquitos", "WnvPresent"]])
    plt.plot(num_mosquitoes[:, 0])
    # WnvPresent rescaled to [0, 1] so both series share an axis.
    plt.plot(num_mosquitoes[:, 1] / np.max(num_mosquitoes[:, 1]), color="r")
    # plt.show()

    # Fit temporal data: degree-2 polynomial expansion of week index plus
    # weather averages, ridge-regressed against weekly mosquito counts.
    x = np.array(temporal_data[["WeekNum", "Tmax", "Tmin", "Depart", "DewPoint", "WetBulb", "PrecipTotal"]])
    x = PolynomialFeatures(2).fit_transform(x)
    y = np.array(temporal_data["NumMosquitos"])
    # 'mean_absolute_error' is the pre-0.18 scikit-learn scoring name.
    temporal_regressor = RidgeCV(alphas=np.array([0.01, 0.05, 0.1, 0.5, 1, 5, 10]), scoring="mean_absolute_error")
    temporal_regressor.fit(x, y)
    print temporal_regressor.alpha_
    print temporal_regressor.coef_
    print len(temporal_regressor.coef_)
    predictions = temporal_regressor.predict(x)
    # Overlay of observed (blue) vs predicted (red) weekly counts.
    plt.figure()
    plt.plot(y)
    plt.plot(predictions, color="r")

    with open(
        "../output/mosquitoes_count_temporal_regression.pickle", "wb"
    ) as out_file:  # This is the generalized regressor
        pickle.dump(temporal_regressor, out_file)

    # Fit spatial data
    # Init Params

    longitudes = np.array(spatial_data["Longitude"])
    latitudes = np.array(spatial_data["Latitude"])
    x = compress_features(longitudes, latitudes)
    y = np.array(spatial_data["NumMosquitos"])
    # Bounding box of the Chicago area used for random centroid placement.
    longitudes_frame = (-88, -87.5)
    latitudes_frame = (41.6, 42.1)
    num_of_spatial_centroids = 50
    # Gaussian mixture init: zero amplitudes, shared isotropic sigma taken
    # from the variance of the longitude frame endpoints, random means.
    spatial_alpha_vec = np.zeros(num_of_spatial_centroids)  # np.random.uniform(-1, 1, num_of_spatial_centroids)
    spatial_sigma_vec = np.var((-88, -87.5)) * np.ones(num_of_spatial_centroids, dtype=float)
    spatial_mean_vec = np.column_stack(
        (
            np.random.uniform(longitudes_frame[0], longitudes_frame[1], num_of_spatial_centroids),
            np.random.uniform(latitudes_frame[0], latitudes_frame[1], num_of_spatial_centroids),
        )
    )
    l2_regularization = 0.1

    # Pack (alpha, mean, sigma) into one flat theta vector, numerically
    # validate the analytic gradient, then optimize with BFGS.
    theta_init = compress_params(spatial_alpha_vec, spatial_mean_vec, spatial_sigma_vec)
    gradient_check(theta_init, x, y, l2_regularization)
    theta_optimum = fmin_bfgs(
        calculate_cost, theta_init, fprime=calculate_gradient, args=(x, y, l2_regularization), maxiter=5e4
    )
    print theta_optimum
    spatial_alpha_vec, spatial_mean_vec, spatial_sigma_vec = span_params(theta_optimum)
    # NOTE(review): the printed quantity is sqrt(2*cost/n), i.e. an RMS-style
    # error, despite the "absolut[e]" label -- confirm calculate_cost's form.
    print "Mean absolut error: ", (2 * calculate_cost(theta_optimum, x, y, 0) / len(y)) ** 0.5

    with open(
        "../output/mosquitoes_count_spatial_gaussian_regression.pickle", "wb"
    ) as out_file:  # This is the generalized regressor
        pickle.dump(theta_optimum, out_file)

    # Render the fitted spatial distribution as contours over the Chicago
    # OpenStreetMap raster.
    mapdata = np.loadtxt("../input/mapdata_copyright_openstreetmap_contributors.txt")
    aspect = mapdata.shape[0] * 1.0 / mapdata.shape[1]
    lon_lat_box = (-88, -87.5, 41.6, 42.1)
    plt.figure("spatial", figsize=(10, 14))
    plt.imshow(mapdata, cmap=plt.get_cmap("gray"), extent=lon_lat_box, aspect=aspect)

    # levels is unused by the active contour call below (kept from an older
    # variant, see the commented-out CS line).
    levels = [0.2, 0.4, 0.6, 0.8, 1.0]
    lon = np.linspace(-88, -87.5, 1000)
    lat = np.linspace(41.6, 42.1, 1000)
    [A, B] = np.meshgrid(lon, lat)
    # Evaluate the fitted Gaussian mixture on the full grid in one
    # vectorized call, then reshape back to the grid.
    g = calculate_spatial_dist(
        A.reshape(-1), B.reshape(-1), spatial_mean_vec, spatial_sigma_vec, spatial_alpha_vec
    ).reshape(len(lon), len(lat))
    # g = np.zeros((len(lon), len(lat)))
    # for i in xrange(len(lon)):
    #     for j in xrange(len(lat)):
    #         g[i, j] = calculate_spatial_dist(lon[i], lat[j], spatial_mean_vec, spatial_sigma_vec, spatial_alpha_vec)
    # contour the gridded data, plotting dots at the randomly spaced data points.
    # CS = plt.contour(A, B, g, len(levels),linewidths=0.5,colors='k', levels=levels)
    CS = plt.contour(A, B, g, np.linspace(-0.1, 0.4, 1000))
    plt.colorbar()
    plt.show()