def extract_background(params, data):
    """ """
    if params['background_rank'] > 0:
        print_and_flush('Fitting low-rank background...')
        U, s, Vt = svd(M=data.reshape((-1, data.shape[-1])),
                       n_components=params['background_rank'])
        U = np.reshape(U * s[None, :], data.shape[:2] + U.shape[-1:])
        print_and_flush('Removing background from dataset...')
        data = data - np.dot(U, Vt)
        print_and_flush('Background successfully extracted.')
        return data, (U, Vt)
    return data, (None, None)
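
# A minimal usage sketch for extract_background (an assumption, not part of the
# original snippet): `svd` is taken to behave like
# sklearn.utils.extmath.randomized_svd and `print_and_flush` to be a plain
# logging helper.
import numpy as np
from sklearn.utils.extmath import randomized_svd as svd

def print_and_flush(message):
    print(message, flush=True)

if __name__ == '__main__':
    rng = np.random.default_rng(0)
    movie = rng.normal(size=(32, 32, 100))        # height x width x frames
    params = {'background_rank': 2}
    cleaned, (U, Vt) = extract_background(params, movie)
    print(cleaned.shape, U.shape, Vt.shape)       # (32, 32, 100) (32, 32, 2) (2, 100)
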
def SVD_classifier(df, period_of_interest, prediction_year=2012,
                   epidemic_classification_dict=None, training_year_window='ALL',
                   t0_vector=None, p_vector=None, classifier='SVM',
                   modes=[0, 1], verbose=False):
    '''
    Grid-search SVD classifier.

    - t0_vector, p_vector: candidate start dates and period lengths that define
      the (p, t0) grid.
    - period_of_interest: (initial_date, final_date) tuple containing the period
      of interest (poi). The poi defines the starting and finishing dates for the
      SVD classifier itself, e.g. if the poi is 01-02-YYYY through 28-02-YYYY,
      the SVD classifier's heatmap will start on 28-02 of the previous year and
      end on 01-02 of the next year.
    - prediction_year: the year to classify out of sample.
    - epidemic_classification_dict: dictionary of labels per year,
      e.g. {2001: 1, 2002: 0, 2003: 1}.
    '''

    #Generate grid based on p and t0 vectors
    distance_grid = np.zeros([len(p_vector), len(t0_vector)])

    # Sort the year list: set iteration order is arbitrary, and .index() below
    # assumes ascending years.
    years = sorted({timestamp.year for timestamp in df.index})

    years_before_prediction = years.index(prediction_year)

    if training_year_window == 'ALL':
        training_years = years[0:years_before_prediction]
        n_years = years_before_prediction
    elif training_year_window < years_before_prediction:
        training_years = years[years_before_prediction -
                               training_year_window:years_before_prediction]
        n_years = training_year_window
    else:
        print(
            "Can't retrieve training window: {0}. Please make sure the training window is 'ALL' or an int smaller than the number of available years"
            .format(training_year_window))
        return None

    if verbose:
        print('{0} years detected within dataframe: {1}.'.format(
            len(years), years))
        print('{0} Years before prediction: {1}'.format(
            n_years, training_years))

    # check whether any t0 dates fall within the period of interest
    dates_within_poi = []
    for d in t0_vector:
        if '{0}'.format(prediction_year) + d[4:] in df[
                period_of_interest[0]:period_of_interest[1]].index:
            dates_within_poi.append(d)

    if len(dates_within_poi) > 0:
        print(
            '{0} dates from t0_vector are inside period_of_interest range: {1}'
            .format(len(dates_within_poi), dates_within_poi))

    #Enter main loop
    for i, p in enumerate(p_vector):
        for j, t0 in enumerate(t0_vector):

            if verbose: print('Reshaping data')
            X = SVDC_reshape_yearly_data_stolerman(df=df, t0=t0, p=p,
                                                   years=training_years,
                                                   upper_bound=period_of_interest[0],
                                                   normalize=True, verbose=False)

            if verbose: print('Reshaping data done')
            '''
            Each column of X represents one year of data, in the order of
            training_years. To classify year Y we use year Y-1 as the
            out-of-sample input and years Y-2, Y-3, ... as the training set.
            Because every year Y is classified from the previous year's data,
            the epidemic label of year Y is assigned to the column for Y-1.
            '''
            if X is not None:

                X_train = X[:, :-1]
                X_predict = X[:, -1]
                Y_train = []
                for year in training_years[:-1]:  # the +1 shift assigns year Y's label to the Y-1 column
                    Y_train.append(epidemic_classification_dict[year + 1])

                Y_train = np.vstack(Y_train)
                Y_predict = epidemic_classification_dict[prediction_year]

                # Perform svd
                U, sigma, VT = svd(X_train,
                                   n_components=3,
                                   n_iter=15,
                                   random_state=None)
                projections = sigma.reshape([-1, 1]) * VT
                projections = projections.T
                projections = projections[:, modes]
                '''
                Now that we have the SVD projections we can build the classifier.
                '''
                mod = svm.SVC(kernel='rbf',
                              gamma=1,
                              C=1,
                              cache_size=400,
                              max_iter=100000)
                if verbose:
                    print('Fitting with projections shape {0} and target shape {1}'.
                          format(projections.shape, Y_train.shape))
                mod.fit(projections, Y_train.ravel())
                pred = mod.predict(
                    np.matmul(X_predict.reshape([1, -1]), U[:, modes]))

                distance_grid[i, j] = (pred == Y_predict)
            else:
                distance_grid[i, j] = -1

    return distance_grid
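
# Interpretation sketch (hypothetical helper, not from the original code):
# SVD_classifier returns a (len(p_vector), len(t0_vector)) grid holding 1.0
# where the (p, t0) classifier predicted prediction_year correctly, 0.0 where
# it was wrong, and -1.0 where the reshape returned None. One way to summarize
# such a grid:
import numpy as np

def summarize_distance_grid(grid):
    valid = grid >= 0                       # drop cells where reshaping failed
    accuracy = grid[valid].mean() if valid.any() else float('nan')
    return accuracy, int((~valid).sum())

example_grid = np.array([[1., 0., -1.], [1., 1., 0.]])
acc, n_missing = summarize_distance_grid(example_grid)
print('accuracy={0:.2f}, missing cells={1}'.format(acc, n_missing))  # 0.60, 1
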
# Example 3
def SVDC_deploy(df,
                period_of_interest,
                variables=['precip', 'temp'],
                add_runoff_binary=False,
                prediction_year=2012,
                epidemic_classification_dict=None,
                first_training_year=2000,
                t0_vector=None,
                p_vector=None,
                classifier='forest',
                modes=[0, 1],
                decision_map=None,
                decision_coordinates=None,
                decision_values=None,
                clustering=True,
                verbose=False,
                tiebreaker=True):
    '''
    SVDC_deploy performs a decision ensemble over a series of decision groups
    generated by a clustering analysis of the decision map.

    - decision_map, decision_coordinates, decision_values: the accuracy heatmap,
      its (t0, p) axis labels, and the (min, max) accuracy band used to select
      regions of interest before clustering.
    - period_of_interest: (initial_date, final_date) tuple containing the period
      of interest (poi). The poi defines the starting and finishing dates for the
      SVD classifier itself, e.g. if the poi is 01-02-YYYY through 28-02-YYYY,
      the SVD classifier's heatmap will start on 28-02 of the previous year and
      end on 01-02 of the next year.
    - prediction_year: the year to classify out of sample.
    - epidemic_classification_dict: dictionary of labels per year,
      e.g. {2001: 1, 2002: 0, 2003: 1}.
    '''
    print('Starting SVDC with the following parameters:')
    print('Features: {0}'.format(variables))
    print('SVD modes : {0}'.format(modes))
    print('runoff_binary : {0}'.format(add_runoff_binary))
    print('Classifier : {0}'.format(classifier))
    time.sleep(3)
    #Find decision groups
    decision_value_min = decision_values[0]
    decision_value_max = decision_values[1]
    if verbose:
        print('Decision values {0}, {1}'.format(decision_value_min,
                                                decision_value_max))
        print('decision_map min and max values: {0}, {1}.'.format(
            np.min(decision_map), np.max(decision_map)))
        print('MODES:{0}'.format(modes))

    # Turn the n x m decision map (a 2-D numpy array) into a set of
    # (row, column) samples for clustering.
    if verbose:
        print("Plotting decision map. Please verify everything's correct.")
        a = plt.subplot(1, 1, 1)
        a_im = a.matshow(decision_map,
                         cmap=plt.cm.hot,
                         aspect='auto',
                         origin='lower')
        plt.ylabel('Decision Map')
        a.yaxis.set_label_position("right")
        a.xaxis.tick_bottom()
        plt.colorbar(a_im, ax=a)
        a.set_xticks(list(range(len(decision_coordinates[0]))), minor=False)
        a.xaxis.set_major_formatter(IndexFormatter(decision_coordinates[0]))
        a.xaxis.set_major_locator(mticker.MaxNLocator(5))
        plt.xticks(rotation=40)
        a.axes.get_xaxis().set_visible(False)
        #plt.show()
        plt.close()

    decision_map[~((decision_map >= decision_value_min) &
                   (decision_map <= decision_value_max))] = 0
    rows, columns = np.where((decision_map > 0))
    roi = np.vstack([rows, columns]).T

    cluster_weights = []
    cluster_coordinates = []
    cluster_t0 = []
    cluster_p = []
    total_value_sum = 0

    if clustering:

        #We use a clustering algorithm to find the decision clusters
        db = DBSCAN(eps=5, min_samples=40).fit(roi)
        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        labels = db.labels_

        # Find coordinates for decision clusters

        n_clusters = np.max(labels) + 1

        for cluster_number in range(n_clusters):

            cluster_mask = np.equal(labels, cluster_number)
            cluster_indices = [
                i for i, val in enumerate(cluster_mask) if val == 1
            ]

            cluster_sum = 0

            for ind in cluster_indices:  # For each sample within cluster

                #get sample coordinates within decision_map
                p_coordinate = roi[ind, 0]
                t0_coordinate = roi[ind, 1]

                # get t0 and p coordinates
                cluster_t0.append(decision_coordinates[0][t0_coordinate])
                cluster_p.append(decision_coordinates[1][p_coordinate])
                cluster_sum += decision_map[p_coordinate, t0_coordinate]

            total_value_sum += cluster_sum
            cluster_coordinates.append(cluster_indices)
            cluster_weights.append(cluster_sum)
    else:
        #If clustering is set to false, we grab all the regions as one big cluster
        cluster_indices = list(range(len(rows)))
        cluster_sum = 0
        n_clusters = 1
        labels = np.array([1] * len(rows))
        if tiebreaker and len(cluster_indices) % 2 == 0:
            # Drop one sample so the vote count stays odd and ties cannot occur.
            cluster_indices.pop()
        for ind in cluster_indices:  # For each sample within cluster
            #get sample coordinates within decision_map
            p_coordinate = roi[ind, 0]
            t0_coordinate = roi[ind, 1]

            # get t0 and p coordinates
            cluster_t0.append(decision_coordinates[0][t0_coordinate])
            cluster_p.append(decision_coordinates[1][p_coordinate])
            cluster_sum += decision_map[p_coordinate, t0_coordinate]

        total_value_sum += cluster_sum
        cluster_coordinates.append(cluster_indices)
        cluster_weights.append(cluster_sum)

    cluster_weights = np.array(cluster_weights) / total_value_sum  #Normalizing
    all_indices = np.hstack(cluster_coordinates)

    if verbose:
        print('{0} decision clusters were found'.format(np.max(labels) + 1))
        print(
            'Plotting clustered grid, displaying only areas of interest and clusters with different colors'
        )
        clustered_grid = np.zeros_like(decision_map)
        for i, label in enumerate(labels):
            clustered_grid[roi[i, 0], roi[i, 1]] = label + 1
        fig = plt.figure()
        a = plt.subplot(2, 1, 1)
        a.matshow(decision_map, cmap=plt.cm.hot, aspect='auto', origin='lower')
        #a.colorbar()
        plt.title('Original decision_map')
        b = plt.subplot(2, 1, 2)
        b.matshow(clustered_grid,
                  cmap=plt.cm.hot,
                  aspect='auto',
                  origin='lower',
                  vmin=0,
                  vmax=np.max(labels) + 1)
        #a.colorbar()
        plt.title('Clusters with classifying acc {0} to {1}'.format(
            decision_value_min, decision_value_max))
        #plt.show()
        time.sleep(1)
        plt.close()

    if verbose:
        print('Cluster_weights = {0}'.format(cluster_weights))
        print('cluster_coordinates = {0}'.format(cluster_coordinates))
        print('all_indices {0}'.format(all_indices[:]))

    #Generate grid based on p and t0 vectors
    distance_grid = np.zeros_like(decision_map)
    years = sorted({timestamp.year for timestamp in df.index})

    if verbose:
        print(years)

    if prediction_year in years and first_training_year in years:
        training_years = years[years.index(first_training_year):years.
                               index(prediction_year)]
        n_years = len(training_years)
    else:
        print('Either prediction_year or first_training_year is missing from the dataframe years.')
        time.sleep(10)
        return

    if verbose:
        print('{0} years detected within dataframe: {1}.'.format(
            len(years), years))
        print('{0} Years before prediction: {1}'.format(
            n_years, training_years))

    # check if t0 dates are within poi
    dates_within_poi = []
    for d in cluster_t0:
        if '{0}'.format(prediction_year) + d[4:] in df[
                period_of_interest[0]:period_of_interest[1]].index:
            dates_within_poi.append(d)

    if len(dates_within_poi) > 0:
        print(
            '{0} dates from cluster_t0 are inside period_of_interest range: {1}'
            .format(len(dates_within_poi), dates_within_poi))

    #Enter main loop
    print('Initiating heatmap loop.')
    bar = Bar('Processing', max=len(cluster_p))
    for p, t0, ind in zip(cluster_p, cluster_t0, all_indices):
        bar.next()
        i = roi[ind, 0]
        j = roi[ind, 1]

        if verbose: print('Reshaping data')
        X = SVDC_reshape_yearly_data_stolerman(df=df[variables], t0=t0, p=p,
                                               years=training_years,
                                               upper_bound=period_of_interest[0],
                                               normalize=True, verbose=False)

        if verbose: print('Reshaping data done')
        '''
        Each column of X represents one year of data, in the order of
        training_years. To classify year Y we use year Y-1 as the
        out-of-sample input and years Y-2, Y-3, ... as the training set.
        Because every year Y is classified from the previous year's data,
        the epidemic label of year Y is assigned to the column for Y-1.
        '''
        if X is not None:

            X_train = X[:, :-1]
            X_predict = X[:, -1]
            Y_train = []
            for year in training_years[:-1]:  # the +1 shift assigns year Y's label to the Y-1 column
                Y_train.append(epidemic_classification_dict[year + 1])

            Y_train = np.vstack(Y_train)

            # Perform svd (the original called svd twice with identical
            # arguments; once is enough)
            U, sigma, VT = svd(X_train,
                               n_components=3,
                               n_iter=15,
                               random_state=None)
            projections = sigma.reshape([-1, 1]) * VT
            projections = projections.T

            projections = np.vstack([
                projections[:, modes],
                np.matmul(X_predict.reshape([1, -1]), U[:, modes])
            ])

            if add_runoff_binary:
                # This function returns the delta value stated in Stolerman's paper
                average_runoff = SVDC_get_runoffbinary(df=df, t0=t0, p=p,
                                                       years=training_years,
                                                       upper_bound=period_of_interest[0],
                                                       normalize=True, verbose=False)
                classifier_dataset = np.hstack([projections, average_runoff])
            else:
                classifier_dataset = projections

            classifier_dataset_train = classifier_dataset[:-1, :]
            classifier_dataset_predict = classifier_dataset[-1, :]
            '''
            Now that we have the SVD projections we can build the classifier.
            '''
            if classifier.lower() == 'svm':
                mod = svm.SVC(kernel='rbf',
                              gamma=1,
                              C=1,
                              cache_size=400,
                              max_iter=100000)

            elif classifier.lower() == 'forest':
                mod = RandomForestClassifier(n_estimators=10,
                                             max_depth=2,
                                             random_state=0)
            else:
                raise ValueError('Unknown classifier: {0}'.format(classifier))

            if verbose:
                print('Fitting with projections shape {0}'.format(
                    projections.shape))
                print(Y_train, training_years)

            mod.fit(classifier_dataset_train, Y_train.ravel())
            pred = mod.predict(classifier_dataset_predict.reshape(1, -1))
            distance_grid[i, j] = pred

    bar.finish()
    cluster_decisions = []
    for cluster_number in range(n_clusters):
        accumulated_decision = 0
        indices = cluster_coordinates[cluster_number]

        for p_coordinate, t0_coordinate in roi[indices, :]:
            accumulated_decision += distance_grid[
                p_coordinate, t0_coordinate] * decision_map[
                    p_coordinate,
                    t0_coordinate]  #Decision weighted by classifier accuracy

        if accumulated_decision > 0:
            cluster_decisions.append(1)
        elif accumulated_decision < 0:
            cluster_decisions.append(-1)
        else:
            cluster_decisions.append(0)

    cluster_decisions = np.array(cluster_decisions)
    final_decision = np.sum(cluster_decisions * cluster_weights)

    if verbose:
        fig, axarr = plt.subplots(3, 1, figsize=[4.5, 10])

        #axarr[0]= plt.subplot(3,1,1)
        a_im = axarr[0].matshow(decision_map,
                                cmap=plt.cm.hot,
                                aspect='auto',
                                origin='lower',
                                vmin=0,
                                vmax=1)
        axarr[0].axes.get_xaxis().set_visible(False)
        plt.colorbar(a_im, ax=axarr[0])
        axarr[0].set_title(
            'Decision map (Year to predict = {2}) \n (classifying acc {0} to {1})'
            .format(decision_value_min, decision_value_max, prediction_year))
        axarr[0].yaxis.set_label_position("right")
        #b=plt.subplot(3,1,2)
        b_im = axarr[1].matshow(clustered_grid,
                                cmap=plt.cm.tab20c,
                                aspect='auto',
                                origin='lower',
                                vmin=0,
                                vmax=np.max(labels) + 1)
        plt.colorbar(b_im, ax=axarr[1])
        axarr[1].axes.get_xaxis().set_visible(False)
        axarr[1].yaxis.set_label_position("right")
        axarr[1].set_title('Clusters (N={0})'.format(n_clusters))
        #c=plt.subplot(3,1,3)
        c_im = axarr[2].matshow(distance_grid,
                                cmap=plt.cm.hot,
                                aspect='auto',
                                origin='lower',
                                vmin=-1,
                                vmax=1)
        plt.colorbar(c_im, ax=axarr[2])
        axarr[2].set_title(
            'Cluster decisions (final={0})'.format(final_decision))
        axarr[2].yaxis.set_label_position("right")
        axarr[2].set_xticks(list(range(len(decision_coordinates[0]))),
                            minor=False)
        axarr[2].xaxis.set_major_formatter(
            IndexFormatter(decision_coordinates[0]))
        axarr[2].xaxis.set_major_locator(mticker.MaxNLocator(8))
        axarr[2].xaxis.tick_bottom()
        plt.xticks(rotation=40)
        plt.close()
    else:
        axarr = None
        fig = None

    # Count votes; the original summed the -1/+1 cell values, which made
    # votes_against negative instead of a count.
    votes_against = np.sum(distance_grid == -1)
    votes_favor = np.sum(distance_grid == 1)
    total_votes = len(cluster_p)

    if verbose:
        print(
            'Decision ensemble finished with the following votes for each cluster (1 in favor, -1 against) \n'
        )
        for c in range(n_clusters):
            print('Cluster {0} = {1}'.format(c + 1, cluster_decisions[c]))
        print(
            'Per-classifier decision distribution:\n'
            '{0} in favor ({1:.1%}).\n{2} against ({3:.1%}).\nTotal votes: {4}'.
            format(votes_favor, votes_favor / total_votes, votes_against,
                   votes_against / total_votes, total_votes))

    return final_decision, cluster_decisions, cluster_weights, distance_grid, fig, votes_favor, total_votes
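
# The ensemble decision above reduces to a weighted vote: each cluster casts
# +1/-1/0 and the weights are the clusters' accumulated decision-map mass,
# normalized to sum to 1. A standalone sketch of that arithmetic (illustrative
# numbers only):
import numpy as np

cluster_decisions = np.array([1, -1, 1])      # per-cluster votes
cluster_weights = np.array([0.5, 0.3, 0.2])   # normalized decision-map mass
final_decision = np.sum(cluster_decisions * cluster_weights)
print(final_decision)                          # 0.4 -> leans "in favor"
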
# Example 4
def SVDC_cross_validation(df, period_of_interest, first_year_insample=2001,
                          last_year_insample=2006, prediction_year=2001,
                          epidemic_classification_dict=None, training_year_window='ALL',
                          t0_vector=None, p_vector=None, classifier='SVM',
                          modes=[0], add_peaks=False, add_runoff_binary=False,
                          verbose=True, variables=['precip', 'temp']):
    '''
    In-sample cross-validation version of the SVD classifier.

    - t0_vector, p_vector: candidate start dates and period lengths that define
      the (p, t0) grid.
    - period_of_interest: (initial_date, final_date) tuple containing the period
      of interest (poi). The poi defines the starting and finishing dates for the
      SVD classifier itself, e.g. if the poi is 01-02-YYYY through 28-02-YYYY,
      the SVD classifier's heatmap will start on 28-02 of the previous year and
      end on 01-02 of the next year.
    - prediction_year: the in-sample year to hold out and classify.
    - epidemic_classification_dict: dictionary of labels per year,
      e.g. {2001: 1, 2002: 0, 2003: 1}.

    v2:
    Version two of the heatmap generators uses 3 modes rather than 2 and can
    incorporate the average number of peaks as extra dimensions prior to the
    classifier phase.
    '''

    #Generate grid based on p and t0 vectors
    distance_grid = np.zeros([len(p_vector), len(t0_vector)])

    years = sorted({timestamp.year for timestamp in df.index})

    years_before_prediction = years.index(last_year_insample)
    if verbose:
        print('Years before prediction')
        print(years_before_prediction)
        time.sleep(10)

    if training_year_window == 'ALL':
        training_years = years[0:years_before_prediction]
        n_years = years_before_prediction
    elif training_year_window < years_before_prediction:
        training_years = years[years_before_prediction -
                               training_year_window:years_before_prediction]
        n_years = training_year_window
    else:
        print(
            "Can't retrieve training window: {0}. Please make sure the training window is 'ALL' or an int smaller than the number of available years"
            .format(training_year_window))
        return None

    if verbose:
        print('{0} years detected within dataframe: {1}.'.format(
            len(years), years))
        print('{0} Years before prediction: {1}'.format(
            n_years, training_years))

    # check whether any t0 dates fall within the period of interest
    dates_within_poi = []
    for d in t0_vector:
        if '{0}'.format(last_year_insample) + d[4:] in df[
                period_of_interest[0]:period_of_interest[1]].index:
            dates_within_poi.append(d)

    if len(dates_within_poi) > 0:
        print(
            '{0} dates from t0_vector are inside period_of_interest range: {1}'
            .format(len(dates_within_poi), dates_within_poi))

    #Enter main loop
    print('Initiating heatmap loop.')
    bar = Bar('Processing', max=len(p_vector))
    for i, p in enumerate(p_vector):
        bar.next()
        for j, t0 in enumerate(t0_vector):

            if verbose:
                print('Reshaping data with upper bound : {0}'.format(
                    period_of_interest[0]))
                time.sleep(10)
            X = SVDC_reshape_yearly_data_stolerman(df=df[variables], t0=t0, p=p,
                                                   years=training_years,
                                                   upper_bound=period_of_interest[0],
                                                   normalize=True, verbose=False)

            if verbose: print('Reshaping data done')
            '''
            Each column of X represents one year of data, in ascending year
            order. In this in-sample run each column is labeled with its own
            year's classification: the held-out prediction_year column is the
            out-of-sample input and the remaining in-sample columns form the
            training set (see the column bookkeeping below).
            '''
            if X is not None:

                # In the in-sample run, X_predict corresponds to the column of
                # the matrix that contains the data from prediction_year.
                # Columns are ordered ascending (2000, 2001, 2002, ...). We
                # assume the prediction year lies within the in-sample window
                # (2001 - 2001 gives column 0, 2002 - 2001 gives column 1,
                # etc.) and that there are no missing years.
                X_predict_column = prediction_year - first_year_insample
                X_train_columns = list(range(n_years))
                X_train_columns.remove(X_predict_column)
                X_train_columns = np.array(X_train_columns)

                X_train = X[:, X_train_columns]
                X_predict = X[:, X_predict_column]
                Y_train = []
                in_sample_years = list(
                    range(first_year_insample, last_year_insample + 1))
                in_sample_years.remove(prediction_year)

                for year in in_sample_years:
                    Y_train.append(epidemic_classification_dict[year])

                Y_train = np.vstack(Y_train)
                Y_predict = epidemic_classification_dict[prediction_year]

                if verbose:
                    print('Data: {0}'.format(X))
                    print('Shape: {0}'.format(X.shape))
                    print('prediction_year:{0}, column:{1}'.format(
                        prediction_year, X_predict_column))
                    print('Training years:{0}, columns:{1}'.format(
                        in_sample_years, X_train_columns))
                    print('Train_x : {0}, predict_x : {1}'.format(
                        X_train, X_predict))
                    print('DICT: {0}'.format(epidemic_classification_dict))
                    print('Y_predict : {0}, prediction_year:{1}'.format(
                        Y_predict, prediction_year))
                    print('Y_train : {0}, in_sample_years:{1}'.format(
                        Y_train, in_sample_years))
                    time.sleep(100)
                # Perform svd
                U, sigma, VT = svd(X_train,
                                   n_components=3,
                                   n_iter=15,
                                   random_state=None)
                projections = sigma.reshape([-1, 1]) * VT
                projections = projections.T
                projections = np.vstack([
                    projections[:, modes],
                    np.matmul(X_predict.reshape([1, -1]), U[:, modes])
                ])
                '''
                Merge the SVD projections with the average peak frequencies for
                each year; they must have the same length.
                '''

                # Build the feature matrix incrementally: the original's
                # if/else pairs silently discarded the peak features whenever
                # add_runoff_binary was False, and overwrote them when both
                # flags were set.
                classifier_dataset = projections

                if add_peaks:
                    # This function returns the delta value stated in Stolerman's paper
                    average_peak_frequencies = SVDC_get_apfs(df=df, t0=t0, p=p,
                                                             years=training_years,
                                                             upper_bound=period_of_interest[0],
                                                             normalize=True, verbose=False)
                    classifier_dataset = np.hstack(
                        [classifier_dataset, average_peak_frequencies])

                if add_runoff_binary:
                    # This function returns the delta value stated in Stolerman's paper
                    average_runoff = SVDC_get_runoffbinary(df=df, t0=t0, p=p,
                                                           years=training_years,
                                                           upper_bound=period_of_interest[0],
                                                           normalize=True, verbose=False)
                    classifier_dataset = np.hstack(
                        [classifier_dataset, average_runoff])

                classifier_dataset_train = classifier_dataset[:-1, :]
                classifier_dataset_predict = classifier_dataset[-1, :]

                if verbose:
                    print(classifier_dataset_train, classifier_dataset_predict)
                # Case-insensitive check: the default argument is 'SVM', which
                # the original lowercase comparison never matched, leaving
                # `mod` undefined.
                if classifier.lower() == 'svm':
                    mod = svm.SVC(kernel='rbf',
                                  gamma=1,
                                  C=1,
                                  cache_size=400,
                                  max_iter=100000)
                elif classifier.lower() == 'forest':
                    mod = RandomForestClassifier(n_estimators=10,
                                                 max_depth=2,
                                                 random_state=0)
                else:
                    raise ValueError('Unknown classifier: {0}'.format(classifier))
                if verbose:
                    print('Fitting with projections shape {0} and target shape {1}'.
                          format(classifier_dataset_train.shape, Y_train.shape))

                mod.fit(classifier_dataset_train, Y_train.ravel())
                pred = mod.predict(classifier_dataset_predict.reshape(1, -1))
                distance_grid[i, j] = (pred == Y_predict)
            else:
                distance_grid[i, j] = -1
    bar.finish()
    return distance_grid
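
# A hypothetical leave-one-year-out driver for SVDC_cross_validation (not part
# of the original snippet): hold out each in-sample year in turn and average
# the resulting correctness grids into a single (p, t0) accuracy heatmap.
# df, period_of_interest, t0_vector, p_vector and the classification dict are
# assumed to be prepared as in the examples above.
def loyo_accuracy(df, period_of_interest, epidemic_classification_dict,
                  t0_vector, p_vector, first_year=2001, last_year=2006):
    grids = []
    for year in range(first_year, last_year + 1):
        grid = SVDC_cross_validation(df, period_of_interest,
                                     first_year_insample=first_year,
                                     last_year_insample=last_year,
                                     prediction_year=year,
                                     epidemic_classification_dict=epidemic_classification_dict,
                                     t0_vector=t0_vector, p_vector=p_vector,
                                     verbose=False)
        grids.append(grid)
    return np.mean(np.stack(grids), axis=0)    # mean correctness per (p, t0) cell
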
# Example 5
year_epidemic_classification = year_epidemic_classification[0:n_years]
modes = [0, 1]  #starts from zero

#Enter main loop
for i, p in enumerate(p_vector):
    for j, t0 in enumerate(t0_vector):
        print(p, t0)

        X = SVDC_reshape_yearly_data_stolerman(df,
                                               t0=t0,
                                               p=p,
                                               years=years,
                                               normalize=True)

        # Perform svd
        U, sigma, VT = svd(X, n_components=3, n_iter=15, random_state=None)
        projections = sigma.reshape([-1, 1]) * VT
        projections = projections.T

        projections = projections[:, modes]

        hull_1, hull_2, coordinates_1, coordinates_2 = get_hulls(
            projections, year_epidemic_classification)

        intersect_boolean, vertices_1, vertices_2 = intersect_hulls(
            hull_1, hull_2, coordinates_1, coordinates_2)

        #if not intersect_boolean:
        #    plot_hulls(projections, hull_1, hull_2, coordinates_1, coordinates_2, year_epidemic_classification, class_color)
        #    plot_polygon(projections, hull_1, hull_2, coordinates_1, coordinates_2, year_epidemic_classification, class_color)
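
# get_hulls / intersect_hulls are not shown in this snippet. A minimal sketch
# of the underlying separability idea, assuming 2-D projections and using
# scipy.spatial. Note this is a partial test: it only checks whether a vertex
# of one class's hull falls inside the other's, so edge-crossing overlaps with
# no contained vertices would be missed.
import numpy as np
from scipy.spatial import ConvexHull, Delaunay

def hulls_overlap(points_1, points_2):
    vertices_1 = points_1[ConvexHull(points_1).vertices]
    vertices_2 = points_2[ConvexHull(points_2).vertices]
    one_in_two = Delaunay(vertices_2).find_simplex(vertices_1) >= 0
    two_in_one = Delaunay(vertices_1).find_simplex(vertices_2) >= 0
    return bool(one_in_two.any() or two_in_one.any())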