Example 1
                                         axis=1)

            fulldata = fulldata[period_start_index:period_end_index, :]
            fulldata_mask = fulldata_mask[
                period_start_index:period_end_index, :]

            print(fulldata.shape)

            print("Fulldata shape = %s" % str(fulldata.shape))
            print("Fulldata masked shape = %s" % str(fulldata_mask.shape))
            print("Unmasked samples %d" % (fulldata_mask[:, 0] == False).sum())

            print("Aggregating data to time_bin_length=%s" % time_bin_length)

            ## Time bin data
            fulldata = pp.time_bin_with_mask(
                fulldata, time_bin_length=time_bin_length)[0]
            fulldata_mask = pp.time_bin_with_mask(
                fulldata_mask, time_bin_length=time_bin_length)[0] > 0.
            print("Fulldata after binning shape = %s" % str(fulldata.shape))
            print("Fulldata after binning masked shape = %s" %
                  str(fulldata_mask.shape))

            # # Only use selected indices
            # selected_comps_indices=[]
            # for i in selected_components:
            #     selected_comps_indices.append(int(comps_order_file['comps'][i]))
            #
            # fulldata = fulldata[:, selected_comps_indices]
            # fulldata_mask = fulldata_mask[:, selected_comps_indices]

            dataframe = pp.DataFrame(fulldata, mask=fulldata_mask)
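
For reference, a minimal self-contained sketch of the same binning-and-masking pattern on synthetic data (the array shapes, the masked period and time_bin_length are made up for illustration): data and boolean mask are binned with pp.time_bin_with_mask, the binned mask is re-binarized, and both are wrapped in a tigramite DataFrame, mirroring the calls above.

import numpy as np
from tigramite import data_processing as pp

T, N = 300, 4                  # arbitrary number of samples and variables
time_bin_length = 3

fulldata = np.random.randn(T, N)
fulldata_mask = np.zeros((T, N), dtype='bool')
fulldata_mask[:50, :] = True   # pretend the first 50 samples are masked out

# binning the boolean mask yields bin-wise fractions, so "> 0." marks every
# bin that touches at least one masked sample
fulldata = pp.time_bin_with_mask(fulldata, time_bin_length=time_bin_length)[0]
fulldata_mask = pp.time_bin_with_mask(
    fulldata_mask, time_bin_length=time_bin_length)[0] > 0.

dataframe = pp.DataFrame(fulldata, mask=fulldata_mask)
print(fulldata.shape, fulldata_mask.shape)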
Example 2
def get_varimax_loadings(
    geo_object,
    month_mask=None,
    truncate_by='max_comps',
    max_comps=60,
    fraction_explained_variance=0.9,
    verbosity=0,
):

    if verbosity > 0:
        print("Get Varimax components")
        print("\tGet SVD")
    data = geo_object.data()
    print(data.shape, ' data shape after loading')
    if month_mask is not None:
        if verbosity > 0:
            print(
                "\tCompute covariance only from months %s" % month_mask +
                "\n\t(NOTE: Time series will be all months and mask can be retrieved from dict)"
            )

        masked_data, time_mask = geo_object.return_masked_months(month_mask)
    else:
        masked_data = data
        time_mask = np.zeros(data.shape[0], dtype='bool')
    print(masked_data.shape, ' masked data shape before reshape')
    data = np.reshape(data, (data.shape[0], np.prod(
        data.shape[1:])))  # flattening field of daily data
    masked_data = np.reshape(
        masked_data, (masked_data.shape[0], np.prod(
            data.shape[1:])))  # flattening field of daily data
    print(masked_data.shape, data.shape, 'masked and data after reshape')
    # define the time average of daily data; time_bin_length
    masked_data, Tbin = pp.time_bin_with_mask(masked_data, time_bin_length=3)
    print(masked_data.shape, ' masked data shape after binning')
    # Get truncated SVD
    V, U, S, ts_svd, eig, explained, max_comps = pca_svd(
        masked_data,
        truncate_by=truncate_by,
        max_comps=max_comps,
        fraction_explained_variance=fraction_explained_variance,
        verbosity=verbosity)
    # if verbosity > 0:
    #     print("Explained variance at max_comps = %d: %.5f" % (max_comps, explained))

    if verbosity > 0:
        if truncate_by == 'max_comps':

            print(
                "\tUser-selected number of components: %d\n"
                "\tExplaining %.2f of variance" % (max_comps, explained))

        elif truncate_by == 'fraction_explained_variance':

            print(
                "\tUser-selected explained variance: %.2f of total variance\n"
                "\tResulting in %d components" % (explained, max_comps))

    if verbosity > 0:
        print("\tVarimax rotation")
    # Rotate
    Vr, Rot = varimax(V, verbosity=verbosity)
    # Vr = V
    # Rot = np.diag(np.ones(V.shape[1]))
    # print Vr.shape
    Vr = svd_flip(Vr)

    if verbosity > 0:
        print("\tFurther metrics")
    # Get explained variance of rotated components
    s2 = np.diag(S)**2 / (masked_data.shape[0] - 1.)

    # matrix with diagonal containing variances of rotated components
    S2r = np.dot(np.dot(np.transpose(Rot), np.diag(s2)), Rot)
    expvar = np.diag(S2r)

    sorted_expvar = np.sort(expvar)[::-1]
    # s_orig = ((Vt.shape[1] - 1) * s2) ** 0.5

    # reorder all elements according to explained variance (descending)
    nord = np.argsort(expvar)[::-1]
    Vr = Vr[:, nord]

    # Get time series of UNMASKED data
    comps_ts = data.dot(Vr)

    comps_ts_masked = masked_data.dot(Vr)

    # Get location of absmax
    comp_loc = {'x': np.zeros(max_comps), 'y': np.zeros(max_comps)}
    for i in range(max_comps):
        coords = np.unravel_index(
            np.abs(Vr[:, i]).argmax(),
            (len(geo_object.lats), len(geo_object.lons)))
        comp_loc['x'][i] = geo_object.lons[coords[1]]
        comp_loc['y'][i] = geo_object.lats[coords[0]]

    total_var = np.sum(np.var(masked_data, axis=0))

    # print time_mask
    # print expvar
    # start_end = (str(date.fromordinal(int(geo_object.tm[0]))),
    #                   str(date.fromordinal(int(geo_object.tm[-1]))))
    start_end = (str(geo_object.start_date), str(geo_object.end_date))

    # print start_end_year

    return {
        'weights': np.copy(Vr),
        'ts_unmasked': comps_ts,
        'ts_masked': comps_ts_masked,
        'explained_var': sorted_expvar,
        'unrotated_weights': V,
        'explained': explained,
        'pca_eigs': eig,
        'truncate_by': truncate_by,
        'max_comps': max_comps,
        'fraction_explained_variance': fraction_explained_variance,
        'total_var': total_var,
        'month_mask': month_mask,
        'comps_max_loc': comp_loc,
        'time_mask': time_mask,
        'start_end': start_end,
        'time': geo_object.tm,
        'lats': geo_object.lats,
        'lons': geo_object.lons,
    }
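
As a side note, the explained-variance bookkeeping used above can be run in isolation on synthetic inputs. The sketch below uses made-up singular values S, an arbitrary orthogonal rotation Rot and random rotated weights Vr (not the outputs of the pca_svd/varimax helpers); it only illustrates how the rotated-component variances are obtained and how the components are reordered by them.

import numpy as np

n_samples, n_comps = 500, 5

# fake inputs: descending singular values, an orthogonal rotation, random weights
S = np.diag(np.sort(np.random.rand(n_comps))[::-1] * 10.)
Rot = np.linalg.qr(np.random.randn(n_comps, n_comps))[0]
Vr = np.random.randn(200, n_comps)

# variances of the unrotated components from the singular values
s2 = np.diag(S) ** 2 / (n_samples - 1.)

# diagonal of Rot^T diag(s2) Rot holds the variances of the rotated components
S2r = np.dot(np.dot(np.transpose(Rot), np.diag(s2)), Rot)
expvar = np.diag(S2r)

# reorder weights and variances by descending explained variance
nord = np.argsort(expvar)[::-1]
Vr = Vr[:, nord]
sorted_expvar = expvar[nord]
print(sorted_expvar)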
Example 3
def get_results_from_weights(geo_object, weights_filename, verbosity=0):

    weights_dict_results = pickle.load(open(weights_filename, 'rb'))['results']
    results = weights_dict_results.copy()
    weights = results['weights']
    month_mask = results['month_mask']
    max_comps = results['max_comps']
    truncate_by = results['truncate_by']
    fraction_explained_variance = results['fraction_explained_variance']

    if verbosity > 0:
        print("Get Varimax components from %s" % weights_filename)
    data = geo_object.data()
    print(data.shape, ' data shape after loading')
    if month_mask is not None:
        if verbosity > 0:
            print(
                "\tCompute covariance only from months %s" % month_mask +
                "\n\t(NOTE: Time series will be all months and mask can be retrieved from dict)"
            )

        masked_data, time_mask = geo_object.return_masked_months(month_mask)
    else:
        masked_data = data
        time_mask = np.zeros(data.shape[0], dtype='bool')
    print(masked_data.shape, ' masked data shape before reshape')
    data = np.reshape(data, (data.shape[0], np.prod(
        data.shape[1:])))  # flattening field of daily data
    masked_data = np.reshape(
        masked_data, (masked_data.shape[0], np.prod(
            data.shape[1:])))  # flattening field of daily data
    print(masked_data.shape, data.shape, 'masked and data after reshape')
    masked_data, Tbin = pp.time_bin_with_mask(masked_data, time_bin_length=3)
    print(masked_data.shape, ' masked data shape after binning')
    # # Get truncated SVD
    # V, U, S, ts_svd, eig, explained, max_comps = pca_svd(masked_data, truncate_by=truncate_by, max_comps=max_comps,
    #                                fraction_explained_variance=fraction_explained_variance,
    #                                 verbosity=verbosity)
    #     # if verbosity > 0:
    #     print("Explained variance at max_comps = %d: %.5f" % (max_comps, explained))

    # if verbosity > 0:
    #     if truncate_by == 'max_comps':

    #         print("\tUser-selected number of components: %d\n"
    #               "\tExplaining %.2f of variance" %(max_comps, explained))

    #     elif truncate_by == 'fraction_explained_variance':

    #         print("\tUser-selected explained variance: %.2f of total variance\n"
    #               "\tResulting in %d components" %(explained, max_comps))

    # if verbosity > 0:
    #     print("\tVarimax rotation")
    # # Rotate
    # Vr, Rot = varimax(V, verbosity=verbosity)
    # # Vr = V
    # # Rot = np.diag(np.ones(V.shape[1]))
    # # print Vr.shape
    # Vr = svd_flip(Vr)

    # if verbosity > 0:
    #     print("\tFurther metrics")
    # # Get explained variance of rotated components
    # s2 = np.diag(S)**2 / (masked_data.shape[0] - 1.)

    # # matrix with diagonal containing variances of rotated components
    # S2r = np.dot(np.dot(np.transpose(Rot), np.matrix(np.diag(s2))), Rot)
    # expvar = np.diag(S2r)

    # sorted_expvar = np.sort(expvar)[::-1]
    # # s_orig = ((Vt.shape[1] - 1) * s2) ** 0.5

    # # reorder all elements according to explained variance (descending)
    # nord = np.argsort(expvar)[::-1]
    # Vr = Vr[:, nord]

    if verbosity > 0:
        print("\tCompute components using weights ")
    # Get time series of UNMASKED data
    comps_ts = data.dot(weights)

    # print comps_ts[:2]
    # print weights_dict_results['ts_unmasked'][:2]
    # assert np.allclose(comps_ts, weights_dict_results['ts_unmasked'])

    comps_ts_masked = masked_data.dot(weights)

    # Get location of absmax
    comp_loc = {'x': np.zeros(max_comps), 'y': np.zeros(max_comps)}
    for i in range(max_comps):
        coords = np.unravel_index(
            np.abs(weights[:, i]).argmax(),
            (len(geo_object.lats), len(geo_object.lons)))
        comp_loc['x'][i] = geo_object.lons[coords[1]]
        comp_loc['y'][i] = geo_object.lats[coords[0]]

    total_var = np.sum(np.var(masked_data, axis=0))

    if verbosity > 0:
        print("\ttotal_var = ", total_var)
        print("\tSetting start_end from data_parameters")
    # print time_mask
    # print expvar
    # start_end = (str(date.fromordinal(int(geo_object.tm[0]))),
    #                   str(date.fromordinal(int(geo_object.tm[-1]))))
    start_end = (str(geo_object.start_date), str(geo_object.end_date))

    # Overwrite entries and delete those not needed anyway

    return {
        'weights': weights,
        'ts_unmasked': comps_ts,
        'ts_masked': comps_ts_masked,
        # 'explained_var':sorted_expvar,
        # 'unrotated_weights':V,
        # 'explained': explained,
        # 'pca_eigs':eig,
        'truncate_by': truncate_by,
        'max_comps': max_comps,
        'fraction_explained_variance': fraction_explained_variance,
        'total_var': total_var,
        'month_mask': month_mask,
        'comps_max_loc': comp_loc,
        'time_mask': time_mask,
        'start_end': start_end,
        'time': geo_object.tm,
        'lats': geo_object.lats,
        'lons': geo_object.lons,
    }
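
Finally, a small self-contained sketch of the "location of absmax" step shared by both functions above. The grid sizes and the weights are synthetic placeholders; the point is only to show how the flat argmax of each weight column is mapped back onto the (lat, lon) grid.

import numpy as np

# hypothetical regular grid (sizes chosen arbitrarily for illustration)
lats = np.linspace(-88.75, 88.75, 72)
lons = np.linspace(0., 357.5, 144)
max_comps = 3

# fake flattened weight matrix: one column per component
weights = np.random.randn(len(lats) * len(lons), max_comps)

comp_loc = {'x': np.zeros(max_comps), 'y': np.zeros(max_comps)}
for i in range(max_comps):
    # map the flat index of the largest absolute loading back to
    # (lat_index, lon_index); recent NumPy names the second argument
    # `shape` (the old `dims=` keyword has been removed)
    coords = np.unravel_index(np.abs(weights[:, i]).argmax(),
                              (len(lats), len(lons)))
    comp_loc['x'][i] = lons[coords[1]]
    comp_loc['y'][i] = lats[coords[0]]

print(comp_loc)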