Example #1
def spacetime2(reg, sReg, df, ko, omega_age_smooth, lambda_time_smooth,
               lambda_time_smooth_nodata, zeta_space_smooth, zeta_space_smooth_nodata):
    """
    Compute the spacetime weight matrix for a super region. Full data set tells
    which values need weights, train data set are the residuals which need
    weighting.
    """
    full_sub = df[(df.region == reg)]
    train_sub = df[(df.super_region == sReg) & (ko)]
    year_start = np.min(df.year)
    year_end = np.max(df.year)
    Wat = timeW(full_sub, train_sub, omega_age_smooth, lambda_time_smooth,
        lambda_time_smooth_nodata, year_start, year_end).astype("float32")
    NR, SN, C, R, SR = matCRS(full_sub, train_sub)
    xi_mat = calculate_xi_matrix(full_sub, train_sub, zeta_space_smooth,
                                 zeta_space_smooth_nodata).astype("float32")
    NR = weight_matrix(NR, xi_mat[:,0], Wat).astype("float32")
    SN = weight_matrix(SN, xi_mat[:,1], Wat).astype("float32")
    C = weight_matrix(C, xi_mat[:,2], Wat).astype("float32")
    R = weight_matrix(R, xi_mat[:,3], Wat).astype("float32")
    SR = weight_matrix(SR, xi_mat[:,4], Wat).astype("float32")
    final = EV("NR + SN + C + R + SR").astype("float32")
    del NR, SN, C, R, SR
    account_missing = final.sum(0)
    account_missing[account_missing == .0] = 1.
    return EV("final / account_missing").astype("float32")
Example #2
def timeW(full_sub, train_sub, omega_age_smooth, lambda_time_smooth,
          lambda_time_smooth_nodata):
    '''
    Computes the time-age weights for a super region given a full and a
    training data set. Returns a matrix with as many rows as the training data
    set and as many columns as the full data set, each subset to the super
    region. Each cell holds the age-by-time weight of an observation
    (the column) for a residual (the row).
    '''
    ageS = makeS(full_sub, train_sub,
                 "ageC").astype("float32")  # make stride of age values
    yearS = makeS(full_sub, train_sub,
                  "year").astype("float32")  # stride of year values
    l = calculate_lamda_array(full_sub, train_sub, lambda_time_smooth,
                              lambda_time_smooth_nodata).astype(
                                  "float32")  # assign lambda
    i1 = full_sub.ageC.values.astype("float32")  # age vector
    i2 = full_sub.year.values.astype("float32")  # year vector
    start = full_sub.year.min()
    end = full_sub.year.max()
    aMax = np.maximum(EV("abs(i2-start)"),
                      EV("abs(end-i2)")).astype("float32")  # per-observation max distance to either end of the series
    return EV(
        "(1/exp(omega_age_smooth*abs(ageS-i1))) * (1 - (abs(yearS-i2)/(aMax+1))**l)**3"
    )
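Written out, each cell combines an exponential age-distance weight with a tricubic time-distance weight: exp(-omega_age_smooth * |age_r - age_o|) * (1 - (|year_r - year_o| / (aMax_o + 1))**lambda)**3, where aMax_o is the larger of the observation's distances to the first and last year. A minimal sketch of one cell in plain numpy (a hypothetical helper, not part of the original module; EV above is assumed to be a numexpr-style evaluate):

import numpy as np

def time_age_weight_cell(res_age, res_year, obs_age, obs_year,
                         omega_age_smooth, lam, year_start, year_end):
    # Larger distance from the observation's year to either end of the series.
    a_max = max(abs(obs_year - year_start), abs(year_end - obs_year))
    age_w = 1.0 / np.exp(omega_age_smooth * abs(res_age - obs_age))
    time_w = (1.0 - (abs(res_year - obs_year) / (a_max + 1.0)) ** lam) ** 3
    return age_w * time_w

# Residual at age 30, year 2000 weighting an observation at age 35, year 2005.
print(time_age_weight_cell(30, 2000, 35, 2005, omega_age_smooth=1.0,
                           lam=0.5, year_start=1980, year_end=2016))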
Example #3
def matCRS(full_sub, train_sub):
    """
    For a designated super region returns 3 matrices where each column is an
    observation from the full data frame and each row is a residual from
    the training data set. The three matrices have either values of 1 or 0
    and designate whether the residual and the observation are in the same
    country(C), same region but not same country(R), and same super_region but
    not the same region or country(S).
    """
    sub_nat_S = makeS(full_sub, train_sub, "location_id")
    country_S = makeS(full_sub, train_sub, "country_id")
    region_S = makeS(full_sub, train_sub, "region")
    sub_nat_V = full_sub.location_id.values
    country_V = full_sub.country_id.values
    region_V = full_sub.region.values
    has_sub_nat_V = (train_sub.country_id !=
                     train_sub.location_id).values.astype(np.int8)
    not_representitive_V = (0**train_sub.national.values).astype(np.int8)
    SN = EV("sub_nat_S == sub_nat_V").astype(np.int8).T
    NR = EV("SN * not_representitive_V").T
    SN = EV("SN * has_sub_nat_V").T
    C = EV("country_S == country_V").astype(np.int8)
    C = EV("C - SN")
    SN = EV("SN * 0**NR")
    C = EV("C * 0**NR")
    R = EV("region_S == region_V").astype(np.int8)
    R = EV("R - C - SN - NR")
    SR = EV("1 - R - C - SN - NR")
    return NR, SN, C, R, SR
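The 0**x expressions act as a vectorized logical NOT on 0/1 integer arrays (0**0 == 1, 0**1 == 0); here that zeroes SN and C wherever the non-representative indicator NR is already set, so the five matrices stay mutually exclusive. A minimal sketch of the trick:

import numpy as np

flags = np.array([0, 1, 1, 0], dtype=np.int8)
print(0 ** flags)   # -> [1 0 0 1], the element-wise NOT of a 0/1 array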
Example #4
def weight_matrix(valid_positions, xi_vector, weight_matrix):
    """
    (matrix, vector, matrix)

    Given a matrix of valid positions for an analytic region (valid_positions),
    a vector of appropriate xi weights to use for each column in that vector
    (xi_vector), and an age year weighted matrix generated by timeW will return
    a matrix re-weighted so that each column adds up to the corresponding xi
    value in the xi_vector.
    """
    weights = EV("valid_positions * weight_matrix")
    sum_of_weights = weights.sum(0)
    sum_of_weights[sum_of_weights == .0] = 1.
    return EV("(weights / sum_of_weights) * xi_vector")
Example #5
def calculate_xi_matrix(full, train, zeta_space_smooth,
                        zeta_space_smooth_nodata):
    '''
    (data frame, data frame, float, float) -> array

    Given two data frames ("full", "train"), where train is the subset of full
    used to train a model, and two float values used to choose the xi weights
    ("zeta_space_smooth", "zeta_space_smooth_nodata"), returns a matrix of xi
    values with one row per observation in the full data and 5 columns.
    Columns 2 through 5 hold the weight to give data sharing the same
    subnational location, country, region, or super region as the observation.
    The first column is the weight of data at the observation's most specific
    level that is not nationally representative. Each row should sum to 1.
    '''
    depths = location_depth(full, train)

    def f(x):
        return calc_xi_vec(x, zeta_space_smooth, zeta_space_smooth_nodata)

    base = np.array(list(map(f, depths)))  # list() so the map also works on Python 3
    train_copy = train.set_index("location_id")
    non_rep_loc = train[train.national != 1].location_id.unique()
    non_rep = full.location_id.map(lambda x: x in non_rep_loc)
    non_rep_vec = EV(
        "zeta_space_smooth * (non_rep - non_rep * zeta_space_smooth)")
    # keep track of the places that only have non-representative data so we can give them the full location weight
    only_non_rep_loc = np.setdiff1d(
        non_rep_loc, train[train.national == 1].location_id.unique())
    only_non_rep = full.location_id.map(lambda x: x in only_non_rep_loc)
    modify_SN = non_rep_vec * (base[:, 0] != 0).astype(int)
    modify_C = non_rep_vec * (base[:, 0] == 0).astype(int)
    base[:, 0] = base[:, 0] - modify_SN
    base[:, 1] = base[:, 1] - modify_C
    base = np.append(non_rep_vec.reshape(len(base), 1), base, 1)
    base[only_non_rep.values & (depths == 3), 0] = \
        base[only_non_rep.values & (depths == 3), :][:, [0, 2]].sum(axis=1)
    base[only_non_rep.values & (depths == 3), 2] = 0
    base[only_non_rep.values & (depths == 4), 0] = \
        base[only_non_rep.values & (depths == 4), :][:, [0, 1]].sum(axis=1)
    base[only_non_rep.values & (depths == 4), 1] = 0
    return base
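A toy check of the non-representative shift with hypothetical xi rows (calc_xi_vec is not shown here): the amount moved into the new first column equals what is subtracted from the subnational or country column, so each row still sums to 1.

import numpy as np

zeta = 0.9
base = np.array([[0.6, 0.2, 0.15, 0.05],    # observation with subnational data (col 0 != 0)
                 [0.0, 0.7, 0.20, 0.10]])   # observation without subnational data
non_rep = np.array([1, 1])                  # both flagged non-representative
non_rep_vec = zeta * (non_rep - non_rep * zeta)   # zeta * (1 - zeta) where flagged

modify_SN = non_rep_vec * (base[:, 0] != 0)
modify_C = non_rep_vec * (base[:, 0] == 0)
base[:, 0] -= modify_SN
base[:, 1] -= modify_C
base = np.append(non_rep_vec.reshape(-1, 1), base, 1)
print(base.sum(axis=1))                     # -> [1. 1.]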