コード例 #1
0
ファイル: similarity.py プロジェクト: juanshishido/okcupid
def _levels_dict(demographics):
    """Map positional indices to demographic levels.

    Parameters
    ----------
    demographics : pd.Series
        Demographic labels

    Returns
    -------
    levels_dict : dict
        Keys are consecutive integers starting at 0; values are the
        items produced by `_levels(demographics)`, in iteration order.
    """
    return dict(enumerate(_levels(demographics)))
コード例 #2
0
def _arrs_pos(df_orig,
              df_pos,
              demographic,
              pos,
              d_levels=None,
              print_levels=False):
    """Split a part-of-speech column into one array per demographic level.

    Parameters
    ----------
    df_orig : pd.DataFrame
        The DataFrame from which `df_pos` was created. Must have the
        same number of rows as `df_pos`, because the demographic labels
        are attached by position rather than by index.
    df_pos : pd.DataFrame
        The part-of-speech DataFrame. It is copied, never modified.
    demographic : str
        A valid demographic-data column in `df_orig`
    pos : str
        A column in `df_pos` corresponding
        to a part of speech
    d_levels : list, default None
        The specific demographic levels desired (forwarded to `_levels`)
    print_levels : bool, default False
        Whether to print the demographic levels (forwarded to `_levels`)

    Returns
    -------
    arrs : tuple of np.arrays
        The corresponding `pos` values for each `demographic` level,
        in the order the levels are returned by `_levels`.

    Notes
    -----
    A warning is printed for every level that holds fewer than 10% of
    the rows in `df_pos`. Levels may be of any type (e.g. integers);
    they are formatted rather than concatenated, so non-string labels
    no longer raise ``TypeError``.
    """
    df_pos = df_pos.copy()  # so we don't modify the caller's frame
    # `.values` attaches the labels positionally, ignoring index alignment.
    df_pos[demographic] = df_orig[demographic].values
    levels = _levels(df_orig[demographic], d_levels, print_levels)
    # The 10% cut-off does not depend on the level, so compute it once.
    threshold = 0.1 * df_pos.shape[0]
    arrs = []
    for d in levels:
        arr = df_pos.loc[df_pos[demographic] == d, pos].values
        n = arr.shape[0]
        if n < threshold:
            print("Warning: '{}' category has less than 10% of "
                  "observations ({})".format(d, n))
        arrs.append(arr)
    return tuple(arrs)
コード例 #3
0
def _arrs_pos(df_orig, df_pos, demographic, pos,
              d_levels=None, print_levels=False):
    """Collect the values of one part-of-speech column per demographic level.

    Parameters
    ----------
    df_orig : pd.DataFrame
        The DataFrame from which `df_pos` was created. Its `demographic`
        column is attached to `df_pos` by position, so both frames must
        have the same number of rows.
    df_pos : pd.DataFrame
        The part-of-speech DataFrame (left unmodified; a copy is used)
    demographic : str
        A valid demographic-data column in `df_orig`
    pos : str
        A column in `df_pos` corresponding
        to a part of speech
    d_levels : list, default None
        The specific demographic levels desired; passed on to `_levels`
    print_levels : bool, default False
        Whether to print the demographic levels; passed on to `_levels`

    Returns
    -------
    arrs : tuple of np.arrays
        One array of `pos` values per level, ordered as `_levels`
        returns them.

    Notes
    -----
    Prints a warning for each level containing fewer than 10% of the
    rows of `df_pos`. The message is built with ``str.format`` so that
    non-string level labels (e.g. numeric codes) do not raise
    ``TypeError``.
    """
    df_pos = df_pos.copy()  # so we don't modify the caller's frame
    # Assign the raw values so the labels line up by row position.
    df_pos[demographic] = df_orig[demographic].values
    levels = _levels(df_orig[demographic], d_levels, print_levels)
    min_count = 0.1 * df_pos.shape[0]  # loop-invariant 10% threshold
    message = "Warning: '{}' category has less than 10% of observations ({})"
    arrs = []
    for d in levels:
        in_level = df_pos[demographic] == d
        arr = df_pos[in_level][pos].values
        n = arr.shape[0]
        if n < min_count:
            print(message.format(d, n))
        arrs.append(arr)
    return tuple(arrs)