Code example #1
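(All snippets below are excerpts of tests, fixtures and library code, and omit their imports. Judging from the call sites, running them standalone needs roughly the following; the association_measures module paths are an assumption inferred from the aliases fq / am, not something the excerpts themselves confirm.)

import numpy as np
import pandas as pd
import pytest
from numpy import isnan
from pandas import read_csv, to_numeric

# assumed import paths for the pandas-association-measures helpers
import association_measures.frequencies as fq
import association_measures.measures as am
from association_measures import frequencies, measures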
def test_input():

    # frequency signature notation
    df = read_csv("tests/ucs-gold-100.ds", comment='#', index_col=0,
                  sep="\t", quoting=3, keep_default_na=False)
    df.rename({'l2': 'item'}, axis=1, inplace=True)
    df = df[['item', 'f', 'f1', 'f2', 'N']]
    df.index.name = 'id'
    print()
    print(df.head())

    # keywords
    tmp = df[['item', 'f', 'f1']].rename({'f': 'f1', 'f1': 'N1'}, axis=1)
    tmp['f2'] = df['f2'] - df['f']
    tmp['N2'] = df['N'] - df['f1']
    print(tmp.head())

    # contingency notation
    obs = fq.observed_frequencies(df)
    print()
    print(obs[['O11', 'O12', 'O21', 'O22']].head())

    # expected frequencies
    exp = fq.expected_frequencies(df)
    print()
    print(exp[['E11', 'E12', 'E21', 'E22']].head())

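The two notations this test prints can be mapped onto each other directly. A minimal sketch for a single table, using the formulas that code example #6 below derives and checks:

def contingency(f, f1, f2, N):
    # frequency signature (f, f1, f2, N) -> contingency notation
    O11 = f                # item and node together
    O12 = f1 - f           # node without item
    O21 = f2 - f           # item without node
    O22 = N - f1 - f2 + f  # neither
    R1, C1 = O11 + O12, O11 + O21  # marginals (R1 == f1, C1 == f2)
    E11 = R1 * C1 / N      # expected frequency under independence
    return O11, O12, O21, O22, E11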
Code example #2
def test_binomial_likelihood_brown_overflow(brown_dataframe):
    df = brown_dataframe
    df = df.join(fq.observed_frequencies(df), rsuffix='_')
    df = df.join(fq.expected_frequencies(df), rsuffix='_')
    df = df.head(1000)
    # on 1000 rows of the Brown data, some tables are large enough for the
    # computation to overflow, so at least one result is NaN
    df['binomial_likelihood'] = am.binomial_likelihood(df)
    assert df['binomial_likelihood'].isnull().any()
Code example #3
def test_binomial_likelihood_brown(brown_dataframe):
    df = brown_dataframe
    df = df.join(fq.observed_frequencies(df), rsuffix='_')
    df = df.join(fq.expected_frequencies(df), rsuffix='_')
    df = df.head(100)
    df['binomial_likelihood'] = am.binomial_likelihood(df)
    assert df['binomial_likelihood'][0] == 0.00810143610212444
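The binomial likelihood tested here is, following the Evert (2004) notation referenced in code example #7, presumably the probability of seeing exactly O11 successes in N trials with success probability E11 / N. A sketch under that assumption, using scipy rather than the library's own implementation:

from scipy.stats import binom

def binomial_likelihood_sketch(O11, E11, N):
    # P(X = O11) for X ~ Binomial(N, E11 / N); a naive evaluation of the
    # binomial coefficient in this term overflows for large N, which is
    # presumably what the overflow test above exercises
    return binom.pmf(O11, N, E11 / N)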
Code example #4
@pytest.fixture
def fixed_dataframe():
    """Sample DataFrame with fixed data"""
    df = pd.DataFrame({
        'f': list(reversed(range(1, 11))),
        'f1': [10] * 10,
        'f2': list(range(10, 30, 2)),
        'N': [100] * 10
    })
    df = df.join(fq.observed_frequencies(df))
    df = df.join(fq.expected_frequencies(df))
    return df
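For the first row of this fixture, f = 10, f1 = 10, f2 = 10 and N = 100, so E11 = f1 * f2 / N = 10 * 10 / 100 = 1.0, which is exactly the value asserted in code example #9 below.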
Code example #5
@pytest.fixture
def random_dataframe():
    """Sample DataFrame with random data

    invalid if f > f1 (O12 = f1 - f would then be negative)
    """
    df = pd.DataFrame({
        'f': np.random.randint(10, size=10),
        'f1': np.random.randint(10, size=10),
        'f2': np.random.randint(10, size=10),
        'N': [10] * 10
    })

    df = df.join(fq.observed_frequencies(df))
    df = df.join(fq.expected_frequencies(df))
    return df
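As the docstring notes, a row is invalid when f > f1, since O12 = f1 - f turns negative; by the same logic, f > f2 makes O21 negative. A one-line filter (a sketch, not part of the original fixture) would keep only consistent rows:

df = df[(df['f'] <= df['f1']) & (df['f'] <= df['f2'])]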
Code example #6
def test_ucs(ucs_dataframe):

    df = ucs_dataframe

    # ucs data has the following relevant columns
    # f = O11
    # f1 = R1
    # f2 = C1
    # N

    # get observed frequencies
    df['O11'] = df['f']
    df['O21'] = df['f2'] - df['O11']
    df['O12'] = df['f1'] - df['O11']
    df['O22'] = df['N'] - df['f1'] - df['O21']

    # check observed frequencies
    obs = fq.observed_frequencies(df)
    assert obs['O11'].equals(df['O11'])
    assert obs['O12'].equals(df['O12'])
    assert obs['O21'].equals(df['O21'])
    assert obs['O22'].equals(df['O22'])

    # check marginals
    R1 = df['O11'] + df['O12']
    R2 = df['O21'] + df['O22']
    C1 = df['O11'] + df['O21']
    C2 = df['O12'] + df['O22']
    assert (R1 + R2).equals(df['N'])
    assert (C1 + C2).equals(df['N'])

    # get expected frequencies
    df['E11'] = R1 * C1 / df['N']
    df['E12'] = R1 * C2 / df['N']
    df['E21'] = R2 * C1 / df['N']
    df['E22'] = R2 * C2 / df['N']

    # check expected frequencies
    exp = fq.expected_frequencies(df)
    assert exp['E11'].equals(df['E11'])
    assert exp['E12'].equals(df['E12'])
    assert exp['E21'].equals(df['E21'])
    assert exp['E22'].equals(df['E22'])
Code example #7
File: collocates.py | Project: dokempf/cwb-ccc
def add_ams(f, f1, f2, N,
            min_freq=2,
            order='f',
            cut_off=100,
            flags=None,
            ams=None,
            frequencies=True):
    """ create a table of co-occurrence counts and association measures.
    for frequency signature notation see Evert (2004: 36)

    :param DataFrame f: co-occurrence freq. of token and node
    :param int f1: number of tokens in W(node)
    :param DataFrame f2: marginal freq. of tokens
    :param int N: size of corpus

    :param int min_freq: minimum number of co-occurrences for an item to be included
    :param str order: 'f' / 'f2' / assoc-measure
    :param int cut_off: number of collocates to retrieve
    :param str flags: '%c' / '%d' / '%cd'
    :param list ams: assoc-measures to calculate (None=all)
    :param bool frequencies: add raw frequencies to result?

    :return: table of counts and measures, indexed by item
    :rtype: DataFrame

    """

    logger.info('creating table of association measures')

    # drop items that occur fewer than min_freq times
    f = f.loc[~(f['f'] < min_freq)]

    # init contingency table with f and f2
    contingencies = f.join(f2)
    # post-processing: fold items
    contingencies = fold_df(contingencies, flags)
    # add constant columns
    contingencies['N'] = N
    contingencies['f1'] = f1

    # add measures
    measures = calculate_measures(contingencies, ams)
    contingencies = contingencies.join(measures)

    # create output
    if frequencies:
        contingencies = contingencies.join(
            fq.observed_frequencies(contingencies)
        )
        contingencies = contingencies.join(
            fq.expected_frequencies(contingencies)
        )

    # sort dataframe
    contingencies = contingencies.sort_values(
        by=[order, 'f'], ascending=False
    )

    # apply cut-off
    if cut_off is not None:
        contingencies = contingencies.head(cut_off)

    return contingencies
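A hypothetical call might look as follows; all names and counts are illustrative, assuming f is a DataFrame with a column 'f' of co-occurrence counts indexed by item, and f2 holds the items' marginal frequencies:

# hypothetical usage sketch
collocates = add_ams(f, f1=2500, f2=f2, N=1000000,
                     min_freq=5, order='log_likelihood', cut_off=50)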
Code example #8
def calculate_keywords(df1, df2, C1, C2, lonely=True, how='first'):

    if how == 'second':
        # swap the two corpora (and their sizes) and proceed as for 'first'
        return calculate_keywords(df2, df1, C2, C1, lonely=lonely, how='first')

    # only in one list
    if lonely:
        items1 = set(df1.index)
        items2 = set(df2.index)

        only1 = df1.loc[items1 - items2]
        only1['ppm'] = round(only1['O11'] / C1 * 1000000, 2)

        # only2 = df2.loc[items2 - items1]
        # only2['ppm'] = round(only2['O12'] / C2 * 1000000, 2)

        # df_only = concat([only1, only2])
        # df_only.fillna(0, inplace=True)
        # df_only['diff'] = df_only['O11'] - df_only['O12']
        # df_only.sort_values(by=['diff', 'item'], ascending=False, inplace=True)

        df_only = only1
        df_only['freq_1'] = to_numeric(df_only['O11'], downcast='integer')
        # df_only['freq_2'] = to_numeric(df_only['O12'], downcast='integer')
        # df_only = df_only[['freq_1', 'ppm', 'freq_2', 'item']]

        # reset index and sort by frequency
        df_only.index.name = 'item'
        df_only = df_only.reset_index()
        df_only = df_only.sort_values(by=['freq_1', 'item'], ascending=False)

        # use index for ranking
        df_only = df_only.reset_index()
        df_only.index.name = 'rank'
        df_only.index = df_only.index + 1

        # reduce to relevant columns
        df_only = df_only[['item', 'freq_1', 'ppm']]

    else:
        df_only = None

    # join dataframes
    df = df1.join(df2, how='inner')
    df.fillna(0, inplace=True)

    df["O21"] = C1 - df["O11"]
    df["O22"] = C2 - df["O12"]

    # some more names
    df["f1"] = df["O11"] + df["O12"]  # overall counts of word
    df["f2"] = df["O11"] + df["O21"]  # size of corpus 1
    df["N"] = C1 + C2

    df = df.join(frequencies.expected_frequencies(df))

    # ppm and comparison
    df['ppm_1'] = round(df['O11'] / C1 * 1000000, 2)
    df['ppm_1_e'] = round(df['E11'] / C1 * 1000000, 2)
    df['ppm_2'] = round(df['O12'] / C2 * 1000000, 2)
    df['ppm_2_e'] = round(df['E12'] / C2 * 1000000, 2)

    # calculate association measures
    df = df.join(measures.calculate_measures(df))
    df['log_likelihood'] = round(df['log_likelihood'], 2)
    df['log_ratio'] = df.apply(logratio, axis=1)
    df.index.name = 'item'

    return df, df_only
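Again a hypothetical call: df1 and df2 are assumed to be indexed by item, carrying the per-corpus frequencies as 'O11' and 'O12' respectively, with C1 and C2 the two corpus sizes:

# hypothetical usage sketch
keywords, only_in_first = calculate_keywords(df1, df2, C1=120000, C2=95000)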
Code example #9
def test_expected_frequencies(fixed_dataframe):

    df = fq.expected_frequencies(fixed_dataframe)

    assert df['E11'][0] == 1.0
Code example #10
def test_binomial_likelihood_zero(zero_dataframe):
    df = zero_dataframe
    df = df.join(fq.observed_frequencies(df), rsuffix='_')
    df = df.join(fq.expected_frequencies(df), rsuffix='_')
    ams = am.binomial_likelihood(df)
    # for the zero-frequency data, the measure is undefined and comes out NaN
    assert isnan(ams[0])