def test_input():

    # frequency signature notation
    df = read_csv("tests/ucs-gold-100.ds", comment='#', index_col=0,
                  sep="\t", quoting=3, keep_default_na=False)
    df.rename({'l2': 'item'}, axis=1, inplace=True)
    df = df[['item', 'f', 'f1', 'f2', 'N']]
    df.index.name = 'id'
    print()
    print(df.head())

    # keywords
    tmp = df[['item', 'f', 'f1']].rename({'f': 'f1', 'f1': 'N1'}, axis=1)
    tmp['f2'] = df['f2'] - df['f']
    tmp['N2'] = df['N'] - df['f1']
    print(tmp.head())

    # contingency notation
    obs = fq.observed_frequencies(df)
    print()
    print(obs[['O11', 'O12', 'O21', 'O22']].head())

    # expected frequencies
    exp = fq.expected_frequencies(df)
    print()
    print(exp[['E11', 'E12', 'E21', 'E22']].head())

    print()
    print(df.head())
    obs = fq.observed_frequencies(df)
    print()
    print(obs[['O11', 'O12', 'O21', 'O22']].head())
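# Illustrative sketch (not part of the test above): the conversion from
# frequency signature notation (f, f1, f2, N) to contingency notation that
# the tests check fq.observed_frequencies / fq.expected_frequencies against,
# written out cell by cell. The helper name is hypothetical.
def signature_to_contingency_sketch(df):
    """Derive the four observed cells and E11 from f, f1, f2 and N."""
    out = df.copy()
    out['O11'] = out['f']                                      # joint frequency
    out['O12'] = out['f1'] - out['f']                          # node without item
    out['O21'] = out['f2'] - out['f']                          # item without node
    out['O22'] = out['N'] - out['f1'] - out['f2'] + out['f']   # neither
    out['E11'] = out['f1'] * out['f2'] / out['N']              # expected joint frequency
    return out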
def test_binomial_likelihood_brown_overflow(brown_dataframe):
    df = brown_dataframe
    df = df.join(fq.observed_frequencies(df), rsuffix='_')
    df = df.join(fq.expected_frequencies(df), rsuffix='_')
    df = df.head(1000)
    df['binomial_likelihood'] = am.binomial_likelihood(df)
    assert df['binomial_likelihood'].isnull().any()
def test_binomial_likelihood_brown(brown_dataframe):
    df = brown_dataframe
    df = df.join(fq.observed_frequencies(df), rsuffix='_')
    df = df.join(fq.expected_frequencies(df), rsuffix='_')
    df = df.head(100)
    df['binomial_likelihood'] = am.binomial_likelihood(df)
    assert df['binomial_likelihood'][0] == 0.00810143610212444
def fixed_dataframe():
    """Sample DataFrame with fixed data"""
    df = pd.DataFrame({
        'f': list(reversed(range(1, 11))),
        'f1': [10] * 10,
        'f2': list(range(10, 30, 2)),
        'N': [100] * 10
    })
    df = df.join(fq.observed_frequencies(df))
    df = df.join(fq.expected_frequencies(df))
    return df
def random_dataframe():
    """Sample DataFrame with random data (rows are invalid if f > f1)"""
    df = pd.DataFrame({
        'f': np.random.randint(10, size=10),
        'f1': np.random.randint(10, size=10),
        'f2': np.random.randint(10, size=10),
        'N': [10] * 10
    })
    df = df.join(fq.observed_frequencies(df))
    df = df.join(fq.expected_frequencies(df))
    return df
def test_ucs(ucs_dataframe):

    df = ucs_dataframe
    # UCS data has the following relevant columns:
    # f  = O11
    # f1 = R1
    # f2 = C1
    # N

    # get observed frequencies
    df['O11'] = df['f']
    df['O21'] = df['f2'] - df['O11']
    df['O12'] = df['f1'] - df['O11']
    df['O22'] = df['N'] - df['f1'] - df['O21']

    # check observed frequencies
    obs = fq.observed_frequencies(df)
    assert obs['O11'].equals(df['O11'])
    assert obs['O12'].equals(df['O12'])
    assert obs['O21'].equals(df['O21'])
    assert obs['O22'].equals(df['O22'])

    # check marginals
    R1 = df['O11'] + df['O12']
    R2 = df['O21'] + df['O22']
    C1 = df['O11'] + df['O21']
    C2 = df['O12'] + df['O22']
    assert (R1 + R2).equals(df['N'])
    assert (C1 + C2).equals(df['N'])

    # get expected frequencies
    df['E11'] = R1 * C1 / df['N']
    df['E12'] = R1 * C2 / df['N']
    df['E21'] = R2 * C1 / df['N']
    df['E22'] = R2 * C2 / df['N']

    # check expected frequencies
    exp = fq.expected_frequencies(df)
    assert exp['E11'].equals(df['E11'])
    assert exp['E12'].equals(df['E12'])
    assert exp['E21'].equals(df['E21'])
    assert exp['E22'].equals(df['E22'])
def add_ams(f, f1, f2, N,
            min_freq=2, order='f', cut_off=100, flags=None,
            ams=None, frequencies=True):
    """Create a table of co-occurrence counts and association measures.

    For frequency signature notation, see Evert (2004: 36).

    :param DataFrame f: co-occurrence freq. of token and node
    :param int f1: number of tokens in W(node)
    :param DataFrame f2: marginal freq. of tokens
    :param int N: size of corpus
    :param int min_freq: minimum number of co-occurrences for item to be included
    :param str order: 'f' / 'f2' / assoc-measure
    :param int cut_off: number of collocates to retrieve
    :param str flags: '%c' / '%d' / '%cd'
    :param list ams: assoc-measures to calculate (None = all)
    :param bool frequencies: add raw frequencies to result?

    :return: table of counts and measures, indexed by item
    :rtype: DataFrame
    """

    logger.info('creating table of association measures')

    # drop items that occur less than min_freq times
    f = f.loc[~(f['f'] < min_freq)]

    # init contingency table with f and f2
    contingencies = f.join(f2)

    # post-processing: fold items
    contingencies = fold_df(contingencies, flags)

    # add constant columns
    contingencies['N'] = N
    contingencies['f1'] = f1

    # add measures
    measures = calculate_measures(contingencies, ams)
    contingencies = contingencies.join(measures)

    # create output
    if frequencies:
        contingencies = contingencies.join(
            fq.observed_frequencies(contingencies)
        )
        contingencies = contingencies.join(
            fq.expected_frequencies(contingencies)
        )

    # sort dataframe
    contingencies = contingencies.sort_values(
        by=[order, 'f'], ascending=False
    )

    # apply cut-off
    if cut_off is not None:
        contingencies = contingencies.head(cut_off)

    return contingencies
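# Hedged usage sketch for add_ams (all names and values below are made up
# for illustration; f and f2 are DataFrames indexed by item, as described
# in the docstring above):
#
#   f = pd.DataFrame({'f': [10, 5]}, index=['collocate_a', 'collocate_b'])
#   f2 = pd.DataFrame({'f2': [100, 50]}, index=['collocate_a', 'collocate_b'])
#   collocates = add_ams(f, f1=500, f2=f2, N=100000)
#
# The result keeps one row per item with at least min_freq co-occurrences,
# sorted by the 'order' column ('f' by default) and truncated after
# 'cut_off' rows.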
def calculate_keywords(df1, df2, C1, C2, lonely=True, how='first'):

    if how == 'second':
        # swap the two frequency lists (and their corpus sizes)
        return calculate_keywords(df2, df1, C2, C1, lonely=lonely, how='first')

    # items that appear in only one of the two lists
    if lonely:
        items1 = set(df1.index)
        items2 = set(df2.index)
        only1 = df1.loc[items1 - items2]
        only1['ppm'] = round(only1['O11'] / C1 * 1000000, 2)
        # only2 = df2.loc[items2 - items1]
        # only2['ppm'] = round(only2['O12'] / C2 * 1000000, 2)
        # df_only = concat([only1, only2])
        # df_only.fillna(0, inplace=True)
        # df_only['diff'] = df_only['O11'] - df_only['O12']
        # df_only.sort_values(by=['diff', 'item'], ascending=False, inplace=True)
        df_only = only1
        df_only['freq_1'] = to_numeric(df_only['O11'], downcast='integer')
        # df_only['freq_2'] = to_numeric(df_only['O12'], downcast='integer')
        # df_only = df_only[['freq_1', 'ppm', 'freq_2', 'item']]

        # reset index and sort by frequency
        df_only.index.name = 'item'
        df_only = df_only.reset_index()
        df_only = df_only.sort_values(by=['freq_1', 'item'], ascending=False)

        # use index for ranking
        df_only = df_only.reset_index()
        df_only.index.name = 'rank'
        df_only.index = df_only.index + 1

        # reduce to relevant columns
        df_only = df_only[['item', 'freq_1', 'ppm']]

    else:
        df_only = None

    # join dataframes
    df = df1.join(df2, how='inner')
    df.fillna(0, inplace=True)
    df["O21"] = C1 - df["O11"]
    df["O22"] = C2 - df["O12"]

    # some more names
    df["f1"] = df["O11"] + df["O12"]  # overall counts of word
    df["f2"] = df["O11"] + df["O21"]  # size of corpus 1
    df["N"] = C1 + C2
    df = df.join(frequencies.expected_frequencies(df))

    # ppm and comparison
    df['ppm_1'] = round(df['O11'] / C1 * 1000000, 2)
    df['ppm_1_e'] = round(df['E11'] / C1 * 1000000, 2)
    df['ppm_2'] = round(df['O12'] / C2 * 1000000, 2)
    df['ppm_2_e'] = round(df['E12'] / C2 * 1000000, 2)

    # calculate association measures
    df = df.join(measures.calculate_measures(df))
    df['log_likelihood'] = round(df['log_likelihood'], 2)
    df['log_ratio'] = df.apply(logratio, axis=1)
    df.index.name = 'item'

    return df, df_only
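# Hedged usage sketch for calculate_keywords (input names and values are
# made up for illustration): df1 and df2 are frequency lists indexed by
# item, with counts of corpus 1 in column 'O11' and counts of corpus 2 in
# column 'O12'; C1 and C2 are the respective corpus sizes.
#
#   df1 = pd.DataFrame({'O11': [100, 20]}, index=['word_a', 'word_b'])
#   df2 = pd.DataFrame({'O12': [10, 200]}, index=['word_a', 'word_c'])
#   keywords, only_in_one = calculate_keywords(df1, df2, C1=10000, C2=20000)
#
# 'keywords' holds association scores for items attested in both lists;
# 'only_in_one' ranks items that appear in the first list only (lonely=True).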
def test_expected_frequencies(fixed_dataframe):
    df = fq.expected_frequencies(fixed_dataframe)
    assert df['E11'][0] == 1.0
def test_binomial_likelihood_zero(zero_dataframe):
    df = zero_dataframe
    df = df.join(fq.observed_frequencies(df), rsuffix='_')
    df = df.join(fq.expected_frequencies(df), rsuffix='_')
    ams = am.binomial_likelihood(df)
    assert isnan(ams[0])