Example #1
def getNgramsList(exp=''):
    """
    Load tokens (and bigrams) for the POTUS dataset. The returned list preserves the same speech order as the speeches dataframe in the experiment folder.

    Args:
        exp (str, optional): name of experiment. Defaults to ''.

    Returns:
        list: list of tokens for all speeches
    """
    import helpers.io as pickle_io
    root_folder = rootFolder(exp)
    fn = root_folder + '/bigrams.pkl'
    print(f'loading bigrams from {fn}')
    obj = pickle_io.from_pickle(fn)
    print('loaded bigrams.')
    return obj
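
# Example call (the experiment name is illustrative):
# tokens = getNgramsList('potus_exp01')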
expfolder = args.experiment
num_topics = args.num_topics
iterations = args.iterations
passes = args.passes

print('experiment folder name: ', expfolder)
print('number of topics: ', num_topics)
print('iterations: ', iterations)
print('passes: ', passes)

# %%
root_folder = potus.rootFolder(expfolder)
print('root_folder: ', root_folder)

bigrams = pickle_io.from_pickle(root_folder + '/bigrams.pkl')

# # %%
# this logic has been moved to the beginning of the pipeline, before the speeches df is saved to file
# min_speech_len = 10
# bigrams = [b for b in bigrams if len(b) >= min_speech_len]

# %%
speeches_dictionary = process.make_dictionary(bigrams, large_list=True)
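
# `process.make_dictionary` is a project helper; a plausible minimal
# equivalent using gensim's Dictionary (an assumption, not the actual code):
# from gensim.corpora import Dictionary
# speeches_dictionary = Dictionary(bigrams)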

# %%
# AJ
# Filter out words that occur in fewer than <no_below> documents, or in more than <no_above> x 100% of the documents.
speeches_dictionary.filter_extremes(no_below=10, no_above=0.5)
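
# quick sanity check on vocabulary size after filtering (illustrative):
print(f'dictionary size after filter_extremes: {len(speeches_dictionary)}')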

# %%
root_folder = potus.rootFolder(expfolder)
print('root_folder: ', root_folder)

# %%
df_fn = root_folder + '/speeches_df.pkl'
df_probs_fn = root_folder + '/speeches_df_topics_probs.pkl'
topic_probs_fn = root_folder + '/topic_probs.pkl'

# %%
print(f'load df from {df_fn}')
speeches_df = pd.read_pickle(df_fn)
# %%
print(f'load topic probabilities from {topic_probs_fn}')
probs = pickle_io.from_pickle(topic_probs_fn)
# %%
print(f'loaded df, nrows: {len(speeches_df)}')
print(f'loaded probs, nrows: {len(probs)}')
# %%
print('add probs to df')
speeches_df['probs'] = probs
#pprint(speeches_df.head())
# %%
print('check that topic probabilities add up to 1 for each doc')
speeches_df.apply(lambda x: sum(x['probs']), axis='columns')  # shown as cell output when run interactively
# %%
speeches_df.sort_values(by=['date'],
                        ascending=True,
                        inplace=True,
                        ignore_index=True)

print('experiment folder name: ', expfolder)

# %%
root_folder = hansard.rootFolder(expfolder)
print('root_folder: ', root_folder)

# %%
lda_model_fn = root_folder + '/lda.model'
print(f'load lda model from {lda_model_fn}')
speeches_lda = ldamodel.LdaModel.load(lda_model_fn)

bow_fn = root_folder + '/bow.pkl'
print(f'load bow from {bow_fn}')
bow = pickle_io.from_pickle(bow_fn)

gc.collect()
# %% [markdown]
# ## for each document, calculate topics and store topic probabilities
#
# this is then used to calculate KLD between documents by using
#  their topic probabilities as inputs
#

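# %%
# A minimal sketch of KLD between two topic-probability vectors, as used for
# the document comparison described above; scipy.stats.entropy(p, q) computes
# sum(p * log(p / q)). The vectors below are illustrative only.
from scipy.stats import entropy
p = [0.7, 0.2, 0.1]  # topic probabilities for document A
q = [0.5, 0.3, 0.2]  # topic probabilities for document B
print(f'KLD(p || q) = {entropy(p, q):.4f}')
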
# %%
probs = list()

# topicids = list()  # we don't need a list of topic ids; they are always 0 to 99

for i, b in tqdm.tqdm(enumerate(bow), desc='calculating topic probabilities'):
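    # NOTE: the loop body is truncated in this excerpt. A plausible completion
    # using gensim's get_document_topics (an assumption, not the original code):
    doc_topics = speeches_lda.get_document_topics(b, minimum_probability=0.0)
    probs.append([prob for _, prob in doc_topics])
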
Example #5
def gridExpResult(exp, scales=[25, 125], trim=0):
    """
    Load and return experiment results, including the source dataframe, experiment settings, trained LDA model, and associated BoW. This function does not perform any calculations; it only returns results produced by running an experiment pipeline.

    Args:
        exp (str): name of experiment.
        scales (list, optional): KLD window sizes to be included in the results set. Defaults to [25, 125].
        trim (int, optional): number of speeches to trim from each end. Defaults to 0.

    Returns:
        objectview: object with attributes df, settings, model, bow
    """
    import pandas as pd
    import glob
    import os
    import re
    from gensim.models import ldamodel
    import helpers.io as pickle_io

    exp_folder = rootFolder(exp)
    exp_kld_files = exp_folder + '/' + 'speeches_dfr_*.pkl'

    # load lda model and BoW
    lda_model_fn = exp_folder + '/lda.model'
    print(f'load lda model from {lda_model_fn}')
    speeches_lda = ldamodel.LdaModel.load(lda_model_fn)

    bow_fn = exp_folder + '/bow.pkl'
    print(f'load bow from {bow_fn}')
    bow = pickle_io.from_pickle(bow_fn)

    # load kld results df
    files = glob.glob(exp_kld_files)

    kld_settings = list()
    df = pd.DataFrame()
    for fn in files:
        print(fn)
        m = re.findall(r'(\d+)', os.path.basename(fn))
        topics, iterations, passes, Nw, Tw = m
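        # e.g. a file named 'speeches_dfr_100_400_20_25_125.pkl' (hypothetical)
        # would yield topics=100, iterations=400, passes=20, Nw=25, Tw=125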

        if int(Nw) in scales:
            print(f't: {topics}, Nw: {Nw}, Tw:{Tw}')
            kld_settings.append({
                'kld_filename': os.path.basename(fn),
                'topics': int(topics),
                'Nw': int(Nw),
                'Tw': int(Tw),
                'path': fn
            })

            dft = pd.read_pickle(fn)
            dft['kld_filename'] = os.path.basename(fn)
            print(len(dft), ' before trim')

            dft['speech_id'] = dft.apply(lambda r: f'{r.speaker}_{r.name:03d}',
                                         axis=1)

            # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
            if trim > 0:
                df = pd.concat([df, dft[trim:-trim]])
            else:
                df = pd.concat([df, dft])

    class objectview(object):
        def __init__(self, d):
            self.__dict__ = d

    df.sort_values(by='date', ascending=True, inplace=True)
    df = pd.merge(df,
                  pd.DataFrame(kld_settings),
                  how='left',
                  on='kld_filename',
                  indicator=True)

    # combine all results in return object
    ret = objectview({
        'df': df,
        'settings': pd.DataFrame(kld_settings),
        'model': speeches_lda,
        'bow': bow
    })

    return ret
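
# Example usage (hypothetical experiment name):
# res = gridExpResult('hansard_exp01', scales=[25], trim=5)
# res.df.head()
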
# NOTE: the head of this list is truncated in the excerpt; the columns used
# below (chamber, parliament, date_time, session) are restored here as an
# assumed reconstruction
column_names = [
    'chamber',
    'parliament',
    'date_time',
    'session',
    'name',
    'name_id',
    'party',
    'in_gov',
    'electorate',
    'first_speech',
    'context',
    'context_title',
    'context_type',
    'speech_type',  # AJ
    'text'
]
column_names_no_text = column_names[:-1]

# %%
speeches_list = pickle_io.from_pickle(fn)  # fn: path to the speeches pickle, set earlier in the pipeline (not shown in this excerpt)

# %%
speeches_df = pd.DataFrame(speeches_list, columns=column_names)

# exclude speeches by the chair - AJ
speeches_df = speeches_df[~speeches_df['name'].str.contains('SPEAKER')
                          & ~speeches_df['name'].str.contains('PRESIDENT')]

size_before = speeches_df.memory_usage(deep=True).sum() / 1e+9

speeches_df.chamber = pd.Categorical(speeches_df.chamber)
speeches_df.parliament = pd.Categorical(speeches_df.parliament.astype(int),
                                        ordered=True)
speeches_df.date_time = pd.to_datetime(speeches_df.date_time, unit='D')
speeches_df.session = pd.Categorical(speeches_df.session.astype(int),
                                     ordered=True)  # assumed completion; the excerpt is cut mid-statement
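
# the excerpt ends here; size_before (computed above) presumably feeds a memory
# comparison along these lines (an assumption, not the original code):
# size_after = speeches_df.memory_usage(deep=True).sum() / 1e+9
# print(f'memory usage: {size_before:.2f} GB -> {size_after:.2f} GB')
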
Example #7
expfolder = args.experiment
num_topics = args.num_topics
iterations = args.iterations
passes = args.passes

print('experiment folder name: ', expfolder)
print('number of topics: ', num_topics)
print('iterations: ', iterations)
print('passes: ', passes)

# %%
root_folder = hansard.rootFolder(expfolder)
print('root_folder: ', root_folder)

bigrams = pickle_io.from_pickle(root_folder + '/bigrams.pkl')

# # %%
# this logic has been moved to the beginning of the pipeline, before the speeches df is saved to file
# min_speech_len = 10
# bigrams = [b for b in bigrams if len(b) >= min_speech_len]

# %%
speeches_dictionary = process.make_dictionary(bigrams, large_list=True)

# %%
# AJ
# Filter out words that occur in fewer than <no_below> documents, or in more than <no_above> x 100% of the documents.
speeches_dictionary.filter_extremes(no_below=200, no_above=0.5)

# %%