def getNgramsList(exp=''):
    """
    Load tokens (and bigrams) for the POTUS dataset. The list maintains the
    same order of speeches as the speeches dataframe in the experiment folder.

    Args:
        exp (str, optional): name of experiment. Defaults to ''.

    Returns:
        list: list of tokens for all speeches
    """
    import helpers.io as pickle_io

    root_folder = rootFolder(exp)
    fn = root_folder + '/bigrams.pkl'
    print(f'loading bigrams from {fn}')
    obj = pickle_io.from_pickle(fn)
    print('loaded bigrams.')
    return obj
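
# Usage sketch (an illustration, not part of the module): assumes this module
# is imported as `potus`, matching the pipeline scripts, and that the
# experiment pipeline has already written bigrams.pkl. 'myexp' is a
# placeholder experiment name.
#
#     import potus
#     bigrams = potus.getNgramsList('myexp')
#     # one token list per speech, in the same order as the speeches df
#     print(len(bigrams), 'speeches;', len(bigrams[0]), 'tokens in the first')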
expfolder = args.experiment
num_topics = args.num_topics
iterations = args.iterations
passes = args.passes

print('experiment folder name: ', expfolder)
print('number of topics: ', num_topics)
print('iterations: ', iterations)
print('passes: ', passes)

# %%
root_folder = potus.rootFolder(expfolder)
print('root_folder: ', root_folder)
bigrams = pickle_io.from_pickle(root_folder + '/bigrams.pkl')

# %%
# This logic has moved to the beginning of the pipeline, before the speeches
# df is saved to file:
# min_speech_len = 10
# bigrams = [b for b in bigrams if len(b) >= min_speech_len]

# %%
speeches_dictionary = process.make_dictionary(bigrams, large_list=True)

# %%
# AJ
# Filter out words that occur in fewer than <no_below> documents, or in more
# than <no_above> x 100% of the documents.
speeches_dictionary.filter_extremes(no_below=10, no_above=0.5)

# %%
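# A sketch of the likely next step (an assumption based on the variables in
# scope, not part of the original script): convert each tokenised speech to a
# bag-of-words vector with the filtered dictionary, ready for LDA training.
bow = [speeches_dictionary.doc2bow(tokens) for tokens in bigrams]
print(f'built BoW for {len(bow)} speeches, '
      f'vocabulary size: {len(speeches_dictionary)}')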
# %%
root_folder = potus.rootFolder(expfolder)
print('root_folder: ', root_folder)

# %%
df_fn = root_folder + '/speeches_df.pkl'
df_probs_fn = root_folder + '/speeches_df_topics_probs.pkl'
topic_probs_fn = root_folder + '/topic_probs.pkl'

# %%
print(f'load df from {df_fn}')
speeches_df = pd.read_pickle(df_fn)

# %%
print(f'load topic probabilities from {topic_probs_fn}')
probs = pickle_io.from_pickle(topic_probs_fn)

# %%
print(f'loaded df, nrows: {len(speeches_df)}')
print(f'loaded probs, nrows: {len(probs)}')

# %%
print('add probs to df')
speeches_df['probs'] = probs
# pprint(speeches_df.head())

# %%
print('check that topic probs add up to 1 for each doc')
speeches_df.apply(lambda x: sum(x['probs']), axis='columns')

# %%
speeches_df.sort_values(by=['date'], ascending=True, inplace=True,
                        ignore_index=True)
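
# %%
# A stricter version of the check above (a sketch, not part of the original
# script): assert that every document's topic distribution sums to 1 within
# floating-point tolerance, instead of only printing the sums.
import numpy as np

sums = speeches_df['probs'].apply(sum)
assert np.allclose(sums, 1.0, atol=1e-6), 'topic probs do not sum to 1'
print('all topic distributions sum to 1')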
print('experiment folder name: ', expfolder)

# %%
root_folder = hansard.rootFolder(expfolder)
print('root_folder: ', root_folder)

# %%
lda_model_fn = root_folder + '/lda.model'
print(f'load lda model from {lda_model_fn}')
speeches_lda = ldamodel.LdaModel.load(lda_model_fn)

bow_fn = root_folder + '/bow.pkl'
print(f'load bow from {bow_fn}')
bow = pickle_io.from_pickle(bow_fn)
gc.collect()

# %% [markdown]
# ## for each document, calculate topics and store topic probabilities
#
# this is then used to calculate KLD between documents by using
# their topic probabilities as inputs
#

# %%
probs = list()
# topicids = list()  # we don't need a list of topic ids, they are always 0 to 99
for i, b in tqdm.tqdm(enumerate(bow), desc='calculating topic probabilities'):
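    # The loop body is truncated in this excerpt; a minimal sketch of what it
    # likely computes, based on the markdown cell above (an assumption):
    # minimum_probability=0 makes gensim return the full topic distribution.
    doc_topics = speeches_lda.get_document_topics(b, minimum_probability=0)
    probs.append([p for _, p in doc_topics])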
def gridExpResult(exp, scales=[25, 125], trim=0):
    """
    Load and return experiment results, including the source dataframe,
    experiment settings, trained LDA model and the associated BoW.
    This function does not perform any calculations; it only returns results
    produced by running an experiment pipeline.

    Args:
        exp (str): name of experiment.
        scales (list, optional): KLD window sizes to be included in the
            results set. Defaults to [25, 125].
        trim (int, optional): number of speeches to trim from each end.
            Defaults to 0.

    Returns:
        objectview: object with attributes df, settings, model, bow
    """
    import pandas as pd
    import glob
    import os
    import re
    from gensim.models import ldamodel
    import helpers.io as pickle_io

    exp_folder = rootFolder(exp)
    exp_kld_files = exp_folder + '/' + 'speeches_dfr_*.pkl'

    # load lda model and BoW
    lda_model_fn = exp_folder + '/lda.model'
    print(f'load lda model from {lda_model_fn}')
    speeches_lda = ldamodel.LdaModel.load(lda_model_fn)

    bow_fn = exp_folder + '/bow.pkl'
    print(f'load bow from {bow_fn}')
    bow = pickle_io.from_pickle(bow_fn)

    # load kld results df
    files = glob.glob(exp_kld_files)
    kld_settings = list()
    frames = list()
    for fn in files:
        print(fn)
        # parse experiment settings from the filename
        m = re.findall(r'(\d+)', os.path.basename(fn))
        topics, iterations, passes, Nw, Tw = m
        if int(Nw) in scales:
            print(f't: {topics}, Nw: {Nw}, Tw: {Tw}')
            kld_settings.append({
                'kld_filename': os.path.basename(fn),
                'topics': int(topics),
                'Nw': int(Nw),
                'Tw': int(Tw),
                'path': fn
            })
            dft = pd.read_pickle(fn)
            dft['kld_filename'] = os.path.basename(fn)
            print(len(dft), ' before trim')
            dft['speech_id'] = dft.apply(
                lambda r: f'{r.speaker}_{r.name:03d}', axis=1)
            if trim > 0:
                frames.append(dft[trim:-trim])
            else:
                frames.append(dft)
    # collect frames and concatenate once (DataFrame.append was removed in
    # pandas 2.0, and appending in a loop copies the frame every iteration)
    df = pd.concat(frames)

    class objectview(object):
        def __init__(self, d):
            self.__dict__ = d

    df.sort_values(by='date', ascending=True, inplace=True)
    df = pd.merge(df, pd.DataFrame(kld_settings), how='left',
                  on='kld_filename', indicator=True)

    # combine all results in return object
    ret = objectview({
        'df': df,
        'settings': pd.DataFrame(kld_settings),
        'model': speeches_lda,
        'bow': bow
    })
    return ret
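
# Usage sketch (an illustration, not part of the module; 'myexp' is a
# placeholder experiment name):
#
#     res = gridExpResult('myexp', scales=[25], trim=10)
#     print(res.settings)               # one row per loaded KLD results file
#     print(len(res.df), 'speeches with KLD results')
#     print(res.model.num_topics, 'topics in the trained LDA model')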
    'name',
    'name_id',
    'party',
    'in_gov',
    'electorate',
    'first_speech',
    'context',
    'context_title',
    'context_type',
    'speech_type',
    # AJ
    'text'
]
column_names_no_text = column_names[:-1]

# %%
speeches_list = pickle_io.from_pickle(fn)

# %%
speeches_df = pd.DataFrame(speeches_list, columns=column_names)

# exclude speeches by the chair - AJ
speeches_df = speeches_df[~speeches_df['name'].str.contains('SPEAKER') &
                          ~speeches_df['name'].str.contains('PRESIDENT')]

size_before = speeches_df.memory_usage(deep=True).sum() / 1e+9

speeches_df.chamber = pd.Categorical(speeches_df.chamber)
speeches_df.parliament = pd.Categorical(speeches_df.parliament.astype(int),
                                        ordered=True)
speeches_df.date_time = pd.to_datetime(speeches_df.date_time, unit='D')
speeches_df.session = pd.Categorical(speeches_df.session.astype(int),
expfolder = args.experiment
num_topics = args.num_topics
iterations = args.iterations
passes = args.passes

print('experiment folder name: ', expfolder)
print('number of topics: ', num_topics)
print('iterations: ', iterations)
print('passes: ', passes)

# %%
root_folder = hansard.rootFolder(expfolder)
print('root_folder: ', root_folder)
bigrams = pickle_io.from_pickle(root_folder + '/bigrams.pkl')

# %%
# This logic has moved to the beginning of the pipeline, before the speeches
# df is saved to file:
# min_speech_len = 10
# bigrams = [b for b in bigrams if len(b) >= min_speech_len]

# %%
speeches_dictionary = process.make_dictionary(bigrams, large_list=True)

# %%
# AJ
# Filter out words that occur in fewer than <no_below> documents, or in more
# than <no_above> x 100% of the documents.
speeches_dictionary.filter_extremes(no_below=200, no_above=0.5)

# %%
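# Optional sanity check (a sketch, not part of the original script): report
# the vocabulary size left after filter_extremes. To measure the reduction,
# capture len(speeches_dictionary) before the call above as well.
print(f'dictionary size after filtering: {len(speeches_dictionary)} tokens')
print('sample tokens:', list(speeches_dictionary.token2id)[:5])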