# Fragment overview (the statements below are collapsed onto one physical line
# in this view; NOTE(review): restore one statement per line before running,
# because everything after the first `#` is a comment to Python as written).
#
# 1. Party filter: the fragment opens inside a conditional whose header is not
#    visible here.  The first two assignments keep only rows of `term1topN_tf`
#    and `dfbypartyspeaker` whose 'Speaker Party' is in `parties4`; the visible
#    `elif N == 2:` branch does the same with `parties2`.
# 2. Phrase selection: `m.select_phrases_from_df2` is called with the filtered
#    frames and the key columns ['Speaker Party', 'Speaker']; its result is
#    unpacked into `term1_topN_bySpeakerParty` and `topN`.
# 3. `m.make_share` is evaluated twice on that table: once with its defaults
#    (stored as `*_scaled`) and once with `scale=False` (stored as `*_share`).
# 4. Export: the raw, scaled and share tables are written as CSV to the paths
#    at sys.argv[l_in * 2 + 4 + k + (i - 1) * 5] for k = 0, 1, 2; the first path
#    is printed beforehand.  The stride per `i` is 5, so two argv slots per
#    iteration are not used in the visible part (presumably consumed further
#    down -- confirm against the caller).
# 5. The trailing `if len(fixed_phrases) > 8:` starts the "save interim
#    results" cell; only its first statement is visible in this view.
term1topN_tf = term1topN_tf[term1topN_tf['Speaker Party'].isin( parties4)] dfbypartyspeaker = dfbypartyspeaker[ dfbypartyspeaker['Speaker Party'].isin(parties4)] elif N == 2: term1topN_tf = term1topN_tf[term1topN_tf['Speaker Party'].isin( parties2)] dfbypartyspeaker = dfbypartyspeaker[ dfbypartyspeaker['Speaker Party'].isin(parties2)] # %% term1_topN_bySpeakerParty, topN = m.select_phrases_from_df2( dfbypartyspeaker, term1topN_tf, ['Speaker Party', 'Speaker']) # else: # term1_topN_bySpeakerParty, topN = m.select_phrases_from_df2(dfbypartyspeaker,term1topN_tf,['Speaker Party','Speaker']) # %% tfidf top 500 term1_topN_bySpeakerParty_scaled = m.make_share(term1_topN_bySpeakerParty) term1_topN_bySpeakerParty_share = m.make_share(term1_topN_bySpeakerParty, scale=False) # %% save result print(sys.argv[l_in * 2 + 4 + (i - 1) * 5]) # print(sys.argv) term1_topN_bySpeakerParty.to_csv(sys.argv[l_in * 2 + 4 + (i - 1) * 5]) term1_topN_bySpeakerParty_scaled.to_csv(sys.argv[l_in * 2 + 4 + 1 + (i - 1) * 5]) term1_topN_bySpeakerParty_share.to_csv(sys.argv[l_in * 2 + 4 + 2 + (i - 1) * 5]) # %% save interim results if len(fixed_phrases) > 8: term1_tf = pd.DataFrame()
# %% Keep only phrases mentioned at least 100 times (original note: "-> 991 w").
dfoverall5cap100 = dfoverall5.loc[dfoverall5['Counts'] >= 100]

# %% Phrase selection for the cap-20 list (built outside this fragment).
term5_cap20_bySpeakerParty = m.select_phrases_from_df(
    dfbypartyspeaker5,
    dfoverall5cap20,
    ['Speaker Party', 'Speaker'],
)
term5_cap20_bySpeakerParty.shape

# %% Phrase selection for the cap-50 list (built outside this fragment).
term5_cap50_bySpeakerParty = m.select_phrases_from_df(
    dfbypartyspeaker5,
    dfoverall5cap50,
    ['Speaker Party', 'Speaker'],
)
term5_cap50_bySpeakerParty.shape

# %% Phrase selection for the cap-100 list defined above.
term5_cap100_bySpeakerParty = m.select_phrases_from_df(
    dfbypartyspeaker5,
    dfoverall5cap100,
    ['Speaker Party', 'Speaker'],
)
term5_cap100_bySpeakerParty.shape

# Every table below is passed to m.make_share twice: once with the helper's
# defaults (stored as *_scaled) and once with scale=False (stored as *_share).

# %% tfidf top 500
term5_top500_bySpeakerParty_scaled = m.make_share(
    term5_top500_bySpeakerParty
)
term5_top500_bySpeakerParty_share = m.make_share(
    term5_top500_bySpeakerParty, scale=False
)

# %% tfidf top 1000
term5_top1000_bySpeakerParty_scaled = m.make_share(
    term5_top1000_bySpeakerParty
)
term5_top1000_bySpeakerParty_share = m.make_share(
    term5_top1000_bySpeakerParty, scale=False
)

# %% cap20
term5_cap20_scaled = m.make_share(term5_cap20_bySpeakerParty)
term5_cap20_share = m.make_share(term5_cap20_bySpeakerParty, scale=False)

# %% cap50
term5_cap50_scaled = m.make_share(term5_cap50_bySpeakerParty)
term5_cap50_share = m.make_share(term5_cap50_bySpeakerParty, scale=False)

# %% cap 100
term5_cap100_scaled = m.make_share(term5_cap100_bySpeakerParty)
term5_cap100_share = m.make_share(term5_cap100_bySpeakerParty, scale=False)
# %% Restrict to the first `n` parties and drop the excluded phrases.
parties = parties[:n]
dfbypartyspeaker = dfbypartyspeaker[
    dfbypartyspeaker['Speaker Party'].isin(parties)
]
# Rows whose phrase is in pp1 or pp2 are removed (pp2 is presumably the
# procedural-phrase list, per the commented-out loader at the end -- confirm).
# `~` negates the boolean mask in a single vectorised step, and `.copy()`
# makes the result an independent frame so that adding the 'TotalCounts'
# column below does not raise pandas' SettingWithCopyWarning.
dfbypartyspeaker_filt = dfbypartyspeaker[
    ~dfbypartyspeaker['Phrase'].isin(pp1.union(pp2))
].copy()

# %% Keep only phrases whose summed count over the remaining rows is >= N.
dfbypartyspeaker_filt['TotalCounts'] = (
    dfbypartyspeaker_filt.groupby('Phrase')['Counts'].transform('sum')
)
dfbypartyspeaker_filt = dfbypartyspeaker_filt[
    dfbypartyspeaker_filt['TotalCounts'] >= N
]

# %% One row per (party, speaker), one column per phrase; missing cells -> 0.
# NOTE(review): pivot_table combines duplicate (party, speaker, phrase) rows
# with its default aggfunc (mean).  Confirm each combination occurs only once
# in the input, otherwise counts are averaged rather than summed.
dftable = dfbypartyspeaker_filt.pivot_table(
    index=['Speaker Party', 'Speaker'],
    columns='Phrase',
    values='Counts',
)
dftable = dftable.fillna(0)
dftable.reset_index(inplace=True)

# m.make_share is evaluated with its defaults (-> *_scaled) and with
# scale=False (-> *_share).
dftable_scaled = m.make_share(dftable)
dftable_share = m.make_share(dftable, scale=False)

# Output paths come from the command line: three consecutive argv slots per
# value of `i`, starting at index in_len + 3.
out_base = in_len + 3 + i * 3
dftable.to_csv(sys.argv[out_base])
dftable_scaled.to_csv(sys.argv[out_base + 1])
dftable_share.to_csv(sys.argv[out_base + 2])

# dfbypartyspeaker = pd.read_pickle('../../interim/t5_byPartySpeaker.pkl')
# %%
# %%
# pp2 = pd.read_pickle('procedural_phrases_SpSvpDistinct.pkl')
import ast
import sys

import pandas as pd

# The project helpers are imported from a sibling directory, so that directory
# has to be on sys.path before `modules` is imported.
sys.path.append('../../Modules')
import modules as m


def _load_phrases(csv_path):
    """Read a phrase CSV and return its 'Phrase' column parsed by ast.literal_eval."""
    return pd.read_csv(csv_path)['Phrase'].apply(ast.literal_eval)


# Tiny hand-made frame used to exercise m.make_share(scale=False); the result
# is not stored.
d = pd.DataFrame(
    {
        'Speaker': [1, 2],
        'Speaker Party': [2, 2],
        'a': [1, 2],
        'b': [2, 2],
    }
)
d
m.make_share(d, scale=False)

# Counts by party / speaker / term.  NOTE: read_pickle executes arbitrary code
# on load -- only use it on files produced by this project.
dfbypartyspeaker = pd.read_pickle('../../../interim/all_byPartySpeakerTerm.pkl')

# Phrase lists; each CSV stores the phrases as Python literals in 'Phrase'.
phrases = _load_phrases(
    '../../../interim/fixd_P2/phrases_all_terms_tfidf_top500each_P2.csv'
)
phrases4 = _load_phrases(
    '../../../interim/fixd_P4/phrases_all_terms_tfidf_top250each_P4.csv'
)
phrasesALL = _load_phrases(
    '../../../interim/fixed/phrases_all_terms_tfidf_top100each.csv'
)

# Party labels exactly as they appear in the 'Speaker Party' column.
p1 = 'Sozialdemokratische Partei der Schweiz (SP)'
p2 = 'Schweizerische Volkspartei (SVP)'
p3 = 'FDP.Die Liberalen (FDP-Liberale)'
p4 = 'Christlichdemokratische Volkspartei der Schweiz (CVP)'
parties = [p1, p2, p3, p4]
parties2 = [p1, p2]

# Two-party (SP, SVP) and four-party subsets of the speaker table.
_speaker_party = dfbypartyspeaker['Speaker Party']
dfbypartyspeaker2 = dfbypartyspeaker.loc[_speaker_party.isin(parties2)]
dfbypartyspeaker4 = dfbypartyspeaker.loc[_speaker_party.isin(parties)]