def main(BAVfile, sheet_name, LDAdir, modelName):
    """Align a BAV spreadsheet with LDA-derived tables and export them to Excel.

    Reads the BAV sheet, keeps the ``<word>_pct`` columns, normalises a few
    brand labels, asks ``getLikes`` for the LDA-based tables for the same
    words and brands, restricts the BAV table to the rows/columns that
    ``getLikes.get_likes`` returned, and writes everything to
    ``<LDAdir>/<modelName>_BAV_comp.xlsx``.

    Args:
        BAVfile: Excel file handed to ``pandas.read_excel``.
        sheet_name: Sheet of ``BAVfile`` to read; its first column becomes
            the row index.
        LDAdir: Directory that holds ``topics_marginal.csv``; it is also
            passed to ``getLikes``/``gslib`` and receives the output workbook.
        modelName: Model name passed to ``getLikes``/``gslib`` and used as
            the prefix of the output file name.

    Returns:
        Tuple ``(LDA_df, BAV_filtered, divs, BrandsInfo, WordsInfo)``.
    """
    BAV_raw = pandas.read_excel(BAVfile, sheet_name, index_col=0,
                                na_values=['NA'])

    # HACK: usable columns are recognised purely by name. Only names of the
    # form "<word>_pct" (exactly one underscore) are kept, so names with more
    # parts, such as 'Total_Prefer_pct', are rejected. The first column is
    # always skipped.
    idx_first = 1
    good_cols = [
        col for col in BAV_raw.columns[idx_first:]
        if len(col.split("_")) == 2 and col.endswith('pct')
    ]
    BAV_filtered = BAV_raw[good_cols]
    # A list (not map()) so the assignment behaves the same on Python 2 and 3.
    BAV_filtered.columns = [col.split("_")[0] for col in BAV_filtered.columns]

    # Brand filtering depends on how the LDA dictionary was built: it works
    # if '-' was turned into a space there, but not if '-' was dropped.
    # NOTE(review): try_drop/rename_row are defined elsewhere; presumably
    # they tolerate a label that is absent -- confirm.
    for brand in ('General Motors (GM)', 'Ford Motor Company',
                  'Smart (car)', 'Mini Cooper'):
        BAV_filtered = try_drop(BAV_filtered, brand)
    for old_label, new_label in (('Mercedes-Benz', 'Mercedes'),
                                 ('Mitsubishi Vehicles', 'Mitsubishi'),
                                 ('Rolls-Royce', 'Royce'),
                                 ('Aston Martin', 'Aston'),
                                 ('Alfa Romeo', 'Romeo')):
        BAV_filtered = rename_row(BAV_filtered, old_label, new_label)

    # getLikes is given encoded labels rather than the raw column/index values.
    words = [w.encode() for w in BAV_filtered.columns]
    brands = [b.encode() for b in BAV_filtered.index]

    topicsPs = np.genfromtxt(os.path.join(LDAdir, 'topics_marginal.csv'))
    (LDA_df, BrandsInfo, WordsInfo) = getLikes.get_likes(
        words=words, brands=brands, indir=LDAdir, modelName=modelName)
    (divs, _, _) = getLikes.get_divs(
        words, brands, indir=LDAdir, modelName=modelName,
        topics_marginal_probs=topicsPs)

    # Restrict/reorder the BAV table to the labels present in LDA_df.
    # .loc replaces the removed .ix indexer for this label-based selection.
    BAV_filtered = BAV_filtered[LDA_df.columns]
    BAV_filtered = BAV_filtered.loc[LDA_df.index]

    dirs = gslib.LDAdirs(modelName, LDAdir)
    (_, _, lda) = gslib.loadStuff(dirs)
    probs = getLikes.ptopic_given_word(lda, topicsPs)
    probs_df = pandas.DataFrame(probs, columns=lda.id2word.values())
    # Select the columns for the brand and word IDs, then relabel them with
    # the index of the concatenated ID series.
    alls = pandas.concat([BrandsInfo["IDs"], WordsInfo["IDs"]])
    x = probs_df[alls]
    x.columns = alls.index

    writer = pandas.ExcelWriter(
        os.path.join(LDAdir, modelName + '_BAV_comp.xlsx'))
    LDA_df.to_excel(writer, sheet_name='cosine distance')
    BAV_filtered.to_excel(writer, sheet_name='BAV')
    divs.to_excel(writer, sheet_name='KL divs')
    BrandsInfo.to_excel(writer, sheet_name='brands')
    WordsInfo.to_excel(writer, sheet_name='words')
    x.to_excel(writer, sheet_name='p_topic_given_word')
    # The original referenced `writer.save` without calling it, so the
    # workbook was never written. close() saves the file and releases it.
    writer.close()

    return (LDA_df, BAV_filtered, divs, BrandsInfo, WordsInfo)
# Exploratory script: loads one LDA model and builds a table of
# p(topic | word) restricted to the brand and adjective vocabularies.
import numpy as np
import os
import getLikes
import pandas as pd
import genSimLDAlib as gslib

# Location and name of the model under inspection.
modelDir = r'Z:\ermunds\results\2012 20topics'
modelName = '2012 20topics'

# Values later passed to getLikes as `topics_marginal_probs`.
topicsPs = np.genfromtxt(os.path.join(modelDir, 'topics_marginal.csv'))

# Vocabulary lists read from plain-text files.
words = getLikes.words_from_file(r"Z:\ermunds\adjectives.txt")
brands = getLikes.words_from_file(r"Z:\ermunds\brands.txt")

divs, _, _ = getLikes.get_divs(
    words,
    brands,
    indir=modelDir,
    modelName=modelName,
    topics_marginal_probs=topicsPs,
)
sims, b, w = getLikes.get_likes(
    words,
    brands,
    indir=modelDir,
    modelName=modelName,
)

# Load the model itself; only the third element of the result is used below.
dirs = gslib.LDAdirs(modelName, modelDir)
dict1, _, lda = gslib.loadStuff(dirs)

brands_df = getLikes.pruneWordsList(brands, lda)
words_df = getLikes.pruneWordsList(words, lda)

# One column per dictionary entry of the model.
probs = getLikes.ptopic_given_word(lda, topicsPs)
probs_df = pd.DataFrame(probs, columns=lda.id2word.values())

# Keep only the brand/word columns and relabel them with the index of the
# concatenated ID series.
alls = pd.concat([brands_df["IDs"], words_df["IDs"]])
x = probs_df[alls]
x.columns = alls.index
def main(BAVfile, sheet_name, LDAdir, modelName):
    """Align a BAV spreadsheet with LDA-derived tables and export them to Excel.

    Reads the BAV sheet, keeps the ``<word>_pct`` columns, normalises a few
    brand labels, asks ``getLikes`` for the LDA-based tables for the same
    words and brands, restricts the BAV table to the rows/columns that
    ``getLikes.get_likes`` returned, and writes everything to
    ``<LDAdir>/<modelName>_BAV_comp.xlsx``.

    Args:
        BAVfile: Excel file handed to ``pandas.read_excel``.
        sheet_name: Sheet of ``BAVfile`` to read; its first column becomes
            the row index.
        LDAdir: Directory that holds ``topics_marginal.csv``; it is also
            passed to ``getLikes``/``gslib`` and receives the output workbook.
        modelName: Model name passed to ``getLikes``/``gslib`` and used as
            the prefix of the output file name.

    Returns:
        Tuple ``(LDA_df, BAV_filtered, divs, BrandsInfo, WordsInfo)``.
    """
    BAV_raw = pandas.read_excel(BAVfile, sheet_name, index_col=0,
                                na_values=['NA'])

    # HACK: usable columns are recognised purely by name. Only names of the
    # form "<word>_pct" (exactly one underscore) are kept, so names with more
    # parts, such as 'Total_Prefer_pct', are rejected. The first column is
    # always skipped.
    idx_first = 1
    good_cols = [
        col for col in BAV_raw.columns[idx_first:]
        if len(col.split("_")) == 2 and col.endswith('pct')
    ]
    BAV_filtered = BAV_raw[good_cols]
    # A list (not map()) so the assignment behaves the same on Python 2 and 3.
    BAV_filtered.columns = [col.split("_")[0] for col in BAV_filtered.columns]

    # Brand filtering depends on how the LDA dictionary was built: it works
    # if '-' was turned into a space there, but not if '-' was dropped.
    # NOTE(review): try_drop/rename_row are defined elsewhere; presumably
    # they tolerate a label that is absent -- confirm.
    for brand in ('General Motors (GM)', 'Ford Motor Company',
                  'Smart (car)', 'Mini Cooper'):
        BAV_filtered = try_drop(BAV_filtered, brand)
    for old_label, new_label in (('Mercedes-Benz', 'Mercedes'),
                                 ('Mitsubishi Vehicles', 'Mitsubishi'),
                                 ('Rolls-Royce', 'Royce'),
                                 ('Aston Martin', 'Aston'),
                                 ('Alfa Romeo', 'Romeo')):
        BAV_filtered = rename_row(BAV_filtered, old_label, new_label)

    # getLikes is given encoded labels rather than the raw column/index values.
    words = [w.encode() for w in BAV_filtered.columns]
    brands = [b.encode() for b in BAV_filtered.index]

    topicsPs = np.genfromtxt(os.path.join(LDAdir, 'topics_marginal.csv'))
    (LDA_df, BrandsInfo, WordsInfo) = getLikes.get_likes(
        words=words, brands=brands, indir=LDAdir, modelName=modelName)
    (divs, _, _) = getLikes.get_divs(
        words, brands, indir=LDAdir, modelName=modelName,
        topics_marginal_probs=topicsPs)

    # Restrict/reorder the BAV table to the labels present in LDA_df.
    # .loc replaces the removed .ix indexer for this label-based selection.
    BAV_filtered = BAV_filtered[LDA_df.columns]
    BAV_filtered = BAV_filtered.loc[LDA_df.index]

    dirs = gslib.LDAdirs(modelName, LDAdir)
    (_, _, lda) = gslib.loadStuff(dirs)
    probs = getLikes.ptopic_given_word(lda, topicsPs)
    probs_df = pandas.DataFrame(probs, columns=lda.id2word.values())
    # Select the columns for the brand and word IDs, then relabel them with
    # the index of the concatenated ID series.
    alls = pandas.concat([BrandsInfo["IDs"], WordsInfo["IDs"]])
    x = probs_df[alls]
    x.columns = alls.index

    writer = pandas.ExcelWriter(
        os.path.join(LDAdir, modelName + '_BAV_comp.xlsx'))
    LDA_df.to_excel(writer, sheet_name='cosine distance')
    BAV_filtered.to_excel(writer, sheet_name='BAV')
    divs.to_excel(writer, sheet_name='KL divs')
    BrandsInfo.to_excel(writer, sheet_name='brands')
    WordsInfo.to_excel(writer, sheet_name='words')
    x.to_excel(writer, sheet_name='p_topic_given_word')
    # The original referenced `writer.save` without calling it, so the
    # workbook was never written. close() saves the file and releases it.
    writer.close()

    return (LDA_df, BAV_filtered, divs, BrandsInfo, WordsInfo)
import genSimLDAlib as gslib

# Location and name of the model under inspection.
modelDir = r'Z:\ermunds\results\2012 20topics'
modelName = '2012 20topics'

# Values later passed to getLikes as `topics_marginal_probs`.
topicsPs = np.genfromtxt(os.path.join(modelDir, 'topics_marginal.csv'))

# Vocabulary lists read from plain-text files.
words = getLikes.words_from_file(r"Z:\ermunds\adjectives.txt")
brands = getLikes.words_from_file(r"Z:\ermunds\brands.txt")

divs, _, _ = getLikes.get_divs(
    words,
    brands,
    indir=modelDir,
    modelName=modelName,
    topics_marginal_probs=topicsPs,
)
sims, b, w = getLikes.get_likes(
    words,
    brands,
    indir=modelDir,
    modelName=modelName,
)

# Load the model itself; only the third element of the result is used below.
dirs = gslib.LDAdirs(modelName, modelDir)
dict1, _, lda = gslib.loadStuff(dirs)

brands_df = getLikes.pruneWordsList(brands, lda)
words_df = getLikes.pruneWordsList(words, lda)

# One column per dictionary entry of the model.
probs = getLikes.ptopic_given_word(lda, topicsPs)
probs_df = pd.DataFrame(probs, columns=lda.id2word.values())

# Keep only the brand/word columns and relabel them with the index of the
# concatenated ID series.
alls = pd.concat([brands_df["IDs"], words_df["IDs"]])
x = probs_df[alls]
x.columns = alls.index

# Output workbook, created next to the model files.
writer = pd.ExcelWriter(os.path.join(modelDir, modelName + '_new.xlsx'))