def main(BAVfile, sheet_name, LDAdir, modelName):
    """Build the BAV-vs-LDA comparison tables and write them to Excel.

    Reads the BAV sheet, keeps its ``<Word>_pct`` columns, aligns the result
    with the table returned by ``getLikes.get_likes`` and writes every table
    to ``<LDAdir>/<modelName>_BAV_comp.xlsx``.

    Args:
        BAVfile: Excel file handed to ``pandas.read_excel``; its first column
            becomes the row index and the string ``'NA'`` is read as missing.
        sheet_name: Sheet of ``BAVfile`` to read.
        LDAdir: Directory that holds ``topics_marginal.csv``; it is also
            passed as ``indir`` to the ``getLikes`` helpers and receives the
            output workbook.
        modelName: Model name forwarded to ``getLikes``/``gslib`` and used as
            the prefix of the output file name.

    Returns:
        The tuple ``(LDA_df, BAV_filtered, divs, BrandsInfo, WordsInfo)``.
    """
    BAV_raw = pandas.read_excel(BAVfile, sheet_name, index_col=0,
                                na_values=['NA'])

    # HACK: start at a fixed column offset instead of locating it by name
    # (previously: BAV_raw.columns.get_loc('Brand_Asset_C') + 1).
    # Only names of the form "<Word>_pct" survive; names with more than one
    # underscore (e.g. Total_Prefer_pct) fail the two-part test below.
    idx_first = 1
    good_cols = [
        col for col in BAV_raw.columns[idx_first:]
        if len(col.split("_")) == 2 and col.endswith('pct')
    ]
    BAV_filtered = BAV_raw[good_cols]
    # Keep only the part before the underscore as the column label.
    BAV_filtered.columns = [col.split("_")[0] for col in BAV_filtered.columns]

    # Brand filtering depends on how the dictionary was created: it works if
    # '-' was mapped to a space, but not if '-' was simply dropped.
    # try_drop / rename_row are defined elsewhere in the project.
    BAV_filtered = try_drop(BAV_filtered, 'General Motors (GM)')
    BAV_filtered = try_drop(BAV_filtered, 'Ford Motor Company')
    BAV_filtered = try_drop(BAV_filtered, 'Smart (car)')
    BAV_filtered = try_drop(BAV_filtered, 'Mini Cooper')
    BAV_filtered = rename_row(BAV_filtered, 'Mercedes-Benz', 'Mercedes')
    BAV_filtered = rename_row(BAV_filtered, 'Mitsubishi Vehicles', 'Mitsubishi')
    BAV_filtered = rename_row(BAV_filtered, 'Rolls-Royce', 'Royce')
    BAV_filtered = rename_row(BAV_filtered, 'Aston Martin', 'Aston')
    BAV_filtered = rename_row(BAV_filtered, 'Alfa Romeo', 'Romeo')

    # Column labels are the words, row labels are the brands.
    words = [w.encode() for w in BAV_filtered.columns]
    brands = [b.encode() for b in BAV_filtered.index]

    topicsPs = np.genfromtxt(os.path.join(LDAdir, 'topics_marginal.csv'))

    (LDA_df, BrandsInfo, WordsInfo) = getLikes.get_likes(
        words=words, brands=brands, indir=LDAdir, modelName=modelName)
    (divs, _, _) = getLikes.get_divs(
        words, brands, indir=LDAdir, modelName=modelName,
        topics_marginal_probs=topicsPs)

    # Restrict the BAV table to the columns and rows present in LDA_df.
    BAV_filtered = BAV_filtered[LDA_df.columns]
    # NOTE(review): `.ix` no longer exists in pandas >= 1.0; it is kept here
    # to preserve the original selection behaviour on the pandas version this
    # script targets -- confirm before upgrading.
    BAV_filtered = BAV_filtered.ix[LDA_df.index]

    dirs = gslib.LDAdirs(modelName, LDAdir)
    (_, _, lda) = gslib.loadStuff(dirs)

    probs = getLikes.ptopic_given_word(lda, topicsPs)
    probs_df = pandas.DataFrame(probs, columns=lda.id2word.values())
    # Select the columns for brands then words, and relabel them with the
    # index of the concatenated "IDs" series.
    alls = pandas.concat([BrandsInfo["IDs"], WordsInfo["IDs"]])
    x = probs_df[alls]
    x.columns = alls.index

    writer = pandas.ExcelWriter(
        os.path.join(LDAdir, modelName + '_BAV_comp.xlsx'))
    LDA_df.to_excel(writer, sheet_name='cosine distance')
    BAV_filtered.to_excel(writer, sheet_name='BAV')
    divs.to_excel(writer, sheet_name='KL divs')
    BrandsInfo.to_excel(writer, sheet_name='brands')
    WordsInfo.to_excel(writer, sheet_name='words')
    x.to_excel(writer, sheet_name='p_topic_given_word')
    # The original referenced `writer.save` without calling it, so the
    # workbook was never flushed to disk.
    writer.save()

    return (LDA_df, BAV_filtered, divs, BrandsInfo, WordsInfo)
# Script section: builds the similarity / divergence tables for a brand list
# and a word list, then writes them to "<modelDir>/<modelName>_new.xlsx".
# NOTE(review): `words`, `modelDir`, `modelName` and `topicsPs` are read below
# but are not assigned before use in this section -- they must be defined
# elsewhere; confirm.

# Brand list comes from a hard-coded file path.
brands =getLikes.words_from_file(r"Z:\ermunds\brands.txt")
(divs,_,_) = getLikes.get_divs (words,brands,indir=modelDir, modelName=modelName ,topics_marginal_probs=topicsPs)
(sims,b,w) = getLikes.get_likes(words,brands,indir=modelDir, modelName=modelName )
dirs = gslib.LDAdirs(modelName,modelDir)
# Only the third element (`lda`) of the loaded tuple is used below.
(dict1,_,lda)=gslib.loadStuff(dirs)
brands_df = getLikes.pruneWordsList(brands,lda)
words_df = getLikes.pruneWordsList(words,lda)
probs = getLikes.ptopic_given_word(lda,topicsPs)
probs_df = pd.DataFrame(probs, columns=lda.id2word.values())
# Select the columns for brands then words, and relabel them with the index
# of the concatenated "IDs" series.
alls = pd.concat([ brands_df["IDs"] ,words_df["IDs"]])
x = probs_df[alls]
x.columns = alls.index
# One sheet per table; save() writes the workbook to disk.
writer = pd.ExcelWriter(os.path.join(modelDir,modelName+'_new.xlsx'))
sims.to_excel(writer, sheet_name='cosine distance')
divs.to_excel(writer, sheet_name='KL divs')
b.to_excel(writer, sheet_name='brands')
w.to_excel(writer, sheet_name='words')
x.to_excel(writer, sheet_name='p_topic_given_word')
writer.save()
# NOTE(review): this rebinds `words` to one space-separated string *after* it
# was already used above -- confirm the intended statement order.
words = "Different Distinctive Unique Dynamic Innovative Leader Reliable Arrogant Authentic Carefree Charming Daring Energetic Friendly Fun Glamorous Healthy Helpful Independent Intelligent Kind Obliging Original Prestigious Progressive Restrained Rugged Sensuous Simple Social Straightforward Stylish Traditional Trendy Trustworthy Unapproachable"
def main(BAVfile, sheet_name, LDAdir, modelName):
    """Build the BAV-vs-LDA comparison tables and write them to Excel.

    Reads the BAV sheet, keeps its ``<Word>_pct`` columns, aligns the result
    with the table returned by ``getLikes.get_likes`` and writes every table
    to ``<LDAdir>/<modelName>_BAV_comp.xlsx``.

    Args:
        BAVfile: Excel file handed to ``pandas.read_excel``; its first column
            becomes the row index and the string ``'NA'`` is read as missing.
        sheet_name: Sheet of ``BAVfile`` to read.
        LDAdir: Directory that holds ``topics_marginal.csv``; it is also
            passed as ``indir`` to the ``getLikes`` helpers and receives the
            output workbook.
        modelName: Model name forwarded to ``getLikes``/``gslib`` and used as
            the prefix of the output file name.

    Returns:
        The tuple ``(LDA_df, BAV_filtered, divs, BrandsInfo, WordsInfo)``.
    """
    BAV_raw = pandas.read_excel(BAVfile, sheet_name, index_col=0,
                                na_values=['NA'])

    # HACK: start at a fixed column offset instead of locating it by name
    # (previously: BAV_raw.columns.get_loc('Brand_Asset_C') + 1).
    # Only names of the form "<Word>_pct" survive; names with more than one
    # underscore (e.g. Total_Prefer_pct) fail the two-part test below.
    idx_first = 1
    good_cols = [
        col for col in BAV_raw.columns[idx_first:]
        if len(col.split("_")) == 2 and col.endswith('pct')
    ]
    BAV_filtered = BAV_raw[good_cols]
    # Keep only the part before the underscore as the column label.
    BAV_filtered.columns = [col.split("_")[0] for col in BAV_filtered.columns]

    # Brand filtering depends on how the dictionary was created: it works if
    # '-' was mapped to a space, but not if '-' was simply dropped.
    # try_drop / rename_row are defined elsewhere in the project.
    BAV_filtered = try_drop(BAV_filtered, 'General Motors (GM)')
    BAV_filtered = try_drop(BAV_filtered, 'Ford Motor Company')
    BAV_filtered = try_drop(BAV_filtered, 'Smart (car)')
    BAV_filtered = try_drop(BAV_filtered, 'Mini Cooper')
    BAV_filtered = rename_row(BAV_filtered, 'Mercedes-Benz', 'Mercedes')
    BAV_filtered = rename_row(BAV_filtered, 'Mitsubishi Vehicles', 'Mitsubishi')
    BAV_filtered = rename_row(BAV_filtered, 'Rolls-Royce', 'Royce')
    BAV_filtered = rename_row(BAV_filtered, 'Aston Martin', 'Aston')
    BAV_filtered = rename_row(BAV_filtered, 'Alfa Romeo', 'Romeo')

    # Column labels are the words, row labels are the brands.
    words = [w.encode() for w in BAV_filtered.columns]
    brands = [b.encode() for b in BAV_filtered.index]

    topicsPs = np.genfromtxt(os.path.join(LDAdir, 'topics_marginal.csv'))

    (LDA_df, BrandsInfo, WordsInfo) = getLikes.get_likes(
        words=words, brands=brands, indir=LDAdir, modelName=modelName)
    (divs, _, _) = getLikes.get_divs(
        words, brands, indir=LDAdir, modelName=modelName,
        topics_marginal_probs=topicsPs)

    # Restrict the BAV table to the columns and rows present in LDA_df.
    BAV_filtered = BAV_filtered[LDA_df.columns]
    # NOTE(review): `.ix` no longer exists in pandas >= 1.0; it is kept here
    # to preserve the original selection behaviour on the pandas version this
    # script targets -- confirm before upgrading.
    BAV_filtered = BAV_filtered.ix[LDA_df.index]

    dirs = gslib.LDAdirs(modelName, LDAdir)
    (_, _, lda) = gslib.loadStuff(dirs)

    probs = getLikes.ptopic_given_word(lda, topicsPs)
    probs_df = pandas.DataFrame(probs, columns=lda.id2word.values())
    # Select the columns for brands then words, and relabel them with the
    # index of the concatenated "IDs" series.
    alls = pandas.concat([BrandsInfo["IDs"], WordsInfo["IDs"]])
    x = probs_df[alls]
    x.columns = alls.index

    writer = pandas.ExcelWriter(
        os.path.join(LDAdir, modelName + '_BAV_comp.xlsx'))
    LDA_df.to_excel(writer, sheet_name='cosine distance')
    BAV_filtered.to_excel(writer, sheet_name='BAV')
    divs.to_excel(writer, sheet_name='KL divs')
    BrandsInfo.to_excel(writer, sheet_name='brands')
    WordsInfo.to_excel(writer, sheet_name='words')
    x.to_excel(writer, sheet_name='p_topic_given_word')
    # The original referenced `writer.save` without calling it, so the
    # workbook was never flushed to disk.
    writer.save()

    return (LDA_df, BAV_filtered, divs, BrandsInfo, WordsInfo)
brands, indir=modelDir, modelName=modelName, topics_marginal_probs=topicsPs) (sims, b, w) = getLikes.get_likes(words, brands, indir=modelDir, modelName=modelName) dirs = gslib.LDAdirs(modelName, modelDir) (dict1, _, lda) = gslib.loadStuff(dirs) brands_df = getLikes.pruneWordsList(brands, lda) words_df = getLikes.pruneWordsList(words, lda) probs = getLikes.ptopic_given_word(lda, topicsPs) probs_df = pd.DataFrame(probs, columns=lda.id2word.values()) alls = pd.concat([brands_df["IDs"], words_df["IDs"]]) x = probs_df[alls] x.columns = alls.index writer = pd.ExcelWriter(os.path.join(modelDir, modelName + '_new.xlsx')) sims.to_excel(writer, sheet_name='cosine distance') divs.to_excel(writer, sheet_name='KL divs') b.to_excel(writer, sheet_name='brands') w.to_excel(writer, sheet_name='words') x.to_excel(writer, sheet_name='p_topic_given_word') writer.save() words = "Different Distinctive Unique Dynamic Innovative Leader Reliable Arrogant Authentic Carefree Charming Daring Energetic Friendly Fun Glamorous Healthy Helpful Independent Intelligent Kind Obliging Original Prestigious Progressive Restrained Rugged Sensuous Simple Social Straightforward Stylish Traditional Trendy Trustworthy Unapproachable" words = words.split()