コード例 #1
0
def main(BAVfile,sheet_name,LDAdir,modelName):
    """Build a workbook comparing BAV survey data with LDA-derived tables.

    The BAV sheet is reduced to its ``<word>_pct`` columns, a few brand rows
    are dropped or renamed, and the remaining column/row labels are passed to
    ``getLikes`` as the word and brand lists.  The BAV table is then aligned
    to the rows/columns of the table returned by ``getLikes.get_likes`` and
    everything is written to ``<LDAdir>/<modelName>_BAV_comp.xlsx``.

    Args:
        BAVfile: Excel file (path or anything ``pandas.read_excel`` accepts).
        sheet_name: Sheet of ``BAVfile`` to read.
        LDAdir: Directory holding ``topics_marginal.csv`` and the model
            files; the output workbook is written here as well.
        modelName: Model name handed to ``getLikes`` / ``gslib``; also the
            prefix of the output file name.

    Returns:
        Tuple ``(LDA_df, BAV_filtered, divs, BrandsInfo, WordsInfo)`` - the
        tables written to the 'cosine distance', 'BAV', 'KL divs', 'brands'
        and 'words' sheets respectively.
    """
    # sheet_name stays positional: the keyword was spelled differently in
    # old pandas releases, the position has not changed.
    BAV_raw = pandas.read_excel(BAVfile, sheet_name, index_col=0, na_values=['NA'])

    # HACK: columns are scanned from position 1 onwards.  An earlier variant
    # started right after the 'Brand_Asset_C' column instead.  Only names of
    # the exact form '<word>_pct' are kept, which also discards the trailing,
    # unused 'Total_Prefer_pct' column (it splits into three parts).
    idx_first = 1
    good_cols = [col for col in BAV_raw.columns[idx_first:]
                 if len(col.split("_")) == 2 and col.endswith('pct')]

    BAV_filtered = BAV_raw[good_cols]
    # A concrete list rather than a lazy map object, so the assignment does
    # not depend on how pandas treats iterators.
    BAV_filtered.columns = [col.split("_")[0] for col in BAV_filtered.columns]

    # Brand filtering depends on how the dictionary was created: it works if
    # '-' was turned into a space, and does not if '-' was dropped.
    for brand in ('General Motors (GM)',
                  'Ford Motor Company',
                  'Smart (car)',
                  'Mini Cooper'):
        BAV_filtered = try_drop(BAV_filtered, brand)

    for old_name, new_name in (('Mercedes-Benz', 'Mercedes'),
                               ('Mitsubishi Vehicles', 'Mitsubishi'),
                               ('Rolls-Royce', 'Royce'),
                               ('Aston Martin', 'Aston'),
                               ('Alfa Romeo', 'Romeo')):
        BAV_filtered = rename_row(BAV_filtered, old_name, new_name)

    # NOTE(review): on Python 3 .encode() yields bytes, not str - confirm
    # that getLikes expects that before running under Python 3.
    words = [w.encode() for w in BAV_filtered.columns]
    brands = [b.encode() for b in BAV_filtered.index]

    topicsPs = np.genfromtxt(os.path.join(LDAdir, 'topics_marginal.csv'))
    (LDA_df, BrandsInfo, WordsInfo) = getLikes.get_likes(
        words=words, brands=brands, indir=LDAdir, modelName=modelName)
    (divs, _, _) = getLikes.get_divs(
        words, brands, indir=LDAdir, modelName=modelName,
        topics_marginal_probs=topicsPs)

    # Align the BAV table with the LDA table.  reindex() is the label-based
    # replacement for the removed .ix indexer; labels missing from the BAV
    # table become NaN rows.  Assumes brand labels are strings - TODO confirm.
    BAV_filtered = BAV_filtered[LDA_df.columns]
    BAV_filtered = BAV_filtered.reindex(LDA_df.index)

    dirs = gslib.LDAdirs(modelName, LDAdir)
    (_, _, lda) = gslib.loadStuff(dirs)
    probs = getLikes.ptopic_given_word(lda, topicsPs)
    probs_df = pandas.DataFrame(probs, columns=lda.id2word.values())
    # Pick the columns named by the brand and word IDs, then relabel them
    # with the index of those ID series.
    alls = pandas.concat([BrandsInfo["IDs"], WordsInfo["IDs"]])
    topic_given_word = probs_df[alls]
    topic_given_word.columns = alls.index

    # The original referenced `writer.save` without calling it, so nothing
    # reached the disk; leaving the with-block closes and writes the file.
    out_path = os.path.join(LDAdir, modelName + '_BAV_comp.xlsx')
    with pandas.ExcelWriter(out_path) as writer:
        LDA_df.to_excel(writer, sheet_name='cosine distance')
        BAV_filtered.to_excel(writer, sheet_name='BAV')
        divs.to_excel(writer, sheet_name='KL divs')
        BrandsInfo.to_excel(writer, sheet_name='brands')
        WordsInfo.to_excel(writer, sheet_name='words')
        topic_given_word.to_excel(writer, sheet_name='p_topic_given_word')

    return (LDA_df,BAV_filtered,divs,BrandsInfo,WordsInfo)
コード例 #2
0
import numpy as np
import os
import getLikes
import pandas as pd
import genSimLDAlib as gslib

# Ad-hoc analysis script: loads one LDA model plus word/brand lists and builds
# the tables `divs`, `sims` and `x` at module level.

# Hard-coded location (Z: drive) and name of the model to analyse.
modelDir = r'Z:\ermunds\results\2012 20topics'
modelName = '2012 20topics' 
# Numeric contents of topics_marginal.csv; passed to the getLikes helpers
# below (presumably one marginal probability per topic - TODO confirm).
topicsPs = np.genfromtxt(os.path.join(modelDir,'topics_marginal.csv'))

# Adjective and brand lists read from text files by a getLikes helper.
words = getLikes.words_from_file(r"Z:\ermunds\adjectives.txt")
brands =getLikes.words_from_file(r"Z:\ermunds\brands.txt") 


# Both helpers return 3-tuples.  Only the first element of get_divs is kept;
# `b` and `w` from get_likes are not used again in this excerpt.
(divs,_,_) = getLikes.get_divs (words,brands,indir=modelDir, modelName=modelName ,topics_marginal_probs=topicsPs)
(sims,b,w) = getLikes.get_likes(words,brands,indir=modelDir, modelName=modelName )




# Load the model objects; only the third element (`lda`) is used below.
dirs = gslib.LDAdirs(modelName,modelDir)
(dict1,_,lda)=gslib.loadStuff(dirs)  

# Each result exposes an "IDs" entry that is concatenated below (presumably
# the model-vocabulary id of every retained word - verify in getLikes).
brands_df = getLikes.pruneWordsList(brands,lda)
words_df = getLikes.pruneWordsList(words,lda)

# Table of ptopic_given_word output with one column per lda.id2word value.
probs = getLikes.ptopic_given_word(lda,topicsPs)
probs_df =  pd.DataFrame(probs, columns=lda.id2word.values())
# Select the columns named by the brand IDs followed by the word IDs, then
# relabel those columns with the index of the concatenated ID series.
alls = pd.concat([ brands_df["IDs"] ,words_df["IDs"]])
x = probs_df[alls]
x.columns = alls.index
コード例 #3
0
def main(BAVfile, sheet_name, LDAdir, modelName):
    """Write an Excel workbook comparing BAV survey data with LDA results.

    Steps, in order: read the BAV sheet, keep only the ``<word>_pct``
    columns, drop/rename selected brand rows, call ``getLikes`` with the
    resulting word (column) and brand (row) labels, align the BAV table to
    the table returned by ``getLikes.get_likes``, and save all tables to
    ``<LDAdir>/<modelName>_BAV_comp.xlsx``.

    Args:
        BAVfile: Excel file (path or anything ``pandas.read_excel`` accepts).
        sheet_name: Sheet of ``BAVfile`` to read.
        LDAdir: Directory holding ``topics_marginal.csv`` and the model
            files; the output workbook is written here as well.
        modelName: Model name handed to ``getLikes`` / ``gslib``; also the
            prefix of the output file name.

    Returns:
        Tuple ``(LDA_df, BAV_filtered, divs, BrandsInfo, WordsInfo)`` - the
        tables written to the 'cosine distance', 'BAV', 'KL divs', 'brands'
        and 'words' sheets respectively.
    """
    # sheet_name is passed positionally on purpose: its keyword spelling
    # differs between pandas releases, its position does not.
    BAV_raw = pandas.read_excel(BAVfile,
                                sheet_name,
                                index_col=0,
                                na_values=['NA'])

    # HACK: scanning starts at column position 1 (an earlier variant started
    # right after the 'Brand_Asset_C' column).  Requiring exactly two
    # '_'-separated parts also rejects the trailing, unused
    # 'Total_Prefer_pct' column.
    idx_first = 1
    good_cols = [
        col for col in BAV_raw.columns[idx_first:]
        if len(col.split("_")) == 2 and col.endswith('pct')
    ]

    BAV_filtered = BAV_raw[good_cols]
    # Materialise the new labels as a list instead of assigning a lazy map
    # object to .columns.
    BAV_filtered.columns = [name.split("_")[0] for name in good_cols]

    # Brand filtering depends on how the dictionary was created: it works if
    # '-' was turned into a space, and does not if '-' was dropped.
    dropped_brands = [
        'General Motors (GM)',
        'Ford Motor Company',
        'Smart (car)',
        'Mini Cooper',
    ]
    for brand in dropped_brands:
        BAV_filtered = try_drop(BAV_filtered, brand)

    renamed_brands = [
        ('Mercedes-Benz', 'Mercedes'),
        ('Mitsubishi Vehicles', 'Mitsubishi'),
        ('Rolls-Royce', 'Royce'),
        ('Aston Martin', 'Aston'),
        ('Alfa Romeo', 'Romeo'),
    ]
    for old_name, new_name in renamed_brands:
        BAV_filtered = rename_row(BAV_filtered, old_name, new_name)

    # NOTE(review): on Python 3 .encode() yields bytes, not str - confirm
    # that getLikes expects that before running under Python 3.
    words = [w.encode() for w in BAV_filtered.columns]
    brands = [b.encode() for b in BAV_filtered.index]

    topicsPs = np.genfromtxt(os.path.join(LDAdir, 'topics_marginal.csv'))
    (LDA_df, BrandsInfo, WordsInfo) = getLikes.get_likes(words=words,
                                                         brands=brands,
                                                         indir=LDAdir,
                                                         modelName=modelName)
    (divs, _, _) = getLikes.get_divs(words,
                                     brands,
                                     indir=LDAdir,
                                     modelName=modelName,
                                     topics_marginal_probs=topicsPs)

    # Give the BAV table the same columns and rows as the LDA table.  The
    # removed .ix indexer is replaced by reindex(), which selects by label
    # and yields NaN rows for labels absent from the BAV table.  Assumes the
    # brand labels are strings - TODO confirm.
    BAV_filtered = BAV_filtered[LDA_df.columns]
    BAV_filtered = BAV_filtered.reindex(LDA_df.index)

    dirs = gslib.LDAdirs(modelName, LDAdir)
    (_, _, lda) = gslib.loadStuff(dirs)
    probs = getLikes.ptopic_given_word(lda, topicsPs)
    probs_df = pandas.DataFrame(probs, columns=lda.id2word.values())
    # Columns are picked by the brand IDs followed by the word IDs, then
    # relabelled with the index of that concatenated ID series.
    alls = pandas.concat([BrandsInfo["IDs"], WordsInfo["IDs"]])
    selected_probs = probs_df[alls]
    selected_probs.columns = alls.index

    # `writer.save` used to be referenced without being called, so the
    # workbook was never written; the context manager closes and saves it.
    workbook_path = os.path.join(LDAdir, modelName + '_BAV_comp.xlsx')
    with pandas.ExcelWriter(workbook_path) as writer:
        LDA_df.to_excel(writer, sheet_name='cosine distance')
        BAV_filtered.to_excel(writer, sheet_name='BAV')
        divs.to_excel(writer, sheet_name='KL divs')
        BrandsInfo.to_excel(writer, sheet_name='brands')
        WordsInfo.to_excel(writer, sheet_name='words')
        selected_probs.to_excel(writer, sheet_name='p_topic_given_word')

    return (LDA_df, BAV_filtered, divs, BrandsInfo, WordsInfo)
コード例 #4
0
import genSimLDAlib as gslib

# Ad-hoc analysis script: loads one LDA model plus word/brand lists, builds
# the tables `divs`, `sims` and `x`, and opens an Excel writer for them.

# Hard-coded location (Z: drive) and name of the model to analyse.
modelDir = r'Z:\ermunds\results\2012 20topics'
modelName = '2012 20topics'
# Numeric contents of topics_marginal.csv; passed to the getLikes helpers
# below (presumably one marginal probability per topic - TODO confirm).
topicsPs = np.genfromtxt(os.path.join(modelDir, 'topics_marginal.csv'))

# Adjective and brand lists read from text files by a getLikes helper.
words = getLikes.words_from_file(r"Z:\ermunds\adjectives.txt")
brands = getLikes.words_from_file(r"Z:\ermunds\brands.txt")

# Both helpers return 3-tuples.  Only the first element of get_divs is kept;
# `b` and `w` from get_likes are not used again in this excerpt.
(divs, _, _) = getLikes.get_divs(words,
                                 brands,
                                 indir=modelDir,
                                 modelName=modelName,
                                 topics_marginal_probs=topicsPs)
(sims, b, w) = getLikes.get_likes(words,
                                  brands,
                                  indir=modelDir,
                                  modelName=modelName)

# Load the model objects; only the third element (`lda`) is used below.
dirs = gslib.LDAdirs(modelName, modelDir)
(dict1, _, lda) = gslib.loadStuff(dirs)

# Each result exposes an "IDs" entry that is concatenated below (presumably
# the model-vocabulary id of every retained word - verify in getLikes).
brands_df = getLikes.pruneWordsList(brands, lda)
words_df = getLikes.pruneWordsList(words, lda)

# Table of ptopic_given_word output with one column per lda.id2word value.
probs = getLikes.ptopic_given_word(lda, topicsPs)
probs_df = pd.DataFrame(probs, columns=lda.id2word.values())
# Select the columns named by the brand IDs followed by the word IDs, then
# relabel those columns with the index of the concatenated ID series.
alls = pd.concat([brands_df["IDs"], words_df["IDs"]])
x = probs_df[alls]
x.columns = alls.index

# Writer for '<modelName>_new.xlsx' inside modelDir.  The statements that
# write sheets and close it are not visible in this excerpt.
writer = pd.ExcelWriter(os.path.join(modelDir, modelName + '_new.xlsx'))