Ejemplo n.º 1
0
def main(BAVfile,sheet_name,LDAdir,modelName):
    """Build the BAV-vs-LDA comparison workbook for one model.

    Reads the BAV sheet, keeps its ``<word>_pct`` columns, aligns the result
    with the frame returned by ``getLikes.get_likes`` and writes all tables
    to ``<LDAdir>/<modelName>_BAV_comp.xlsx``.

    Args:
        BAVfile: Excel workbook handed to ``pandas.read_excel``.
        sheet_name: sheet of ``BAVfile`` to read.
        LDAdir: directory containing ``topics_marginal.csv``; it is also
            passed to ``getLikes``/``gslib`` and receives the output file.
        modelName: model name passed to ``getLikes``/``gslib`` and used as
            the prefix of the output file name.

    Returns:
        The tuple ``(LDA_df, BAV_filtered, divs, BrandsInfo, WordsInfo)``.
    """
    BAV_raw = pandas.read_excel(BAVfile, sheet_name, index_col=0, na_values=['NA'])

    # HACK: skip the first column and keep only columns named "<word>_pct"
    # (exactly one underscore). This also discards Total_Prefer_pct, the
    # last, useless column.
    # (previously: idx_first = BAV_raw.columns.get_loc('Brand_Asset_C') + 1)
    idx_first = 1
    good_cols = [col for col in BAV_raw.columns[idx_first:]
                 if len(col.split("_")) == 2 and col.endswith('pct')]

    BAV_filtered = BAV_raw[good_cols]
    # Strip the "_pct" suffix. A list (not a lazy ``map`` object) is used so
    # the assignment gets a concrete sequence on Python 3 as well.
    BAV_filtered.columns = [col.split("_")[0] for col in BAV_filtered.columns]

    # Filter brands - this depends on how the dictionary was created:
    # if '-' was turned into a space this will work,
    # if '-' was dropped then it will not.
    for brand in ('General Motors (GM)', 'Ford Motor Company',
                  'Smart (car)', 'Mini Cooper'):
        BAV_filtered = try_drop(BAV_filtered, brand)

    for old_name, new_name in (('Mercedes-Benz', 'Mercedes'),
                               ('Mitsubishi Vehicles', 'Mitsubishi'),
                               ('Rolls-Royce', 'Royce'),
                               ('Aston Martin', 'Aston'),
                               ('Alfa Romeo', 'Romeo')):
        BAV_filtered = rename_row(BAV_filtered, old_name, new_name)

    words = [w.encode() for w in BAV_filtered.columns]
    brands = [b.encode() for b in BAV_filtered.index]

    topicsPs = np.genfromtxt(os.path.join(LDAdir, 'topics_marginal.csv'))
    (LDA_df, BrandsInfo, WordsInfo) = getLikes.get_likes(
        words=words, brands=brands, indir=LDAdir, modelName=modelName)
    (divs, _, _) = getLikes.get_divs(
        words, brands, indir=LDAdir, modelName=modelName,
        topics_marginal_probs=topicsPs)

    # Align the BAV table with the columns and rows of LDA_df.
    # NOTE(review): ``.ix`` was removed in pandas 1.0; kept as-is because the
    # rest of this file targets the old pandas API.
    BAV_filtered = BAV_filtered[LDA_df.columns]
    BAV_filtered = BAV_filtered.ix[LDA_df.index]

    dirs = gslib.LDAdirs(modelName, LDAdir)
    (_, _, lda) = gslib.loadStuff(dirs)
    probs = getLikes.ptopic_given_word(lda, topicsPs)
    probs_df = pandas.DataFrame(probs, columns=lda.id2word.values())
    # Pick the columns for every brand/word ID and relabel them with the
    # index of the concatenated "IDs" series.
    alls = pandas.concat([BrandsInfo["IDs"], WordsInfo["IDs"]])
    x = probs_df[alls]
    x.columns = alls.index

    writer = pandas.ExcelWriter(os.path.join(LDAdir, modelName + '_BAV_comp.xlsx'))
    LDA_df.to_excel(writer, sheet_name='cosine distance')
    BAV_filtered.to_excel(writer, sheet_name='BAV')
    divs.to_excel(writer, sheet_name='KL divs')
    BrandsInfo.to_excel(writer, sheet_name='brands')
    WordsInfo.to_excel(writer, sheet_name='words')
    x.to_excel(writer, sheet_name='p_topic_given_word')
    # Must be *called*: the bare attribute ``writer.save`` is a no-op and
    # the workbook would never be saved.
    writer.save()
    return (LDA_df, BAV_filtered, divs, BrandsInfo, WordsInfo)
# Script section: exports similarity/divergence tables for a fixed brand list
# to "<modelDir>/<modelName>_new.xlsx".
# NOTE(review): `words`, `modelDir`, `modelName` and `topicsPs` are not
# assigned anywhere above in the visible code (`words` is only assigned at the
# end of this section) - confirm they are defined before this point.

# Brand list read from a hard-coded Windows path.
brands =getLikes.words_from_file(r"Z:\ermunds\brands.txt") 


# divs -> 'KL divs' sheet; sims/b/w -> 'cosine distance'/'brands'/'words' sheets.
(divs,_,_) = getLikes.get_divs (words,brands,indir=modelDir, modelName=modelName ,topics_marginal_probs=topicsPs)
(sims,b,w) = getLikes.get_likes(words,brands,indir=modelDir, modelName=modelName )




# Load the LDA model; only `lda` is used below (`dict1` is unused here).
dirs = gslib.LDAdirs(modelName,modelDir)
(dict1,_,lda)=gslib.loadStuff(dirs)  

# Both results are indexed with ["IDs"] below.
brands_df = getLikes.pruneWordsList(brands,lda)
words_df = getLikes.pruneWordsList(words,lda)

# Build a frame with one column per entry of lda.id2word, then keep only the
# columns for the brand/word IDs and relabel them with the index of `alls`.
probs = getLikes.ptopic_given_word(lda,topicsPs)
probs_df =  pd.DataFrame(probs, columns=lda.id2word.values())
alls = pd.concat([ brands_df["IDs"] ,words_df["IDs"]])
x = probs_df[alls]
x.columns = alls.index


# Write all tables into one workbook, one sheet per table.
writer = pd.ExcelWriter(os.path.join(modelDir,modelName+'_new.xlsx'))
sims.to_excel(writer, sheet_name='cosine distance')
divs.to_excel(writer, sheet_name='KL divs')
b.to_excel(writer, sheet_name='brands')
w.to_excel(writer, sheet_name='words')
x.to_excel(writer, sheet_name='p_topic_given_word')
writer.save()

# Tab-separated attribute words kept as a single string.
# NOTE(review): this assignment comes after `words` is already used above and
# the string is not split in this section - confirm the intended order.
words = "Different	Distinctive	Unique	Dynamic	Innovative	Leader	Reliable	Arrogant	Authentic	Carefree	Charming	Daring	Energetic	Friendly	Fun	Glamorous	Healthy	Helpful	Independent	Intelligent	Kind	Obliging	Original	Prestigious	Progressive	Restrained	Rugged	Sensuous	Simple	Social	Straightforward	Stylish	Traditional	Trendy	Trustworthy	Unapproachable"
Ejemplo n.º 3
0
def main(BAVfile, sheet_name, LDAdir, modelName):
    """Build the BAV-vs-LDA comparison workbook for one model.

    Reads the BAV sheet, keeps its ``<word>_pct`` columns, aligns the result
    with the frame returned by ``getLikes.get_likes`` and writes all tables
    to ``<LDAdir>/<modelName>_BAV_comp.xlsx``.

    Args:
        BAVfile: Excel workbook handed to ``pandas.read_excel``.
        sheet_name: sheet of ``BAVfile`` to read.
        LDAdir: directory containing ``topics_marginal.csv``; it is also
            passed to ``getLikes``/``gslib`` and receives the output file.
        modelName: model name passed to ``getLikes``/``gslib`` and used as
            the prefix of the output file name.

    Returns:
        The tuple ``(LDA_df, BAV_filtered, divs, BrandsInfo, WordsInfo)``.
    """
    BAV_raw = pandas.read_excel(BAVfile,
                                sheet_name,
                                index_col=0,
                                na_values=['NA'])

    # HACK: skip the first column and keep only columns named "<word>_pct"
    # (exactly one underscore). This also discards Total_Prefer_pct, the
    # last, useless column.
    # (previously: idx_first = BAV_raw.columns.get_loc('Brand_Asset_C') + 1)
    idx_first = 1
    good_cols = [
        col for col in BAV_raw.columns[idx_first:]
        if len(col.split("_")) == 2 and col.endswith('pct')
    ]

    BAV_filtered = BAV_raw[good_cols]
    # Strip the "_pct" suffix. A list (not a lazy ``map`` object) is used so
    # the assignment gets a concrete sequence on Python 3 as well.
    BAV_filtered.columns = [col.split("_")[0] for col in BAV_filtered.columns]

    # Filter brands - this depends on how the dictionary was created:
    # if '-' was turned into a space this will work,
    # if '-' was dropped then it will not.
    for brand in ('General Motors (GM)', 'Ford Motor Company', 'Smart (car)',
                  'Mini Cooper'):
        BAV_filtered = try_drop(BAV_filtered, brand)

    for old_name, new_name in (
        ('Mercedes-Benz', 'Mercedes'),
        ('Mitsubishi Vehicles', 'Mitsubishi'),
        ('Rolls-Royce', 'Royce'),
        ('Aston Martin', 'Aston'),
        ('Alfa Romeo', 'Romeo'),
    ):
        BAV_filtered = rename_row(BAV_filtered, old_name, new_name)

    words = [w.encode() for w in BAV_filtered.columns]
    brands = [b.encode() for b in BAV_filtered.index]

    topicsPs = np.genfromtxt(os.path.join(LDAdir, 'topics_marginal.csv'))
    (LDA_df, BrandsInfo, WordsInfo) = getLikes.get_likes(words=words,
                                                         brands=brands,
                                                         indir=LDAdir,
                                                         modelName=modelName)
    (divs, _, _) = getLikes.get_divs(words,
                                     brands,
                                     indir=LDAdir,
                                     modelName=modelName,
                                     topics_marginal_probs=topicsPs)

    # Align the BAV table with the columns and rows of LDA_df.
    # NOTE(review): ``.ix`` was removed in pandas 1.0; kept as-is because the
    # rest of this file targets the old pandas API.
    BAV_filtered = BAV_filtered[LDA_df.columns]
    BAV_filtered = BAV_filtered.ix[LDA_df.index]

    dirs = gslib.LDAdirs(modelName, LDAdir)
    (_, _, lda) = gslib.loadStuff(dirs)
    probs = getLikes.ptopic_given_word(lda, topicsPs)
    probs_df = pandas.DataFrame(probs, columns=lda.id2word.values())
    # Pick the columns for every brand/word ID and relabel them with the
    # index of the concatenated "IDs" series.
    alls = pandas.concat([BrandsInfo["IDs"], WordsInfo["IDs"]])
    x = probs_df[alls]
    x.columns = alls.index

    writer = pandas.ExcelWriter(
        os.path.join(LDAdir, modelName + '_BAV_comp.xlsx'))
    LDA_df.to_excel(writer, sheet_name='cosine distance')
    BAV_filtered.to_excel(writer, sheet_name='BAV')
    divs.to_excel(writer, sheet_name='KL divs')
    BrandsInfo.to_excel(writer, sheet_name='brands')
    WordsInfo.to_excel(writer, sheet_name='words')
    x.to_excel(writer, sheet_name='p_topic_given_word')
    # Must be *called*: the bare attribute ``writer.save`` is a no-op and
    # the workbook would never be saved.
    writer.save()
    return (LDA_df, BAV_filtered, divs, BrandsInfo, WordsInfo)
                                 brands,
                                 indir=modelDir,
                                 modelName=modelName,
                                 topics_marginal_probs=topicsPs)
# Script section: exports similarity/divergence tables to
# "<modelDir>/<modelName>_new.xlsx".
# NOTE(review): `words`, `brands`, `modelDir`, `modelName`, `topicsPs` and
# `divs` are not assigned in the visible code before they are used here
# (`words` is only assigned at the end of this section) - confirm they are
# defined earlier.
(sims, b, w) = getLikes.get_likes(words,
                                  brands,
                                  indir=modelDir,
                                  modelName=modelName)

# Load the LDA model; only `lda` is used below (`dict1` is unused here).
dirs = gslib.LDAdirs(modelName, modelDir)
(dict1, _, lda) = gslib.loadStuff(dirs)

# Both results are indexed with ["IDs"] below.
brands_df = getLikes.pruneWordsList(brands, lda)
words_df = getLikes.pruneWordsList(words, lda)

# Build a frame with one column per entry of lda.id2word, then keep only the
# columns for the brand/word IDs and relabel them with the index of `alls`.
probs = getLikes.ptopic_given_word(lda, topicsPs)
probs_df = pd.DataFrame(probs, columns=lda.id2word.values())
alls = pd.concat([brands_df["IDs"], words_df["IDs"]])
x = probs_df[alls]
x.columns = alls.index

# Write all tables into one workbook, one sheet per table.
writer = pd.ExcelWriter(os.path.join(modelDir, modelName + '_new.xlsx'))
sims.to_excel(writer, sheet_name='cosine distance')
divs.to_excel(writer, sheet_name='KL divs')
b.to_excel(writer, sheet_name='brands')
w.to_excel(writer, sheet_name='words')
x.to_excel(writer, sheet_name='p_topic_given_word')
writer.save()

# Tab-separated attribute words, split into a list on whitespace.
# NOTE(review): this assignment comes after `words` is already used above -
# confirm the intended order.
words = "Different	Distinctive	Unique	Dynamic	Innovative	Leader	Reliable	Arrogant	Authentic	Carefree	Charming	Daring	Energetic	Friendly	Fun	Glamorous	Healthy	Helpful	Independent	Intelligent	Kind	Obliging	Original	Prestigious	Progressive	Restrained	Rugged	Sensuous	Simple	Social	Straightforward	Stylish	Traditional	Trendy	Trustworthy	Unapproachable"
words = words.split()