Ejemplo n.º 1
0
def merge_att(prd):
    """Left-join the pre-computed attribute CSVs onto the product table.

    Three CSV files are read from ``hd.path_sub`` and merged onto ``prd``
    by ``pid`` in turn. Missing values in the colour/power/material and
    size/weight columns are replaced with ``''`` so they can be split into
    token lists. A whitespace-split token column (original name plus a
    trailing underscore) is then added for the brand and for each
    attribute column.

    Parameters
    ----------
    prd : pandas.DataFrame
        Product table; it must contain a ``pid`` column. A ``p_brand``
        column must be present once the merges are done (presumably it
        comes from ``01att_brand.csv`` -- TODO confirm).

    Returns
    -------
    pandas.DataFrame
        A new frame (the result of the merges); the caller's frame is not
        modified.
    """
    # (file name, columns whose missing values are replaced by '')
    sources = [
        ("01att_brand.csv", []),
        ("02att_color_power_material.csv",
         ['att_color', 'att_power', 'att_material']),
        ("03att_size_weight.csv", ['att_size', 'att_weight']),
    ]

    text_columns = []
    for infile, fill_columns in sources:
        df = pd.read_csv(hd.path_sub + infile)
        prd = pd.merge(prd, df, on='pid', how='left')
        for column in fill_columns:
            # Assign the filled column back explicitly: an in-place fillna
            # on a column accessor is chained assignment and does not
            # reliably update the frame.
            prd[column] = prd[column].fillna('')
        text_columns.extend(fill_columns)

    # The brand is stemmed before tokenising; the other attributes are
    # split as stored.
    prd['p_brand_'] = prd.p_brand.map(lambda x: hd.str_stem(x)).str.split()
    for column in text_columns:
        prd[column + '_'] = prd[column].str.split()

    return prd
Ejemplo n.º 2
0
def get_att_sub(base, name):
    """Attach one product attribute, as a list of stemmed tokens, to ``base``.

    The attribute table from ``hd.load_att()`` is restricted to rows whose
    ``name`` column equals ``name`` and whose product id occurs in
    ``base.pid``. Each value is stemmed with ``hd.str_stem`` and split on
    whitespace. When a product has several rows for the attribute, their
    token lists are concatenated in row order so that each product ends up
    with exactly one row.

    Parameters
    ----------
    base : pandas.DataFrame
        Frame with a ``pid`` column.
    name : str
        Attribute name to select; it is also the name of the added column.

    Returns
    -------
    pandas.DataFrame
        ``base`` left-merged with the attribute column on ``pid``.

    Notes
    -----
    Assumes ``hd.load_att()`` returns a ``name`` column plus exactly two
    other columns (product id, value), in that order -- TODO confirm.
    """
    att = hd.load_att()
    att = att[att.name == name].drop('name', axis=1)
    # A flat list of labels; a nested list would be read as the levels of
    # a MultiIndex instead of plain column names.
    att.columns = ['pid', name]
    # Work on an independent copy so the column assignment below is not
    # made on a slice of the loaded table.
    att = att[att.pid.isin(base.pid.tolist())].copy()
    att[name] = att[name].map(lambda x: hd.str_stem(x)).str.split()

    repeated = att.pid.duplicated(keep=False)
    if repeated.any():
        # Concatenate every token list of a repeated pid exactly once,
        # regardless of how many rows the pid has.
        combined = {}
        for pid, tokens in zip(att.pid[repeated], att[name][repeated]):
            combined[pid] = combined.get(pid, []) + tokens
        att = att.drop_duplicates(subset='pid', keep='last').copy()
        att[name] = pd.Series(
            [combined.get(pid, tokens)
             for pid, tokens in zip(att.pid, att[name])],
            index=att.index)
    return pd.merge(base, att, on='pid', how='left')
Ejemplo n.º 3
0
# Choose the data directories for the current platform.
if os.name != 'nt':
    # Mac
    path_org = "/Users/Kazuki/home-depot/ORG//"
    path_sub = "/Users/Kazuki/home-depot/SUB//"
else:
    # Windows: backslashes are escaped explicitly so the literals do not
    # rely on unrecognised escape sequences such as "\C" or "\K".
    path_org = "D:\\COMPE\\KAGGLE\\home-depot\\ORG\\"
    path_sub = "D:\\COMPE\\KAGGLE\\home-depot\\SUB\\"

infile = "train.csv"
train = pd.read_csv(path_org + infile)
# Original columns:
# ['id', 'product_uid', 'product_title', 'search_term', 'relevance']
# Use a flat list of labels; a nested list would be read as MultiIndex
# levels rather than plain column names.
train.columns = ['id', 'pid', 't', 's', 'r']

# Stemmed, whitespace-split token lists for the title and the search term.
train['t_'] = train.t.map(lambda x: hd.str_stem(x)).str.split()
train['s_'] = train.s.map(lambda x: hd.str_stem(x)).str.split()

#==============================================================================
#
#==============================================================================
# Frequency table built from the search-term tokens by hd.mk_freq_table,
# converted to a DataFrame.
s_table = hd.mk_freq_table(train.s_)

s_table = s_table.to_frame()

li = []
for i in s_table.index:
    sum_r = 0
    cnt = 0
    for j in train.index:
        if i in train.s_[j]:
Ejemplo n.º 4
0
# --- Search-term side -------------------------------------------------------
# Candidate item words for queries, read from column '0' of the CSV.
items = pd.read_csv(hd.path_sub + '06sort_items_query.csv',
                    index_col=0)['0'].tolist()
train['s_item_myrule'] = train.s_.map(lambda x: hd.get_sitem(x))
train['s_item_list'] = train.s_.map(
    lambda x: hd.get_item_list_in_list(items, x))
# Prefer the list-based item; fall back to the rule-based one when the
# list lookup produced ''. Built element-wise and assigned as a whole
# column instead of writing through ``Series.values``, which is not
# guaranteed to be a writable view and assumed a 0..n-1 index.
train['s_item'] = [
    rule_item if list_item == '' else list_item
    for list_item, rule_item in zip(train['s_item_list'],
                                    train['s_item_myrule'])
]

# --- Product-title side -----------------------------------------------------
# Candidate item words for titles, read from column '0' of the CSV.
items = pd.read_csv(hd.path_sub + '06sort_items_title.csv',
                    index_col=0)['0'].tolist()
prd['t_item_myrule'] = hd.hdp.pred_item_in_title(prd)
prd['t_item_myrule'] = prd.t_item_myrule.map(lambda x: hd.str_stem(x))
prd['t_item_list'] = prd.t_.map(lambda x: hd.get_item_list_in_list(items, x))
# Same fallback rule as for the search terms.
prd['t_item'] = [
    rule_item if list_item == '' else list_item
    for list_item, rule_item in zip(prd['t_item_list'],
                                    prd['t_item_myrule'])
]
merged = pd.merge(train, prd, on='pid', how='left')
#==============================================================================
#
#==============================================================================
#merged = merged[['s_','t_','r']]
#
#def del_num(li):
#    li = [x for x in li if x.isalpha()]
Ejemplo n.º 5
0
# Original columns: ['id', 'product_uid', 'product_title', 'search_term']
# Use a flat list of labels; a nested list would be read as MultiIndex
# levels rather than plain column names.
test.columns = ['id', 'pid', 't', 's']

# train + test, with a fresh 0..n-1 index
tt = pd.concat([train, test])
tt.reset_index(drop=True, inplace=True)

# product table: one row per pid, without the query-specific columns
prd = pd.concat([train, test])
prd = prd.drop_duplicates(subset='pid')
prd.drop(['id', 's', 'r'], axis=1, inplace=True)

# ============================
#     SEARCH TERMS FEATURES
# ============================
# print() with a single argument behaves the same on Python 2 and 3.
print('========== SEARCH ==========')
tt['slen'] = tt.s.str.len()
tt['sfix'] = tt.s.map(lambda x: hd.fix_typo_google(x))
tt['s_'] = tt.sfix.map(lambda x: hd.str_stem(x)).str.split()
tt['s_'] = tt.s_.map(lambda x: hd.fix_typo(x))
# typo is 1 when the corrected token list differs from the token list of
# the raw search term, else 0.
tt['typo'] = 1 - (tt.s.map(lambda x: hd.str_stem(x)).str.split() == tt.s_) * 1
#    tt['s_'] = tt.s_.map(lambda x:del_ng(x))
tt['s_len'] = tt.s_.str.len()
del tt['t']
# Drop the names of the frames that are not used below.
del test
del train
del desc

# ============================
#     PRODUCT FEATURES
# ============================
print('========== PRODUCT ==========')
prd = prd.sort_values(by='pid')
# TITLE
prd.reset_index(drop=True, inplace=True)
Ejemplo n.º 6
0
# Flat lists of labels are used for every column rename below; a nested
# list would be read as MultiIndex levels rather than plain column names.
desc.columns = ['pid', 'd']

infile = "test.csv"
test = pd.read_csv(path_org + infile)
# Original columns: ['id', 'product_uid', 'product_title', 'search_term']
test.columns = ['id', 'pid', 't', 's']

# product table: one row per pid, without the query-specific columns
prd = pd.concat([train, test])
del train
del test
prd = prd.drop_duplicates(subset='pid')
prd.drop(['id', 's', 'r'], axis=1, inplace=True)

prd = prd.sort_values(by='pid')
# TITLE: stemmed, whitespace-split title tokens
prd.loc[:, 't_'] = prd.t.map(lambda x: hd.str_stem(x)).str.split()
prd.reset_index(drop=True, inplace=True)

# Select the brand attribute rows for the products in prd.
name = 'MFG Brand Name'
pid = prd.pid.tolist()
att = hd.load_att()
att = att[att.name == name]
del att['name']
att.columns = ['pid', name]
# Copy the filtered rows so the column assignment below is not made on a
# slice of the loaded attribute table.
att = att[att.pid.isin(pid)].copy()
att[name] = att[name].map(lambda x: hd.str_stem(x))

prd = pd.merge(prd, att, on='pid', how='left')

# The merge can repeat a pid when it has several brand rows; keep the
# first row of each.
prd = prd.drop_duplicates(subset='pid')
prd.reset_index(drop=True, inplace=True)
        
    if prd['att_material'][i] != -1:
        mate += prd['att_material'][i]
        
    li_color.append(' '.join(color))
    li_power.append(' '.join(power))
    li_mate.append(' '.join(mate))
    
# Store the joined strings collected in li_color / li_power / li_mate as
# new columns of prd.
for column, values in (('att_color', li_color),
                       ('att_power', li_power),
                       ('att_material', li_mate)):
    prd[column] = values

# Remove the listed raw attribute columns from prd in place.
raw_attribute_columns = [
    'Color', 'Color Family', 'Color/Finish', 'Finish', 'Finish Family',
    'Mount Type', 'Fuel Type', 'Power Type', 'Bulb Type',
]
prd.drop(raw_attribute_columns, axis=1, inplace=True)

# Only the material text is stemmed.
prd['att_material'] = prd['att_material'].map(hd.str_stem)

# Hand the selected columns to hd.out under the given name (presumably
# written as a CSV -- TODO confirm).
output_columns = ['pid', 'att_color', 'att_power', 'att_material']
hd.out(prd[output_columns], '02att_color_power_material')












Ejemplo n.º 8
0
@author: Kazuki
"""

import homedepot as hd  #comment out syn_tbl
import numpy as np
import pandas as pd
import hdpath as hdp

# Build both tables from the training data and hand them to hd.out.
tt, prd = hd.load_all(onlytrain=True, mk_csv=True)

hd.out(tt, 'tt')
hd.out(prd, 'prd')

# Read both tables back from the second directory returned by hdp.load().
po, ps = hdp.load()

# Rebuild the token list of the typo-fixed search term: stem, split on
# whitespace, then run hd.fix_typo on each token list.
tt = pd.read_csv(ps + 'tt.csv')
tt['s_'] = tt['sfix'].map(hd.str_stem).str.split().map(hd.fix_typo)

# Rebuild the stemmed, whitespace-split title tokens.
prd = pd.read_csv(ps + 'prd.csv')
prd['t_'] = prd['t'].map(hd.str_stem).str.split()


def is_contain_w(li, w):
    """Return True if ``w`` is contained in ``li``, otherwise False.

    ``li`` may be any object that supports the ``in`` operator (for
    example a list of tokens).
    """
    return w in li


def flaten_list(lili):
    li = []
    for i in lili:
        for j in i: