def predict_testdata(cat_info, dataroot, top_n=5):
    """Predict the most probable SKUs for every row of a test CSV file.

    Args:
        cat_info: Mapping from category to a dict holding 'sku_info'
            (forwarded to ``get_test_featuresets``) and 'cls' (a fitted
            classifier exposing ``classes_`` and ``predict_proba``).
        dataroot: Location of the test CSV, handed straight to
            ``pd.read_csv``. The file must contain a 'category' column.
        top_n: Maximum number of SKUs to keep per row. Defaults to 5, the
            value that used to be hard-coded. Expected to be non-negative.

    Returns:
        A list with one entry per CSV row. Each entry is a flat list of
        class labels ordered by decreasing predicted probability; it is
        shorter than ``top_n`` when the classifier knows fewer classes.

    Raises:
        KeyError: If a row's category has no entry (or no 'sku_info') in
            ``cat_info``. The original exception is re-raised so the
            offending key and traceback are preserved.
    """
    testdat = pd.read_csv(dataroot)
    skulist = []
    for i in range(len(testdat)):
        # Slice instead of indexing so that preprocess() still receives a
        # one-row DataFrame rather than a Series.
        dat = testdat.iloc[i:i + 1]
        cat = dat['category'].iloc[0]
        catdat = preprocess(dat)
        try:
            catdic = cat_info[cat]['sku_info']
        except KeyError:
            # Single-argument parenthesised print emits the same text under
            # both Python 2 and Python 3.
            print("Category %s is unseen!" % str(cat))
            raise
        testfsets = get_test_featuresets(catdat, catdic)
        cls = cat_info[cat]['cls']
        yclasses = cls.classes_
        yall = cls.predict_proba(testfsets)
        # Negate so argsort yields column indices from most to least probable.
        ysort = np.argsort(-yall)
        # Slice bounds are clipped by NumPy, so this also covers classifiers
        # with fewer than top_n classes; no IndexError fallback is needed.
        ybest = ysort[:, :top_n]
        yout = yclasses[ybest]
        skulist.append(yout.flatten().tolist())
    return skulist
def main():
    """Train one Naive Bayes model per category and predict the test file.

    Steps, in order:
      1. Load "../data/train.csv" through ``groupByCat`` (presumably a
         mapping from category to that category's rows -- confirm against
         its definition).
      2. For every category, run ``preprocess`` and ``getFeatureSet`` and
         fit a ``MultinomialNB(alpha=0.1)`` on the resulting features/SKUs.
      3. Run ``predict_testdata`` on "../data/test_part.csv" and print the
         resulting SKU lists.
      4. Print the elapsed time measured with ``timeit.default_timer``.
    """
    t_begin = timeit.default_timer()

    print("read train data")
    train_path = "../data/train.csv"
    gcat_dic = groupByCat(train_path)

    # Per-category state: the SKU feature info plus the fitted classifier.
    cat_info = {}
    for cat in gcat_dic.keys():
        catdat = preprocess(gcat_dic[cat])
        sku_info, fset, skus = getFeatureSet(catdat)

        # fset is the feature matrix and skus the matching target column
        # returned by getFeatureSet.
        model = naive_bayes.MultinomialNB(alpha=0.1)
        model.fit(fset, skus)

        cat_info[cat] = {'sku_info': sku_info, 'cls': model}

    print("read test data")
    test_path = "../data/test_part.csv"
    # Alternative left disabled in the original:
    # skulist = predict_testdata_bycat(cat_info, test_path)
    skulist = predict_testdata(cat_info, test_path)
    print(skulist)

    t_end = timeit.default_timer()
    # Joined by hand so the output matches the Python 2 statement
    # `print 'time is', value` on either interpreter.
    print(" ".join(['time is', str(t_end - t_begin)]))