def main(): train_file ="/Users/phx/downloads/competetion/recipe/train.json" with open(train_file) as file: data = json.load(file) print("size of dataset %d" % len(data)) data = preprocess(data) data = preprocess(data) train_data = [data[i] for i in xrange(0,len(data)) if i%3 !=0] test_data = [data[i] for i in xrange(0,len(data)) if i%3 ==0] #test_data= preprocess(test_data) attribute_map = getAttributeMap(train_data,1) print('attribute number : %d' % len(attribute_map)) print(attribute_map) label_map = getLabelMap(data) print('label number : %d' %len(label_map)) print(label_map) X,y = getDataSet(train_data,attribute_map,label_map) testX,testY= getDataSet(test_data,attribute_map,label_map) sgd = SGDClassifier(loss='log') generate_save_proba(sgd,X,y,testX,testY,"SGDClassifier.loss_log") mnb = MultinomialNB(alpha=0.08, class_prior=None, fit_prior=True) generate_save_proba(mnb,X,y,testX,testY,"MultinomialNB.alpha_0.08") rf = RandomForestClassifier(n_estimators=500) generate_save_proba(rf,X,y,testX,testY,"RandomForestClassifier.n_estimators_500") """
else: categorized_data[record["cuisine"]].append(record) result = [] for key, value in categorized_data.iteritems(): # print("%s : %d" % (key, len(value))) if len(value) <= 500: result.extend(value) else: result.extend(value[:501]) return result # train_data = balanceData(train_data) X, y = getDataSet(train_data, attribute_map, label_map) testX, testY = getDataSet(test_data, attribute_map, label_map) def benchmark(clf, X, y, testX, testY): print("_" * 80) print("training") print(clf) from time import time t0 = time() clf.fit(X, y) print("training time: %0.3fs" % (time() - t0)) t0 = time() pred = clf.predict(testX)