def main(): train_file ="/Users/phx/downloads/competetion/recipe/train.json" with open(train_file) as file: data = json.load(file) print("size of dataset %d" % len(data)) data = preprocess(data) data = preprocess(data) train_data = [data[i] for i in xrange(0,len(data)) if i%3 !=0] test_data = [data[i] for i in xrange(0,len(data)) if i%3 ==0] #test_data= preprocess(test_data) attribute_map = getAttributeMap(train_data,1) print('attribute number : %d' % len(attribute_map)) print(attribute_map) label_map = getLabelMap(data) print('label number : %d' %len(label_map)) print(label_map) X,y = getDataSet(train_data,attribute_map,label_map) testX,testY= getDataSet(test_data,attribute_map,label_map) sgd = SGDClassifier(loss='log') generate_save_proba(sgd,X,y,testX,testY,"SGDClassifier.loss_log") mnb = MultinomialNB(alpha=0.08, class_prior=None, fit_prior=True) generate_save_proba(mnb,X,y,testX,testY,"MultinomialNB.alpha_0.08") rf = RandomForestClassifier(n_estimators=500) generate_save_proba(rf,X,y,testX,testY,"RandomForestClassifier.n_estimators_500") """
print("size of dataset %d" % len(data)) data = preprocess(data) train_data = [data[i] for i in xrange(0, len(data)) if i % 3 != 0] test_data = [data[i] for i in xrange(0, len(data)) if i % 3 == 0] # test_data= preprocess(test_data) attribute_map = getAttributeMap(train_data, 1) print("attribute number : %d" % len(attribute_map)) print(attribute_map) label_map = getLabelMap(data) print("label number : %d" % len(label_map)) print(label_map) def balanceData(data): categorized_data = {} for record in data: if not categorized_data.get(record["cuisine"]): categorized_data[record["cuisine"]] = [record] else: categorized_data[record["cuisine"]].append(record) result = [] for key, value in categorized_data.iteritems(): # print("%s : %d" % (key, len(value))) if len(value) <= 500: