def main():
    # Entry point of the experiment: load the recipe records from a
    # hard-coded JSON path, preprocess them, split them into train/test
    # parts by index, build the attribute and label maps, and pass three
    # classifiers to generate_save_proba together with both splits.
    train_file = "/Users/phx/downloads/competetion/recipe/train.json"
    with open(train_file) as fh:
        data = json.load(fh)
    print("size of dataset %d" % len(data))

    # NOTE(review): preprocess is applied twice in a row. Kept as-is because
    # its idempotence cannot be confirmed from here -- verify it is intended.
    data = preprocess(preprocess(data))

    # Records whose index is a multiple of 3 go to the test split; all the
    # others go to the training split.
    train_data = []
    test_data = []
    for idx in xrange(len(data)):
        if idx % 3 == 0:
            test_data.append(data[idx])
        else:
            train_data.append(data[idx])

    # The attribute map is built from the training split only.
    attribute_map = getAttributeMap(train_data, 1)
    print('attribute number : %d' % len(attribute_map))
    print(attribute_map)

    # The label map is built from the full data set (train and test records).
    label_map = getLabelMap(data)
    print('label number : %d' % len(label_map))
    print(label_map)

    X, y = getDataSet(train_data, attribute_map, label_map)
    testX, testY = getDataSet(test_data, attribute_map, label_map)

    # Each classifier is created right before it is handed over, so the
    # three runs happen strictly one after another.
    generate_save_proba(SGDClassifier(loss='log'),
                        X, y, testX, testY,
                        "SGDClassifier.loss_log")
    generate_save_proba(MultinomialNB(alpha=0.08, class_prior=None, fit_prior=True),
                        X, y, testX, testY,
                        "MultinomialNB.alpha_0.08")
    generate_save_proba(RandomForestClassifier(n_estimators=500),
                        X, y, testX, testY,
                        "RandomForestClassifier.n_estimators_500")
    """
        else:
            categorized_data[record["cuisine"]].append(record)
    result = []
    for key, value in categorized_data.iteritems():
        # print("%s : %d" % (key, len(value)))
        if len(value) <= 500:
            result.extend(value)
        else:
            result.extend(value[:501])
    return result


# train_data = balanceData(train_data)


# Build the (X, y) pair for the training split with getDataSet.
# NOTE(review): train_data, test_data, attribute_map and label_map are read
# as module-level names here; their definitions are not in this part of the
# file -- confirm they exist at module scope before these lines run.
X, y = getDataSet(train_data, attribute_map, label_map)

# Same construction for the test split.
testX, testY = getDataSet(test_data, attribute_map, label_map)


def benchmark(clf, X, y, testX, testY):
    print("_" * 80)
    print("training")
    print(clf)
    from time import time

    t0 = time()
    clf.fit(X, y)
    print("training time: %0.3fs" % (time() - t0))
    t0 = time()
    pred = clf.predict(testX)