def main():
    train_file ="/Users/phx/downloads/competetion/recipe/train.json"
    with open(train_file) as file:
        data = json.load(file)
    print("size of dataset %d" % len(data))

    data = preprocess(data)
    data = preprocess(data)

    train_data = [data[i] for i in xrange(0,len(data)) if i%3 !=0]
    test_data = [data[i] for i in xrange(0,len(data)) if i%3 ==0]

    #test_data= preprocess(test_data)

    attribute_map = getAttributeMap(train_data,1)

    print('attribute number : %d' % len(attribute_map))
    print(attribute_map)


    label_map = getLabelMap(data)
    print('label number : %d' %len(label_map))
    print(label_map)
    X,y = getDataSet(train_data,attribute_map,label_map)
    testX,testY= getDataSet(test_data,attribute_map,label_map)
    sgd = SGDClassifier(loss='log')
    generate_save_proba(sgd,X,y,testX,testY,"SGDClassifier.loss_log")
    mnb = MultinomialNB(alpha=0.08, class_prior=None, fit_prior=True)
    generate_save_proba(mnb,X,y,testX,testY,"MultinomialNB.alpha_0.08")
    rf = RandomForestClassifier(n_estimators=500)
    generate_save_proba(rf,X,y,testX,testY,"RandomForestClassifier.n_estimators_500")
    """
# Report how many records were loaded.
print("size of dataset %d" % len(data))

# Run the shared preprocessing step over the whole dataset before splitting,
# so the training and held-out subsets both go through it exactly once.
data = preprocess(data)

# Deterministic split by position: every third record (index 0, 3, 6, ...) is
# held out for testing and the remaining two thirds are used for training.
# enumerate() avoids the manual index loop and the Python-2-only xrange().
train_data = [record for index, record in enumerate(data) if index % 3 != 0]
test_data = [record for index, record in enumerate(data) if index % 3 == 0]

# No separate preprocess(test_data) pass is needed here: preprocess() already
# ran on the full dataset above, before the split.

# The attribute map is built from the training split only.
# NOTE(review): the second argument (1) is interpreted by getAttributeMap;
# presumably a frequency threshold -- confirm against its definition.
attribute_map = getAttributeMap(train_data, 1)

print("attribute number : %d" % len(attribute_map))
print(attribute_map)


# The label map is built from the full dataset rather than the training split.
label_map = getLabelMap(data)
print("label number : %d" % len(label_map))
print(label_map)


def balanceData(data):
    categorized_data = {}
    for record in data:
        if not categorized_data.get(record["cuisine"]):
            categorized_data[record["cuisine"]] = [record]
        else:
            categorized_data[record["cuisine"]].append(record)
    result = []
    for key, value in categorized_data.iteritems():
        # print("%s : %d" % (key, len(value)))
        if len(value) <= 500: