def test_hard_vote():
    """Evaluate majority (hard) voting over the base classifiers.

    Each base classifier (bag-of-words, direct-attribute) produces per-class
    probability DataFrames via ``get_proba``; every classifier votes with its
    argmax class and the most common vote wins.  Prints the accuracy against
    the held-out test labels.
    """
    X, y, test_X, test_Y = get_test_data()

    print("bag of words")
    bow = BagOfWordsClassifier()
    bow_probs = bow.get_proba(X, y, test_X, prefix="t")

    print("direct attribute")
    da = DirectAttributeClassifier()
    da_probs = da.get_proba(X, y, test_X, prefix="t")

    # Each get_proba() result is a sequence of (train_prob, test_prob) pairs;
    # regrouping with zip(*...) yields one tuple of train frames and one of
    # test frames.  list() is required so the result is subscriptable on
    # Python 3, where zip() returns a lazy iterator.
    probs = list(zip(*[item for p in [bow_probs, da_probs] for item in p]))
    # train_probs = probs[0]  # not needed for hard voting
    test_probs = probs[1]
    print(len(test_probs))

    # One predicted (label-encoded) class per sample from every frame.
    preds = [frame.idxmax(1) for frame in test_probs]
    pred = np.zeros(len(preds[0]), dtype=np.int8)
    print(len(pred))
    for i in range(len(preds[0])):
        votes = [p[i] for p in preds]
        print(votes)
        # Majority vote; ties are broken by max() over the (unordered) set,
        # matching the original behaviour.
        pred[i] = max(set(votes), key=votes.count)
        print(pred[i])

    # Map encoded labels back to the original class names before scoring.
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(y)
    pred = le.inverse_transform(pred)

    print(metrics.accuracy_score(test_Y, pred))

    """
def predict():
    X,y,test_X,ids =get_predict_data()
    print("bag of words")
    bow = BagOfWordsClassifier()
    bow_probs = bow.get_proba(X,y,test_X,prefix="p_")

    print("direct attribute")
    da = DirectAttributeClassifier()
    da_probs = da.get_proba(X,y,test_X,prefix="p_")

    probs = zip(*[item for p in [bow_probs,da_probs] for item in p])
    train_probs = probs[0]
    test_probs = probs[1]
    print(len(train_probs))
    for prob in train_probs:
        print(prob.shape)
        print(type(prob))

    train_attr = pd.concat(train_probs,axis=1)
    print(train_attr.shape)
    print(type(train_attr))

    test_attr = pd.concat(test_probs,axis=1)
    print(test_attr.shape)
    print(type(test_attr))

    clf = LogisticRegression()
    clf.fit(train_attr,y)
    pred=clf.predict(test_attr)
    result = pd.DataFrame({'id':ids,'cuisine':pred})
    result[['id','cuisine']].to_csv("av_submission.csv",index=False,cols=["id","cuisine"],engine='python')
def test_vote_soft():
    X,y,test_X,test_Y =get_test_data()

    print("bag of words")
    bow = BagOfWordsClassifier()
    bow_probs = bow.get_proba(X,y,test_X,prefix="t")

    print("direct attribute")
    da = DirectAttributeClassifier()
    da_probs = da.get_proba(X,y,test_X,prefix="t")

    probs = zip(*[item for p in [bow_probs,da_probs] for item in p])
    train_probs = probs[0]
    test_probs = probs[1]
    print(len(train_probs))
    for prob in train_probs:
        print(prob.shape)
        print(type(prob))
    #train_attr = reduce(lambda a,b:a+b,train_probs)
    test_attr = reduce(lambda a,b:a+b,test_probs)

    pred = test_attr.idxmax(1)
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(y)
    pred = le.inverse_transform(pred)

    print(metrics.accuracy_score(test_Y,pred))
def test():
    X,y,test_X,test_Y =get_test_data()

    print("bag of words")
    bow = BagOfWordsClassifier()
    bow_probs = bow.get_proba(X,y,test_X,prefix="t")

    print("direct attribute")
    da = DirectAttributeClassifier()
    da_probs = da.get_proba(X,y,test_X,prefix="t")

    probs = zip(*[item for p in [bow_probs,da_probs] for item in p])
    train_probs = probs[0]
    test_probs = probs[1]
    print(len(train_probs))
    for prob in train_probs:
        print(prob.shape)
        print(type(prob))

    train_attr = pd.concat(train_probs,axis=1)
    print(train_attr.shape)
    print(type(train_attr))

    test_attr = pd.concat(test_probs,axis=1)
    print(test_attr.shape)
    print(type(test_attr))


    #clf = LogisticRegression()
    #clf = svm.SVC()
    """
    params={'kernel':('rbf','linear','poly','sigmoid'),'C':[1,10]}
    clf = grid_search.GridSearchCV(svm.SVC(),params,cv=5)

    params={'penalty':('l1','l2'),'C':[1,10]}
    clf = grid_search.GridSearchCV(LogisticRegression(),params,cv=5)
    """
    #clf = SGDClassifier(loss="log")

    """
    params = {'loss':['hinge','log','modified_huber','squared_hinge','perceptron'],
              'penalty':['l1','l2','elasticnet'],
              'alpha':[0.0001,0.001,0.01,0.1]}
    clf = grid_search.GridSearchCV(SGDClassifier(),params,cv=5)
    """
    """
    clf.fit(train_attr,y)

    #print(clf.best_params_)
    pred = clf.predict(test_attr)
    print(clf)
    print(metrics.accuracy_score(test_Y,pred))

    """
    """
    clf = RandomForestClassifier(n_estimators=50)
    benchmark(clf,train_attr,y,test_attr,test_Y)
    """
    """
    clf = GradientBoostingClassifier(n_estimators=50)
    benchmark(clf,train_attr,y,test_attr,test_Y)


    clf=DecisionTreeClassifier()

    benchmark(clf,train_attr,y,test_attr,test_Y)
    """
    """
    clf = AdaBoostClassifier(base_estimator=SGDClassifier(loss="log"))
    benchmark(clf,train_attr,y,test_attr,test_Y)


    clf = BaggingClassifier(LogisticRegression())
    benchmark(clf,train_attr,y,test_attr,test_Y)

    clf = LogisticRegression()
    benchmark(clf,train_attr,y,test_attr,test_Y)


    clf = SGDClassifier(loss="log")
    benchmark(clf,train_attr,y,test_attr,test_Y)

    clf=Perceptron()
    benchmark(clf,train_attr,y,test_attr,test_Y)
    clf =GaussianNB()
    benchmark(clf,train_attr,y,test_attr,test_Y)

    """
    """