import heapq

import numpy
from sklearn.linear_model import LogisticRegression


def train_custom_one_vs_all(X_train, X_test, Y_train, topk):
    """Train one binary logistic-regression classifier per label and return,
    for each test example, the indices of its top-k highest-scoring labels."""

    # convert the label matrix to CSC format for efficient column slicing
    Y_train = Y_train.tocsc()
    num_training, num_classes = Y_train.shape
    num_test_examples = X_test.shape[0]

    # hold an m x k structure of top-k predicted classes per test example,
    # maintained as m heaps of (decision value, class index) pairs
    topk_class_distances = [[] for _ in range(num_test_examples)]

    for j in range(num_classes):
        # train on class label j against all the training examples
        y = numpy.ravel(Y_train.getcol(j).todense())

        clf = LogisticRegression(penalty='l2',
                                 dual=False,
                                 tol=0.0001,
                                 C=0.8,
                                 fit_intercept=True,
                                 intercept_scaling=1)

        clf.fit(X_train, y)
        print("Trained for class", j)

        # get the decision value for all test examples
        decision = clf.densify().decision_function(X_test)

        # for each test example, push its decision value onto the heap of top k decision values
        for i in range(num_test_examples):
            h = topk_class_distances[i]
            if len(h) < topk:
                heapq.heappush(h, (decision[i], j))
            else:
                heapq.heappushpop(h, (decision[i], j))
        print("Predicted for class", j)

    # discard the decision values and keep only the class label indices
    class_label_indices = []
    for i in range(num_test_examples):
        topk_labels = [label for dist, label in topk_class_distances[i]]
        class_label_indices.append(topk_labels)

    return class_label_indices
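
# A minimal usage sketch for train_custom_one_vs_all. The "_demo" names below are
# hypothetical synthetic data, only illustrating the expected inputs: sparse feature
# matrices plus a sparse multilabel indicator matrix with one column per label.
import numpy
import scipy.sparse as sp

rng = numpy.random.RandomState(0)
X_train_demo = sp.csr_matrix(rng.rand(20, 5))                   # 20 training examples, 5 features
X_test_demo = sp.csr_matrix(rng.rand(4, 5))                     # 4 test examples
Y_train_demo = sp.csr_matrix(rng.randint(0, 2, size=(20, 3)))   # 3 candidate labels per example

top_labels_demo = train_custom_one_vs_all(X_train_demo, X_test_demo, Y_train_demo, topk=2)
print(top_labels_demo)  # top-2 label indices per test example (in heap order, not ranked)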

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, max_depth=2,
                             random_state=0)
clf.fit(X_train, y_train)
print("Accuracy:", clf.score(X_test, y_test))

# Model Building
# using logistic regression
print("TRAINING PHASE")
logit = LogisticRegression()
logit.fit(X_train, y_train)
print("Accuracy:", logit.score(X_test, y_test))
print("Coefficients:\n", logit.coef_)
print("Intercept:\n", logit.intercept_)
print(logit.densify())
print(logit.sparsify())
url_list = list(url_list)

print("TESTING PHASE")
X_predict = ["8.8.8.8"]
with open('logit.pickle', 'wb') as handle:
    pickle.dump(logit, handle, protocol=2)
with open('vectorizer.pickle', 'wb') as handle:
    pickle.dump(vectorizer, handle, protocol=2)
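
# A hedged sketch of the reverse step: reload the pickled vectorizer and model and
# score the X_predict example above. This assumes `vectorizer` is a fitted text
# vectorizer (e.g. a CountVectorizer/TfidfVectorizer over the URL strings), which
# is not shown in this snippet; the "loaded_*" names are illustrative.
import pickle

with open('logit.pickle', 'rb') as handle:
    loaded_logit = pickle.load(handle)
with open('vectorizer.pickle', 'rb') as handle:
    loaded_vectorizer = pickle.load(handle)

X_predict_features = loaded_vectorizer.transform(X_predict)
print("Prediction for", X_predict, ":", loaded_logit.predict(X_predict_features))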

clf = LogisticRegression(
    solver='liblinear',  # for use with small datasets
    multi_class='ovr')   # stating this is a binary (one-vs-rest) problem

# training the model
clf.fit(x_train, y_train)

# attributes
classes = clf.classes_  # list of class labels
coeff = clf.coef_  # coefficients of the model
intercept = clf.intercept_  # the intercept of the model
n_iter = clf.n_iter_  # the number of iterations for each class - in the binary case it only returns one value

# now having a look at the methods
dec_func = clf.decision_function(x_test)  # the confidence score for each test example
density = clf.densify()  # returns the coefficient matrix in dense array format
get_param = clf.get_params()  # returns the hyper-parameters
predicted_array = clf.predict(x_test)  # running the test dataset through the model, giving an array of predicted values
predic_log_proba = clf.predict_log_proba(x_test)  # log of the probability estimate for each class
predic_prob = clf.predict_proba(x_test)  # the probability for each class
mean_accuracy = clf.score(x_test, y_test)  # returns the mean accuracy on the test set
sparsify = clf.sparsify()  # returns the coefficient matrix in sparse format

print('The mean accuracy of the test set is: %.3f' % mean_accuracy)

# now finding the confusion matrix for the data
# we first need to convert the 1 and 2 to 'female' and 'male'
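
# A minimal sketch of that conversion and the confusion matrix, assuming y_test and
# predicted_array hold the numeric labels 1 and 2; the mapping dict and variable
# names below are illustrative.
from sklearn.metrics import confusion_matrix

label_names = {1: 'female', 2: 'male'}
y_test_named = [label_names[label] for label in y_test]
predicted_named = [label_names[label] for label in predicted_array]

conf_mat = confusion_matrix(y_test_named, predicted_named, labels=['female', 'male'])
print(conf_mat)  # rows = true labels, columns = predicted labels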