def test(X, y, learned_params):
    
    N = np.shape(X)[0] #no of instances
    X = np.append(np.ones((N,1)), X,1) #appending a column of ones as bias (used in logistic regression weights prediction)
    F = np.shape(X)[1] #no of features+1
    
    
    class_prob = []
    for w in learned_params.keys():
        prob = Utils.logistic_transformation(learned_params[w], X)
        class_prob.append(prob)
    
    max_prob = np.max(class_prob, 0)
    
    predicted_y = []
    output_label = range(min_class_label, max_class_label+1)
    for i in xrange(np.size(max_prob)):
            class_label = np.where(class_prob == max_prob[i])[0]
            predicted_y.append(output_label[class_label[0]])
    
    print "predicted y :", predicted_y
    print "Actual y:", y
    accuracy = Utils.calculate_accuracy(np.array(y), np.array(predicted_y))
    print "accuracy for test data :", accuracy
    f_score_mean, f_score_std = Utils.calculate_average_F1score(np.array(y), np.array(predicted_y), min_class_label, max_class_label)
    print "Average f score for test data :", f_score_mean
    
    error_rate = Utils.calculate_error_rate(np.array(y), np.array(predicted_y))
    #ch = stdin.read(1)
    return (accuracy, f_score_mean, f_score_std, error_rate)
Exemple #2
0
 def Estep(x, w, a, b):
     """E-step: return p*a / (p*a + (1-p)*b), evaluated through logarithms.

     p is Utils.logistic_transformation(w, x).  a and b are the per-instance
     factors supplied by the caller (presumably the expert-label likelihoods
     under the positive / negative hypothesis -- confirm against
     Utils.a_calculations / Utils.b_calculations).
     """
     prob_pos = Utils.logistic_transformation(w, x)
     # Numerator and denominator are kept in log space and only
     # exponentiated once at the end.
     log_numerator = np.log(prob_pos) + np.log(a)
     log_denominator = np.log(prob_pos*a + (1-prob_pos)*b)
     return np.exp(log_numerator - log_denominator)
Exemple #3
0
def train(training_data, set_id):
    EM_result = {}
    Utils.initPlot( len(no_of_experts), set_id)
    for ex in xrange(len(no_of_experts)):
        #generate expert #self.Training_instances = self.N - self.Testing_instanceswrong percentage
        #60% good:40 % bad
        print "For group ", ex
        expert_wrong_percentage = []
        for i in xrange(int(no_of_experts[ex]*expert_bads)):
            #bads
            num = 0.90 #((random()%0.5) + 0.5) % 1.0
            expert_wrong_percentage.append(num)
        for i in xrange(int(no_of_experts[ex]*expert_goods)):
            #goods
            num = 0.20 #random()%0.5
            expert_wrong_percentage.append(num)                        
        crowds_EM = None    
        failed = 0
        iterations = 0
        total_iter = 10
        while iterations < total_iter :
            try:
                crowds_EM = Crowds_EM( training_data, min_class_label, max_class_label, expert_wrong_percentage, verbose= verbose_output, synthetic=synthetic_data)
                crowds_EM.run_EM_missing()
            except Exception,e:
                #Rerunning ...
                import traceback
                print traceback.print_exc()
                failed+=1
                try:
                    crowds_EM = Crowds_EM( training_data, min_class_label, max_class_label, expert_wrong_percentage, verbose= verbose_output, synthetic = synthetic_data)
                    crowds_EM.run_EM_missing()
                except Exception,e:
                    failed+=1
                    pass
                else :
                    EM_result[ex] = crowds_EM.results
                    break
            else :
                EM_result[ex] = crowds_EM.results
                break
            #print "iteration :" , iterations
            iterations+=1
def logistic_regression(x,y,beta_start=None,verbose=False,CONV_THRESH=1.e-3,
                        MAXIT=500):
    """
 Uses the Newton-Raphson algorithm to calculate maximum
 likliehood estimates of a logistic regression.

 Can handle multivariate case (more than one predictor).

 x - 2-d array of predictors. Number of predictors = x.shape[0]=N
 y - binary outcomes (len(y) = x.shape[1])
 beta_start - initial beta vector (default zeros(N+1,x.dtype)
 if verbose=True, diagnostics printed for each iteration.
 MAXIT - max number of iterations (default 500)
 CONV_THRESH - convergence threshold (sum of absolute differences
  of beta-beta_old)

 returns beta (the logistic regression coefficients, a N+1 element vector),
 J_bar (the (N+1)x(N=1) information matrix), and l (the log-likeliehood).
 J_bar can be used to estimate the covariance matrix and the standard
 error beta.
 l can be used for a chi-squared significance test.

 covmat = inverse(J_bar)     --> covariance matrix
 stderr = sqrt(diag(covmat)) --> standard errors for beta
 deviance = -2l              --> scaled deviance statistic
 chi-squared value for -2l is the model chi-squared test.
    """
    if x.shape[-1] != len(y):
        raise ValueError, "x.shape[-1] and y should be the same length!"
    try:
        N, npreds = x.shape[1], x.shape[0]
    except: # single predictor, use simple logistic regression routine.
        N, npreds = x.shape[-1], 1
        return simple_logistic_regression(x,y,beta_start=beta_start,
               CONV_THRESH=CONV_THRESH,MAXIT=MAXIT,verbose=verbose)
    if beta_start is None:
        beta_start = np.zeros(npreds+1,x.dtype)
    X = np.ones((npreds+1,N), x.dtype)
    X[1:, :] = x
    Xt = np.transpose(X)

    iter = 0; diff = 1.; beta = beta_start  # initial values
    l = np.sum( y * -np.logaddexp(0, -1 * np.dot(beta, X)) + (1-y) * -np.logaddexp(0, 1 * np.dot(beta, X)))

    if verbose:
        print 'Logistic Regression : '
        print 'iteration  beta log-likliehood |log-log_old|'
    try:
        while iter < MAXIT:
            beta_old = beta
            l_old = l
            #ebx = np.exp(np.dot(beta, X))
            p = Utils.logistic_transformation(beta.T, X.T)
            p = p.T
            #p = ebx/(1.+ebx)
            #l = np.sum(y*np.log(p) + (1.-y)*np.log(1.-p)) # log-likeliehood
            #l = np.sum( y * -np.logaddexp(0, -1 * np.dot(beta, X)) + (1-y) * -np.logaddexp(0, 1 * np.dot(beta, X)))
            s = np.dot(X, y-p)                            # scoring function
            J_bar = np.dot(X*np.multiply(p,1.-p),Xt)      # information matrix
            #beta = beta_old + np.dot(np.linalg.inv(J_bar),s) # new value of beta
            beta = beta_old + invertAdotB(J_bar, s)
            #diff = np.sum(np.fabs(beta-beta_old)) # sum of absolute differences
            l = np.sum( y * -np.logaddexp(0, -1 * np.dot(beta, X)) + (1-y) * -np.logaddexp(0, 1 * np.dot(beta, X)))

            diff = np.sum(np.fabs(l - l_old))
            if verbose:
                print iter+1, beta, l, diff
            if diff <= CONV_THRESH and l>l_old: break
            iter = iter + 1
        if iter == MAXIT and diff > CONV_THRESH:
            print 'warning: convergence not achieved with threshold of %s in %s iterations' % (CONV_THRESH,MAXIT)
        return beta #, J_bar, l
    except Exception, e:
        #print "beta", beta
        #print "J_bar", J_bar
        #print "s", s
        #import traceback
        #print traceback.print_exc()
        raise
Exemple #5
0
    def run_EM_missing(self):
        """Run the EM loop once per class label, re-filling missing expert labels.

        For every class_no in [min_class_label, max_class_label] the method
        obtains y_observed / experty_observed from binary_y_experty, then
        iterates E-step / M-step until the log-likelihood change is at most
        CONV_THRESH (and the likelihood did not decrease) or MAXITER
        iterations have run.

        Writes, per class_no, into self.results: 'weights_mv', 'weights_at'
        (both fitted on the first iteration), 'weights', 'alpha', 'beta'.
        Returns nothing.  Any exception is re-raised unchanged.
        """
        try:
            for class_no in range(self.min_class_label, self.max_class_label+1):
                y_observed, experty_observed = self.binary_y_experty(class_no)
                #random initializations for this class label
                weights = np.random.random(self.F)
                alpha = np.random.random(self.E)  #expert sensitivity
                beta = np.random.random(self.E)   #expert specificity
                l =0
                # NOTE: 'iter' shadows the builtin of the same name inside this method.
                iter = 0
                while iter < self.MAXITER:
                    # First iteration
                    if not iter:
                        l_old = 0
                        expertcombined = np.array([])
                        
                        # Replace entries equal to -1 (treated as missing, see
                        # missing_ids below) with a random label.
                        # NOTE(review): randrange is evaluated once per expert, so
                        # all of that expert's -1 entries receive the same value,
                        # and it is drawn from the full class-label range --
                        # confirm this is intended.
                        for e in xrange(self.E):
                            experty_observed[e][experty_observed[e] == -1] = randrange(self.min_class_label,self.max_class_label+1)
                            #self.Training_experty[e] = self.experty[e][:self.Training_instances]
  
                        
                        # Concatenate every expert's labels into one flat array ...
                        for e in experty_observed:
                            expertcombined = np.append(expertcombined,experty_observed[e], axis=0)
                        
                        
                        
                        # ... and reshape to one row per expert, one column per instance.
                        expertcombined = np.reshape(expertcombined, (self.E, self.Training_instances))
                        # Initial estimate: per-instance mean of the expert labels.
                        y_predicted = np.average( expertcombined, axis=0)
                        # Despite the name this is the averaged label, not a vote count.
                        y_majority_voting = y_predicted.copy()
                        #acc_MV = np.size(np.where((y_majority_voting.round())==y_observed))/float(self.Training_instances)
                        # Reference classifiers: one fitted on the averaged expert
                        # labels, one on y_observed.  Training_x[:,1:] drops the first
                        # column (presumably the bias column, which logistic_regression
                        # adds itself -- confirm).
                        self.results['weights_mv'][class_no] = NR.logistic_regression(self.Training_x[:,1:].T,np.asarray(y_majority_voting).reshape(-1),verbose=False, MAXIT=10000)
                        self.results['weights_at'][class_no] = NR.logistic_regression(self.Training_x[:,1:].T,np.asarray(y_observed).reshape(-1),verbose=False, MAXIT=10000)

                    else :
                        # Keep the previous iteration's estimates for the E-step.
                        l_old = l
                        w_old = weights
                        alpha_old = alpha
                        beta_old = beta
    
                        experty_learnt = self.learn_experty_missing(alpha_old, beta_old, y_observed)

                        # Overwrite the positions that are -1 in the original
                        # self.experty with the freshly learnt values.
                        for e in experty_observed:
                            # np.where returns a tuple of index arrays; the loop
                            # below iterates over that tuple.
                            missing_ids = np.where(self.experty[e] == -1)
                            for m in missing_ids:
                                experty_observed[e][m] = experty_learnt[e][m]
                        #print "experty :"
                        #pprint(experty_observed)
                        a = Utils.a_calculations(alpha_old, experty_observed,self.y_shape)
                        b = Utils.b_calculations(beta_old, experty_observed, self.y_shape)
                        # E-step
                        y_predicted = EM.Estep(self.Training_x, w_old, a, b)
                        # Flatten to 1-d before handing it to the M-step.
                        y_predicted = np.asarray(y_predicted).reshape(-1)
    
                    # M-step
                    weights, alpha, beta = EM.Mstep(self.Training_x, y_predicted, experty_observed)
                    # Recompute a and b with the updated alpha / beta for the likelihood.
                    a = Utils.a_calculations(alpha, experty_observed, self.y_shape)
                    b = Utils.b_calculations(beta, experty_observed, self.y_shape)
    
                    l = self.calculate_loglikelihood(y_predicted, weights, a, b)
                    #acc_EM = np.size(np.where(y_observed==y_predicted.round()))/float(self.Training_instances)
                    diff =  np.fabs(l-l_old)
                    # Stop once the likelihood change is small and it did not decrease.
                    if diff <= self.CONV_THRESH and l>=l_old : break
                    iter = iter+1
                    if self.verbose:
                        print "EM algorithm :","diff:",diff,"log:", l, "iteration:", iter
    
                # Store the final estimates; alpha and beta are rounded to one decimal.
                self.results['weights'][class_no] = weights
                self.results['alpha'][class_no] = alpha.round(1)
                self.results['beta'][class_no] = beta.round(1)
                # The string literal below is disabled code (extra metrics and a
                # plot); it is evaluated as an expression and discarded.
                """self.results['loglikelihood'][class_no] = l
                self.results['EM_perf']['f1_Score'][class_no] = Utils.calculate_F1score(y_observed, y_predicted)
                self.results['MV_perf']['f1_Score'][class_no] = Utils.calculate_F1score(y_observed, y_majority_voting)
                self.results['EM_perf']['rmse'][class_no] = Utils.calculate_RMSE(y_observed, y_predicted)
                self.results['MV_perf']['rmse'][class_no] = Utils.calculate_RMSE(y_observed, y_majority_voting)
                self.results['experty'] [class_no] = experty_observed
                fig = plt.figure()
                ax = fig.add_subplot(111)
                ax.set_title('class :' + str(class_no))
                ax.set_ylim(-1,2)
                ax.plot(y_observed,'ro-',y_predicted,'-b.')"""
                
                if self.verbose:
                    print "alphacap :"
                    pprint (alpha.round(1))
                    print "betacap :"
                    pprint (beta.round(1))
                    print "weights :"
                    pprint (weights)
    
    
                    # NOTE(review): within this method the 'EM_perf' / 'MV_perf'
                    # f1_Score entries are only assigned inside the disabled block
                    # above -- confirm they are populated elsewhere, otherwise these
                    # lookups fail in verbose mode.
                    print "f1_Score of EM approach :"
                    print self.results['EM_perf']['f1_Score'][class_no]
    
                    print "f1_Score of majority voting approach :"
                    print self.results['MV_perf']['f1_Score'][class_no]
    
                    print "y"
                    print y_observed
                    print "y maj"
                    print y_majority_voting.round(2)
                    print "y pred"
                    print y_predicted.round(2)
    
                    print "Expert wrong percentage"
                    print self.expert_wrong_percentage
    
                    print '--'*30
     
        # This handler only re-raises; it does not alter the exception.
        except Exception, e:
            raise
Exemple #6
0
    def run(self):
        """Run the EM loop once per class label.

        For every class_no in [min_class_label, max_class_label] the method
        obtains y_observed / experty_observed from binary_y_experty, fits
        three reference classifiers on the first iteration, then alternates
        E-step / M-step until the log-likelihood change is at most
        CONV_THRESH (and the likelihood did not decrease) or MAXITER
        iterations have run.

        Writes, per class_no, into self.results: 'weights_avg', 'weights_mv',
        'weights_at', 'weights', 'alpha', 'beta'.  Returns nothing.  Any
        exception is re-raised unchanged.
        """
        try:
            for class_no in range(self.min_class_label, self.max_class_label+1):
                y_observed, experty_observed = self.binary_y_experty(class_no)
                #random initializations for this class label
                weights = np.random.random(self.F)
                alpha = np.random.random(self.E)  #expert sensitivity
                beta = np.random.random(self.E)   #expert specificity
                
                l =0
                # NOTE: 'iter' shadows the builtin of the same name inside this method.
                iter = 0
                while iter < self.MAXITER:
                    # First iteration
                    if not iter:
                        l_old = 0
                        expertcombined = np.array([])
                        # Concatenate every expert's labels into one flat array ...
                        for e in experty_observed:
                            expertcombined = np.append(expertcombined,experty_observed[e], axis=0)
    
                        # ... and reshape to one row per expert, one column per instance.
                        expertcombined = np.reshape(expertcombined, (self.E, self.Training_instances))
                        # Per-instance mean of the expert labels.
                        y_average = np.average( expertcombined, axis=0)
                        
                        
                        # Column-major flatten followed by a row-major reshape: the
                        # result is the transpose, i.e. one row per instance holding
                        # that instance's label from every expert.
                        mv_expert_combined =  np.reshape(expertcombined,expertcombined.size,order='F').reshape(np.shape(expertcombined)[1],np.shape(expertcombined)[0])
                        y_predicted = np.array([])
                        
                        # Majority vote per instance: the most frequent label;
                        # argmax returns the smallest label on ties.
                        for emv in mv_expert_combined:
                            y_predicted = np.append(y_predicted, np.bincount(emv.astype(int)).argmax())
                        
                        y_predicted = y_predicted.astype(float)
                        """
                        Classifier with MV as input
                        """
                        # Reference classifiers fitted on the averaged labels, the
                        # majority-vote labels and y_observed.  Training_x[:,1:] drops
                        # the first column (presumably the bias column, which
                        # logistic_regression adds itself -- confirm).
                        self.results['weights_avg'][class_no] = NR.logistic_regression(self.Training_x[:,1:].T,np.asarray(y_average).reshape(-1),verbose=False, MAXIT=10000)
                        self.results['weights_mv'][class_no] = NR.logistic_regression(self.Training_x[:,1:].T,np.asarray(y_predicted).reshape(-1),verbose=False, MAXIT=10000)
                        self.results['weights_at'][class_no] = NR.logistic_regression(self.Training_x[:,1:].T,np.asarray(y_observed).reshape(-1),verbose=False, MAXIT=10000)
                        #acc_MV = np.size(np.where((y_majority_voting.round())==y_observed))/float(self.Training_instances)
                    else :
                        # Keep the previous iteration's estimates for the E-step.
                        l_old = l
                        w_old = weights
                        alpha_old = alpha
                        beta_old = beta
    
                        a = Utils.a_calculations(alpha_old, experty_observed,self.y_shape)
                        b = Utils.b_calculations(beta_old, experty_observed, self.y_shape)
                        # E-step
                        y_predicted = EM.Estep(self.Training_x, w_old, a, b)
                        # Flatten to 1-d before handing it to the M-step.
                        y_predicted = np.asarray(y_predicted).reshape(-1)
    
                    # M-step (on the first iteration y_predicted is the majority vote)
                    weights, alpha, beta = EM.Mstep(self.Training_x, y_predicted, experty_observed)
                    # Recompute a and b with the updated alpha / beta for the likelihood.
                    a = Utils.a_calculations(alpha, experty_observed, self.y_shape)
                    b = Utils.b_calculations(beta, experty_observed, self.y_shape)
    
                    l = self.calculate_loglikelihood(y_predicted, weights, a, b)
                    #acc_EM = np.size(np.where(y_observed==y_predicted.round()))/float(self.Training_instances)
                    diff =  np.fabs(l-l_old)
                    # Stop once the likelihood change is small and it did not decrease.
                    if diff <= self.CONV_THRESH and l>=l_old : break
                    iter = iter+1
                    if self.verbose:
                        print "EM algorithm :","diff:",diff,"log:", l, "iteration:", iter
    
                # Store the final estimates; alpha and beta are rounded to one decimal.
                self.results['weights'][class_no] = weights
                self.results['alpha'][class_no] = alpha.round(1)
                self.results['beta'][class_no] = beta.round(1)

                
                if self.verbose:
                    print "alphacap :"
                    pprint (alpha.round(1))
                    print "betacap :"
                    pprint (beta.round(1))
                    print "weights :"
                    pprint (weights)
    
    
                    # Disabled f1-score printing, kept as a discarded string literal.
                    """print "f1_Score of EM approach :"
                    print self.results['EM_perf']['f1_Score'][class_no]
    
                    print "f1_Score of majority voting approach :"
                    print self.results['MV_perf']['f1_Score'][class_no]"""
    
                    print "y"
                    print y_observed
                    # The "y maj" label prints the averaged expert labels (y_average).
                    print "y maj"
                    print y_average.round(2)
                    print "y pred"
                    print y_predicted.round(2)
    
                    print "Expert wrong percentage"
                    print self.expert_wrong_percentage
    
                    print '--'*30
 
        # This handler only re-raises; it does not alter the exception.
        except Exception, e:
            raise
Exemple #7
0
                        
    print "Final results :\n"
    pprint(EM_result)
    #for e in xrange(len(no_of_experts)):
    #    Utils.visualize( EM_result[e], min_class_label, max_class_label, no_of_experts[e] )   
    return EM_result
    

def test(test_data):
    """Print a "Test data:" header followed by a pretty-printed dump of test_data."""
    print "Test data:"
    pprint (test_data)
    
    
# Script entry point, executed at import time.
# The k-fold cross-validation run is currently disabled:
#k_fold_cross_validation()
# Train on the module-level `data`, passing 0 as the set_id argument.
train(data, 0)
# Presumably displays the figure prepared via Utils.initPlot -- confirm in Utils.
Utils.showPlot()
        
"""else:
            EM_perf = crowds_EM.predict_EM(crowds_EM.x, crowds_EM.y)
            print "EM perf ", EM_perf 
            EM_acc += EM_perf
            if EM_perf > EM_highest_performance['accuracy']:
                EM_highest_performance['accuracy'] = EM_perf
                EM_highest_performance['results'] = crowds_EM.results
                
            MV_acc += crowds_EM.predict_MV(crowds_EM.x, crowds_EM.y)
            #np.save('X.npy', crowds_EM.x)"""
 

"""print "No. of failed iterations : ", failed    
print "Average EM accuracy after ", iterations," iter : ", EM_acc/(total_iter-failed)