예제 #1
0
def _print_rank_correlations(tag, scores, labels):
   """
   Print Kendall tau, Spearman and Pearson correlations between scores and labels.

   tag is the set name placed at the start of the printed line ('TEST' or 'DEV'
   when called from PerformCV). scores and labels are transposed before being
   handed to numpy.corrcoef, and entry [0,1] of the result is reported.
   """
   kendall = stats.kendalltau(scores,labels)[0]
   spearman = stats.spearmanr(scores,labels)[0]
   pearson = numpy.corrcoef(scores.T,labels.T)[0,1]
   print('%s: Kendall Tau: %f, Spearman correlation: %f, Pearson correlation: %f'
         % (tag, kendall, spearman, pearson))


def PerformCV(qid_file,diff_feat_dir,feat_file,labels_file_dir,cnt_noisy_labels_file,batch_size=5):
   """
   Cross-validate one ranker per noisy annotator and print correlation results.

   The unique qids are cut into consecutive batches of batch_size. For fold i,
   batch i is the test set, the next batch (wrapping around) is the dev set and
   all remaining qids form the train set. For every annotator a weight vector is
   trained with SVRanker3.svr_optimization on the difference features of the
   train qids, then scored on the test and dev rows of feat_file. The summed
   scores over all annotators are reported under 'Borda count results'.

   Parameters:
   qid_file: text file with one qid per row; rows are matched by position with
      the rows of feat_file and of the labels file.
   diff_feat_dir: directory holding noisy_labels<k>/<qid>.features CSV files,
      with k running from 1 to cnt_noisy_labels_file.
   feat_file: comma separated feature file used for the test and dev sets.
   labels_file_dir: path of the comma separated true labels file. Despite the
      name it is read as a file. The original code read an undefined name
      'labels_file' here.
   cnt_noisy_labels_file: number of noisy annotators.
   batch_size: number of unique qids per batch; it must divide the number of
      unique qids, otherwise a message is printed and the function returns.

   Returns None; all results are printed.
   """

   qids = numpy.genfromtxt(qid_file)
   qids_unique = numpy.unique(qids)

   features = numpy.genfromtxt(feat_file,delimiter=',')
   true_labels = numpy.genfromtxt(labels_file_dir,delimiter=',')

   if numpy.remainder(len(qids_unique),batch_size):
      print("Please provide a split that divides number of unique qids")
      return

   # floor division keeps this an integer on Python 3 as well as Python 2
   num_batches = len(qids_unique) // batch_size

   # fixed training hyper-parameters handed to SVRanker3.svr_optimization
   n_epochs, learning_rate, lambda_w = 2400, .02, .001

   for i in range(num_batches):
      # Determine the qids in test, dev and train sets
      test_batch_qids = qids_unique[batch_size*i:batch_size*(i+1)]
      dev_id = (i+1) % num_batches
      dev_batch_qids = qids_unique[batch_size*dev_id:batch_size*(dev_id+1)]
      train_batch_qids = numpy.setdiff1d(qids_unique,numpy.union1d(test_batch_qids,dev_batch_qids))

      # rows of the feature/label files that belong to the test and dev qids
      test_mask = numpy.in1d(qids,test_batch_qids)
      dev_mask = numpy.in1d(qids,dev_batch_qids)
      test_features = features[test_mask,:]
      test_labels = numpy.matrix(true_labels[test_mask]).T
      dev_features = features[dev_mask,:]
      dev_labels = numpy.matrix(true_labels[dev_mask]).T

      # train one weight vector per annotator on that annotator's diff features
      w_per_annt = []
      print('training model for iter ... %d' % (i))
      for noisy_annt_id in range(cnt_noisy_labels_file):
         print('at noisy annotor id: %d' % (noisy_annt_id))
         # the empty seed keeps the (0, n_features) shape when there are no train qids
         diff_blocks = [numpy.empty([0,test_features.shape[1]])]
         for train_batch_qid in train_batch_qids:
            feature_diff_file = diff_feat_dir + '/noisy_labels' + str(noisy_annt_id+1) + '/' + str(int(train_batch_qid)) + '.features'
            diff_blocks.append(numpy.genfromtxt(feature_diff_file,delimiter=','))
         train_diff_features = numpy.vstack(diff_blocks)

         w = numpy.ones((1,1+train_diff_features.shape[1])) # initial w
         w = SVRanker3.svr_optimization(train_diff_features,w,learning_rate,n_epochs,lambda_w)
         w_per_annt.append(w)

      print('Model Trained.')
      print('Results:')
      # perform unweighted fusion to get results on test and dev set
      # a column of ones is appended because w has one more entry than there are features
      test_features_ext = numpy.hstack((test_features,numpy.ones((test_features.shape[0],1))))
      dev_features_ext = numpy.hstack((dev_features,numpy.ones((dev_features.shape[0],1))))
      test_scores = 0
      dev_scores = 0
      for noisy_annt_id in range(cnt_noisy_labels_file):
         w = w_per_annt[noisy_annt_id]

         cur_test_scores = numpy.dot(test_features_ext,w.T)
         print('Annotator specific results for annotator %d' % (noisy_annt_id))
         _print_rank_correlations('TEST',cur_test_scores,test_labels)
         test_scores = test_scores + cur_test_scores

         cur_dev_scores = numpy.dot(dev_features_ext,w.T)
         _print_rank_correlations('DEV',cur_dev_scores,dev_labels)
         dev_scores = dev_scores + cur_dev_scores

      print('')
      print('Borda count results')
      _print_rank_correlations('TEST',test_scores,test_labels)
      _print_rank_correlations('DEV',dev_scores,dev_labels)
예제 #2
0
def TrainOnAll(qid_file, diff_feat_dir, feat_file, true_labels_file, noisy_labels_dir, batch_size, count_annts):
   """
   Train on all qids and print train-set results for several label fusion methods.

   The difference features of every qid are stacked together with the noisy
   preference labels of each annotator. Models are then trained with
   TrainEM.TrainModel and TrainEMRelEst.TrainModel, a majority vote over the
   annotators is computed, and one SVRanker3 model is trained per annotator.
   The normalised per-annotator weight vectors are summed and reported under
   'Borda count Results'. Everything is printed; nothing is returned.

   Parameters:
   qid_file: the qid file location (read as integers).
   diff_feat_dir: directory where the diff features are stored. Files are read
      from <diff_feat_dir>/labels/<qid>.features and from
      <diff_feat_dir>/noisy_labels<k>/<qid>.features for annotator k (1-based).
   feat_file: comma separated feature file; only its column count is used here.
   true_labels_file: the true labels file. It is loaded but not otherwise used
      by this function.
   noisy_labels_dir: directory containing noisy preferences corresponding to the
      features in diff_feat_dir, read from <noisy_labels_dir>/<qid>.noisy_labels<k>.
   batch_size: must divide the number of unique qids, otherwise a message is
      printed and the function returns. It is not used for anything else.
   count_annts: number of annotators.

   Example values:
   qid_file= '/auto/rcf-proj/pg/guptarah/RankingExp/data/wine_quality/qids'
   diff_feat_dir='/auto/rcf-proj/pg/guptarah/RankingExp/data/wine_quality/noisy_features/labels/'
   feat_file='/auto/rcf-proj/pg/guptarah/RankingExp/data/wine_quality/features'
   true_labels_file='/auto/rcf-proj/pg/guptarah/RankingExp/data/wine_quality/labels'
   noisy_labels_dir='/auto/rcf-proj/pg/guptarah/RankingExp/data/wine_quality/noisy_labels_pairwise'
   batch_size=1
   count_annts=6
   """

   qids = numpy.genfromtxt(qid_file,dtype='int')
   qids_unique = numpy.unique(qids)

   features = numpy.genfromtxt(feat_file,delimiter=',')
   # Not referenced below; still loaded so an unreadable labels path fails here
   # exactly as it did before.
   labels = numpy.genfromtxt(true_labels_file,delimiter=',')

   if numpy.remainder(len(qids_unique),batch_size):
      print("Please provide a split that divides number of unique qids")
      return

   # every qid is used for training
   train_batch_qids = qids_unique

   # get all train set features together, plus the labels from each annotator
   # the empty seed keeps the (0, n_features) shape when there are no qids
   diff_blocks = [numpy.empty([0,features.shape[1]])]
   label_pieces = [[] for _ in range(count_annts)]
   for train_batch_qid in train_batch_qids:
      feature_diff_file = diff_feat_dir + '/labels/' + str(int(train_batch_qid)) + '.features'
      diff_blocks.append(numpy.genfromtxt(feature_diff_file,delimiter=','))

      for annt_id in range(count_annts):
         annt_lables_for_qid_file = noisy_labels_dir + '/' + str(train_batch_qid) + '.noisy_labels' + str(annt_id+1)
         label_pieces[annt_id].append(numpy.genfromtxt(annt_lables_for_qid_file))
   train_diff_features = numpy.vstack(diff_blocks)
   # an annotator with no files keeps an empty list, as before
   annt_labels = [numpy.hstack([[]] + pieces) if pieces else [] for pieces in label_pieces]

   # a column of ones is appended to the features before EM training
   ext_diff_feats = numpy.hstack((train_diff_features,numpy.ones((train_diff_features.shape[0],1))))
   max_iter = 20
   w,k = TrainEM.TrainModel(ext_diff_feats,annt_labels,max_iter)
   print('TrainEM Results:')
   print('Correct identifications on train set: %f' % (numpy.mean(k>.5)))

   print('-----------------------------')
   print('')

   # Training model using TrainEMRelEst function
   w,k = TrainEMRelEst.TrainModel(ext_diff_feats,annt_labels,max_iter)
   print('TrainEMRelEst Results:')
   # values of exactly .5 count as half
   print('Correct identifications on train set: %f' % (numpy.mean(k>.5)+(.5*numpy.mean(k==.5))))

   print('-----------------------------')
   print('')

   # Getting results using majority vote: 1 above .5, .5 on an exact tie, else 0
   mean_annt_labels = numpy.mean(numpy.matrix(annt_labels),axis=0)
   majority_vote = (((mean_annt_labels > .5)+.5*(mean_annt_labels == .5))*1).T
   print('Majority vote Results:')
   print('Correct identifications on train set: %f' % (numpy.mean(majority_vote)))

   print('-----------------------------')
   print('')

   print('Getting results for each annotator')
   # Getting results on each annotator
   n_epochs, learning_rate, lambda_w = 2000, .02, .001
   w_borda = numpy.zeros((1,1+train_diff_features.shape[1]))
   for noisy_annt_id in range(count_annts):
      print('at noisy annotor id: %d' % (noisy_annt_id))
      diff_blocks = [numpy.empty([0,features.shape[1]])]
      for train_batch_qid in train_batch_qids:
         feature_diff_file = diff_feat_dir + '/noisy_labels' + str(noisy_annt_id+1) + '/' + str(int(train_batch_qid)) + '.features'
         diff_blocks.append(numpy.genfromtxt(feature_diff_file,delimiter=','))
      train_diff_features = numpy.vstack(diff_blocks)

      w = numpy.ones((1,1+train_diff_features.shape[1])) # initial w
      w = SVRanker3.svr_optimization(train_diff_features,w,learning_rate,n_epochs,lambda_w)
      print('Annotator %d results:' % (noisy_annt_id))
      PrintResults(w,train_diff_features)

      print('Annotator correct identification: %s' % (numpy.mean(annt_labels[noisy_annt_id]),))

      # each annotator contributes its weight vector scaled to unit 2-norm
      w_borda = w_borda + w/numpy.linalg.norm(w,2)

   print('Borda count Results:')
   # NOTE: train_diff_features here is whatever the last annotator iteration
   # left behind (or the '/labels/' features when count_annts is 0), as before.
   PrintResults(w_borda,train_diff_features)