Example #1
def one_vs_all(opt,x_test,y_test_ref,otimes_ref,boot=1,method='lr'):

  """
  One-vs-All extractor: extract one class at a time from the whole data set.
  boot = number of training sets to generate
  method = 'lr' for logistic regression / 'svm' for SVM
  """

  from LR_functions import do_all_logistic_regression

  types = opt.types
  numt = opt.numt
  len_numt = len(numt)

  DIC = {}
  DIC['features'] = x_test.columns
  for b in range(boot):

    print "\n\tONE VS ALL EXTRACTION ------ iteration %d"%b

    dic = {}
    otimes = map(str,list(otimes_ref.values))
    otimes = np.array(otimes)

    y_train_ref = create_training_set(y_test_ref,numt)
    x_train = x_test.reindex(index=y_train_ref.index)
    dic["i_train"] = otimes[map(int,list(y_train_ref.index))]
    y_train_ref.index = range(y_train_ref.shape[0])
    x_train.index = range(x_train.shape[0])

    for n in range(len_numt):

      y_train = y_train_ref.copy()
      y_test = y_test_ref.copy()
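      # Binarise the labels: the class of interest (Type == n) becomes 0, every other class becomes 1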
      y_train[y_train_ref.Type==n] = 0
      y_test[y_test_ref.Type==n] = 0
      y_train[y_train_ref.Type!=n] = 1
      y_test[y_test_ref.Type!=n] = 1

      print y_train.shape[0], y_test.shape[0]

      print "----------- %s vs all -----------"%types[n]
      print_type = [types[n],'All']

      if method == 'lr':
        print "Logistic Regression\n"
        LR_train,theta,LR_test,wtr = do_all_logistic_regression(x_train,y_train,x_test,y_test,output=True)
      elif method == 'svm':
        kern = 'nonlinear'
        print "SVM\n"
        from sklearn.grid_search import GridSearchCV
        from sklearn import svm
        C_range = 10.0 ** np.arange(-2, 5)
        if kern == 'linear':
          param_grid = dict(C=C_range)
          grid = GridSearchCV(svm.LinearSVC(), param_grid=param_grid, n_jobs=-1)
        else:
          gamma_range = 10.0 ** np.arange(-3,3)
          param_grid = dict(gamma=gamma_range, C=C_range)
          grid = GridSearchCV(svm.SVC(kernel='rbf'), param_grid=param_grid, n_jobs=-1)
        grid.fit(x_train.values, y_train.values.ravel())
        LR_train = grid.best_estimator_.predict(x_train)
        LR_test = grid.best_estimator_.predict(x_test)

      print "\t Training set"
      for i in range(2):
        print i, print_type[i], len(np.where(y_train.values[:,0]==i)[0]), len(np.where(LR_train==i)[0])
      print "\n"
      cmat_train = confusion(y_train,LR_train,types,'training','LogReg',plot=True,output=True)
      plt.close()

      print "\t Test set"
      for i in range(2):
        print i, print_type[i], len(np.where(y_test.values[:,0]==i)[0]), len(np.where(LR_test==i)[0])
      print "\n"
      cmat_test = confusion(y_test,LR_test,types,'test','LogReg',plot=True,output=True)
      plt.close()

      # Fill the dictionary
      sub_dic={}
      i_com = np.where((y_test.values.ravel()-LR_test)==0)[0]
      i_lr = np.where(LR_test==0)[0]
      i_ok_class = np.intersect1d(i_com,i_lr) # events classified in the class of interest by the LR and identical to the manual classification
      sub_dic["nb"] = len(i_lr) # total number of events classified in the class of interest
      sub_dic["nb_common"] = len(i_ok_class) # total number of well classified events
      sub_dic["index_ok"] = otimes[i_ok_class] # index of well classified events
      sub_dic["nb_other"],sub_dic["i_other"] = [],[]
      for k in range(len_numt):
        if k != n:
          i_other_man = list(y_test_ref[y_test_ref.Type==k].index)
          ii = np.intersect1d(i_lr,i_other_man)
          sub_dic["nb_other"].append((types[k],len(ii))) # number of events belonging to another class
          sub_dic["i_other"].append((types[k],otimes[ii])) # index of events belonging to another class
      sub_dic["rate_%s"%types[n]] = (cmat_train[0,0],cmat_test[0,0]) # % success rate of the extracted class
      sub_dic["rate_rest"] = (cmat_train[1,1],cmat_test[1,1]) # % success rate of the rest
      sub_dic["nb_manuals"] = ((types[n],len(y_test[y_test.Type==0])),('Rest',len(y_test[y_test.Type==1])))
      dic[types[n]] = sub_dic

    DIC[b] = dic

  file = opt.opdict['result_path']
  print "One-vs-All results stored in %s"%file
  opt.write_binary_file(file,DIC)
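
# A minimal, self-contained sketch of the relabelling performed inside the loop above:
# the class of interest becomes 0 and every other class becomes 1. It assumes a pandas
# DataFrame whose only column is 'Type'; the helper name `binarize_labels` is hypothetical
# and not part of the original module.
def binarize_labels(y_ref, n):
  """Return a copy of y_ref with Type == n mapped to 0 and Type != n mapped to 1."""
  y = y_ref.copy()
  y[y_ref.Type == n] = 0
  y[y_ref.Type != n] = 1
  return y

# Usage (hypothetical data):
#   import pandas as pd
#   y = pd.DataFrame({'Type': [0, 1, 2, 1, 0]})
#   binarize_labels(y, 1).Type.tolist()   # -> [1, 0, 1, 0, 1]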
Example #2
def one_by_one(opt,x_test_ref0,y_test_ref0,otimes_ref,boot=1,method='lr'):

  """
  Extract the classes one after the other, in order of importance. The events that
  have been classified are removed from the data set before the next extraction.
  boot = number of training sets to generate
  method = 'lr' for logistic regression / 'svm' for SVM
  """

  from LR_functions import do_all_logistic_regression

  types = opt.types
  numt = opt.numt

  len_numt = len(numt)
  # Dictionary for results
  DIC = {}
  DIC['features'] = x_test_ref0.columns 

  EXT = {}
  for num_ext in range(len_numt):
    EXT[num_ext] = {}
    EXT[num_ext]['nb_tot'] = []
    for t in numt:
      EXT[num_ext]['nb_%s'%types[t]] = []

  for b in range(boot):

    otimes = map(str,list(otimes_ref.values))
    otimes = np.array(otimes)

    x_test = x_test_ref0.copy()
    y_test_ref = y_test_ref0.copy()

    print "\n\tONE BY ONE EXTRACTION ------ iteration %d"%b
    dic = {}

    for n in range(len_numt):

      sub_dic={}
      y_train_ref = create_training_set(y_test_ref,numt)
      x_train = x_test.reindex(index=map(int,y_train_ref.index))
      sub_dic["i_train"] = otimes[map(int,list(y_train_ref.index))]
      y_train_ref.index = range(y_train_ref.shape[0])
      x_train.index = range(x_train.shape[0])

      if x_train.shape[0] != y_train_ref.shape[0]:
        print "Training set: Incoherence in x and y dimensions"
        sys.exit()

      if x_test.shape[0] != y_test_ref.shape[0]:
        print "Test set: Incoherence in x and y dimensions"
        sys.exit()

      if len(otimes) != len(y_test_ref):
        print "Warning !! Check lengths !"
        sys.exit()

      y_train = y_train_ref.copy()
      y_test = y_test_ref.copy()

      EXT[n]['nb_tot'].append(len(x_test))
      for t in numt:
        EXT[n]['nb_%s'%types[t]].append(len(y_test[y_test.Type==t]))

      y_train[y_train_ref.Type==n] = 0
      y_test[y_test_ref.Type==n] = 0
      y_train[y_train_ref.Type!=n] = 1
      y_test[y_test_ref.Type!=n] = 1

      t = [types[n],'Rest']
      print y_train.shape[0], y_test.shape[0]

      print "----------- %s vs all -----------"%types[n]

      if method == 'lr':
        print "Logistic Regression\n"
        LR_train,theta,LR_test,wtr = do_all_logistic_regression(x_train,y_train,x_test,y_test,output=True)
      elif method == 'svm':
        kern = 'nonlinear'
        print "SVM\n"
        from sklearn.grid_search import GridSearchCV
        from sklearn import svm
        C_range = 10.0 ** np.arange(-2, 5)
        if kern == 'linear':
          param_grid = dict(C=C_range)
          grid = GridSearchCV(svm.LinearSVC(), param_grid=param_grid, n_jobs=-1)
        else:
          gamma_range = 10.0 ** np.arange(-3,3)
          param_grid = dict(gamma=gamma_range, C=C_range)
          grid = GridSearchCV(svm.SVC(kernel='rbf'), param_grid=param_grid, n_jobs=-1)
        grid.fit(x_train.values, y_train.values.ravel())
        LR_train = grid.best_estimator_.predict(x_train)
        LR_test = grid.best_estimator_.predict(x_test)

      print "\t Training set"
      for i in range(2):
        print i, t[i], len(np.where(y_train.values[:,0]==i)[0]), len(np.where(LR_train==i)[0])
      print "\n"
      cmat_train = confusion(y_train,LR_train,types,'training','LogReg',plot=False,output=True)

      print "\t Test set"
      for i in range(2):
        print i, t[i], len(np.where(y_test.values[:,0]==i)[0]), len(np.where(LR_test==i)[0])
      print "\n"
      cmat_test = confusion(y_test,LR_test,types,'test','LogReg',plot=False,output=True)

      # Fill the dictionary
      i_com = np.where((y_test.values.ravel()-LR_test)==0)[0]
      i_lr = np.where(LR_test==0)[0]
      i_ok_class = np.intersect1d(i_com,i_lr) # events classified in the class of interest by the LR and identical to the manual classification

      sub_dic["nb"] = len(i_lr) # total number of events classified in the class of interest
      sub_dic["nb_common"] = len(i_ok_class)
      sub_dic["index_ok"] = otimes[i_ok_class]
      sub_dic["nb_other"],sub_dic["i_other"] = [],[]
      for k in range(len_numt):
        if k != n:
          i_other_man = list(y_test_ref[y_test_ref.Type==k].index)
          ii = np.intersect1d(i_lr,i_other_man)
          sub_dic["nb_other"].append((types[k],len(ii)))
          sub_dic["i_other"].append((types[k],otimes[ii]))
      sub_dic["rate_%s"%types[n]] = (cmat_train[0,0],cmat_test[0,0])
      sub_dic["rate_rest"] = (cmat_train[1,1],cmat_test[1,1])
      sub_dic["nb_manuals"] = ((types[n],len(y_test[y_test.Type==0])),('Rest',len(y_test[y_test.Type==1])))

      i_ok = np.where(LR_test!=0)[0]
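      # keep only the events that were NOT assigned to the extracted class for the next extraction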
      y_test_ref = y_test_ref.reindex(index=i_ok)
      x_test = x_test.reindex(index=i_ok)
      otimes = otimes[i_ok]

      y_test_ref.index = range(y_test_ref.shape[0])
      x_test.index = range(x_test.shape[0])

      dic[types[n]] = sub_dic

    DIC[b] = dic

  file = opt.opdict['result_path']
  print "One-by-One results stored in %s"%file
  opt.write_binary_file(file,DIC)

  file = '%s/stats_OBO'%os.path.dirname(opt.opdict['result_path'])
  opt.write_binary_file(file,EXT)
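
# A minimal sketch of the one-by-one "peeling" scheme used above: after each binary
# extraction, the events assigned to the class of interest (label 0) are removed from
# the pool before the next class is extracted. The classifier is abstracted away as a
# `predict` callable, and the helper name `peel_classes` is hypothetical.
def peel_classes(y_ref, class_order, predict):
  import numpy as np
  remaining = y_ref.copy()
  extracted = {}
  for n in class_order:
    labels = np.asarray(predict(remaining, n))   # 0 = class of interest, 1 = rest
    extracted[n] = list(remaining.index[labels == 0])
    remaining = remaining[labels != 0]           # keep the rest for the next round
  return extracted

# Usage with a trivial "predictor" that just reads the manual labels:
#   predict = lambda df, n: (df.Type.values != n).astype(int)
#   peel_classes(y_ref, class_order=[2, 0, 1], predict=predict)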
Example #3
def classifier(opt):

  """
  Classification of the different types of events.
  opt is an instance of the Options class.
  """

  opt.do_tri()
  X = opt.x
  Y = opt.y

  list_attr = opt.__dict__.keys()
  if 'train_x' in list_attr:
    X_TRAIN = opt.train_x
    Y_TRAIN = opt.train_y

  dic_results = {}
  for isc in sorted(opt.xs):

    print "==========",opt.trad[isc],"=========="
    subdic = {}

    if isc > 0:
      if opt.trad[isc][0] == sta_prev:
        marker_sta = 1
      else:
        marker_sta = 0
        sta_prev = opt.trad[isc][0]
    else:
      marker_sta = 0
      sta_prev = opt.trad[isc][0]

    if len(opt.xs[isc]) == 0:
      continue


    # About the training set
    if len(opt.opdict['stations']) == 1 and opt.opdict['boot'] > 1 and 'train_x' not in list_attr:
      if os.path.exists(opt.opdict['train_file']):
        TRAIN_Y = opt.read_binary_file(opt.opdict['train_file'])
      else:
        TRAIN_Y = []
    elif 'train_x' in list_attr:
      opt.x = opt.xs_train[isc]
      opt.y = opt.ys_train[isc]
      if opt.opdict['plot_pdf']:
        opt.compute_pdfs()
        g_train = opt.gaussians
        del opt.gaussians
      opt.classname2number()
      x_train = opt.x
      y_train = opt.y


    # About the test set
    opt.x = opt.xs[isc]
    opt.y = opt.ys[isc]
    if opt.opdict['plot_pdf']:
      opt.compute_pdfs()
 
    set = pd.DataFrame(index=opt.ys[isc].index,columns=['Otime'])
    set['Otime'] = opt.xs[isc].index
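    # 'set' records each event's origin time and, per bootstrap draw, whether the event was used for training (1) or not (0)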

    opt.classname2number()
    x_test = opt.x
    y_ref = opt.y

    K = len(opt.types)

    for b in range(opt.opdict['boot']):
      print "\n-------------------- # iter: %d --------------------\n"%(b+1)

      subsubdic = {}

      y_test = y_ref.copy()

      if 'train_x' not in list_attr:
        x_train = x_test.copy()
        if len(opt.opdict['stations']) == 1 and opt.opdict['boot'] > 1:
          if len(TRAIN_Y) > b:
            y_train = y_ref.reindex(index=TRAIN_Y[b])
            y_train = y_train.dropna(how='any')
          else:
            y_train = create_training_set(y_ref,opt.numt)
            list_ev_train = y_train.index        
            TRAIN_Y.append(list(y_train.index))

        else:
          if marker_sta == 0:
            y_train = create_training_set(y_ref,opt.numt)
            list_ev_train = y_train.index
          else:
            y_train = y_ref.reindex(index=list_ev_train)
            y_train = y_train.dropna(how='any')

      y = y_train.copy()

      in_train = np.intersect1d(np.array(y_test.index),np.array(y_train.index))
      set[b] = np.zeros(set.shape[0])
      set[b][in_train] = 1

      y_train = y.copy()
      x_train = x_train.reindex(index=y_train.index)
      x_test = x_test.reindex(index=y_test.index)

      print "# types in the test set:",len(np.unique(y_test.values.ravel()))
      print "# types in the training set:",len(np.unique(y_train.values.ravel()))
      x_train.index = range(x_train.shape[0])
      y_train.index = range(y_train.shape[0])
      print x_train.shape, y_train.shape
      if x_train.shape[0] != y_train.shape[0]:
        print "Training set: Incoherence in x and y dimensions"
        sys.exit()

      subsubdic['list_ev'] = np.array(y_test.index)

      x_test.index = range(x_test.shape[0])
      y_test.index = range(y_test.shape[0])
      print x_test.shape, y_test.shape
      if x_test.shape[0] != y_test.shape[0]:
        print "Test set: Incoherence in x and y dimensions"
        sys.exit()

      if opt.opdict['plot_pdf']:
        if 'train_x' in list_attr:
          opt.plot_superposed_pdfs(g_train,save=opt.opdict['save_pdf'])
        else:
          opt.plot_all_pdfs(save=opt.opdict['save_pdf'])

      if opt.opdict['method'] == '1b1':
        # EXTRACTORS
        print "********** EXTRACTION 1-BY-1 **********"
        one_by_one(opt,x_test,y_test,set['Otime'],boot=10,method='svm')
        continue

      elif opt.opdict['method'] == 'ova':
        print "********** EXTRACTION 1-VS-ALL **********"
        one_vs_all(opt,x_test,y_test,set['Otime'],boot=10,method='svm')
        continue

      elif opt.opdict['method'] == 'svm':
        # SVM
        print "********** SVM **********"
        CLASS_test, pourcentages = implement_svm(x_train,x_test,y_train,y_test,opt.types,opt.opdict,kern='NonLin')

      elif opt.opdict['method'] == 'lrsk':
        # LOGISTIC REGRESSION (scikit learn)
        print "********* Logistic regression (sklearn) **********"
        CLASS_test, pourcentages = implement_lr_sklearn(x_train,x_test,y_train,y_test,opt.types,opt.opdict)

      elif opt.opdict['method'] == 'lr': 
        # LOGISTIC REGRESSION
        print "********* Logistic regression **********"
        from LR_functions import do_all_logistic_regression
        wtr = np.array([])
        if 'learn_file' in sorted(opt.opdict):
          if os.path.exists(opt.opdict['learn_file']):
            wtr = opt.read_binary_file(opt.opdict['learn_file'])
        CLASS_train,theta,CLASS_test,pourcentages,wtr = do_all_logistic_regression(x_train,y_train,x_test,y_test,output=True,perc=True,wtr=wtr)
        if 'learn_file' in sorted(opt.opdict):
          if not os.path.exists(opt.opdict['learn_file']):
            wtr = opt.write_binary_file(opt.opdict['learn_file'],wtr)
        print "\t Training set"
        for i in range(K):
          print i, opt.types[i], len(np.where(y_train.values[:,0]==i)[0]), len(np.where(CLASS_train==i)[0])
        print "\n"
        if opt.opdict['boot'] == 1:
          confusion(y_train,CLASS_train,opt.types,'Training','Logistic regression',plot=opt.opdict['plot_confusion'])
          if opt.opdict['plot_confusion'] and opt.opdict['save_confusion']:
            plt.savefig('%s/figures/training_%s_%s_%s.png'%(opt.opdict['outdir'],opt.opdict['result_file'][8:],opt.trad[isc][0],opt.trad[isc][1]))

        print "\t Test set"
        for i in range(K):
          print i, opt.types[i], len(np.where(y_test.values[:,0]==i)[0]), len(np.where(CLASS_test==i)[0])
        print "\n"
        if opt.opdict['boot'] == 1:
          confusion(y_test,CLASS_test,opt.types,'Test','Logistic regression',plot=opt.opdict['plot_confusion'])
          if opt.opdict['plot_confusion']:
            if opt.opdict['save_confusion']:
              plt.savefig('%s/figures/test_%s_%s_%s.png'%(opt.opdict['outdir'],opt.opdict['result_file'][8:],opt.trad[isc][0],opt.trad[isc][1]))
            plt.show()

      subsubdic['%'] = pourcentages
      trad_CLASS_test = []
      for i in CLASS_test:
        i = int(i)
        trad_CLASS_test.append(opt.types[i])
      subsubdic['classification'] = trad_CLASS_test
      subdic[b] = subsubdic

    dic_results[opt.trad[isc]] = subdic

  dic_results['header'] = {}
  dic_results['header']['features'] = opt.opdict['feat_list']
  dic_results['header']['types'] = opt.opdict['Types']
  dic_results['header']['catalog'] = opt.opdict['label_test']

  if opt.opdict['method'] == 'lr' or opt.opdict['method'] == 'lrsk' or opt.opdict['method'] == 'svm':
    opt.write_binary_file(opt.opdict['result_path'],dic_results)

  if 'train_file' in sorted(opt.opdict):
    if not os.path.exists(opt.opdict['train_file']) and opt.opdict['boot'] > 1:
      opt.write_binary_file(opt.opdict['train_file'],TRAIN_Y)
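
# The SVM branches above rely on `from sklearn.grid_search import GridSearchCV`, a module
# that was removed in scikit-learn 0.20. For reference, a sketch of the equivalent RBF-SVM
# grid search with the current API is given below; `fit_rbf_svm_grid` is a hypothetical
# helper and not part of the original module.
def fit_rbf_svm_grid(x_train, y_train):
  import numpy as np
  from sklearn.model_selection import GridSearchCV
  from sklearn import svm
  C_range = 10.0 ** np.arange(-2, 5)
  gamma_range = 10.0 ** np.arange(-3, 3)
  param_grid = dict(gamma=gamma_range, C=C_range)
  grid = GridSearchCV(svm.SVC(kernel='rbf'), param_grid=param_grid, n_jobs=-1)
  grid.fit(x_train.values, y_train.values.ravel())
  return grid.best_estimator_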
def one_vs_all(opt,x_test_ref,y_test_ref,otimes_ref,boot=1,method='lr'):

  """
  One-vs-All extractor: extract one class at a time from the whole data set.
  boot = number of training sets to generate
  method = 'lr' for logistic regression / 'svm' for SVM
  """

  from LR_functions import do_all_logistic_regression

  types = opt.types
  numt = opt.numt
  len_numt = len(numt)

  DIC = {}
  DIC['features'] = x_test_ref.columns
  for b in range(boot):

    print "\n\tONE VS ALL EXTRACTION ------ iteration %d"%b

    dic = {}
    otimes = map(str,list(otimes_ref.values))
    otimes = np.array(otimes)

    ### Splitting of the whole set in training, CV and test sets ###
    y_train, y_cv, y_test = generate_datasets(opt.opdict['proportions'],opt.numt,y_test_ref)
    i_train = y_train.index
    i_cv = y_cv.index
    i_test = y_test.index

    ### Defining the training set ###
    x_train = x_test_ref.reindex(index=y_train.index)
    y_train.index = range(y_train.shape[0])
    x_train.index = range(x_train.shape[0])
    dic["i_train"] = otimes[map(int,list(y_train.index))]

    ### Defining the test set ###
    x_test = x_test_ref.reindex(index=y_test.index)
    x_test.index = range(x_test.shape[0])
    y_test.index = range(y_test.shape[0])
    dic["i_test"] = otimes[map(int,list(y_test.index))]

    y_train_tir = y_train.copy()
    y_test_tir = y_test.copy()

    for n in range(len_numt):

      y_train[y_train_tir.NumType==n] = 0
      y_test[y_test_tir.NumType==n] = 0
      y_train[y_train_tir.NumType!=n] = 1
      y_test[y_test_tir.NumType!=n] = 1

      print y_train.shape[0], y_test.shape[0]

      print "----------- %s vs all -----------"%types[n]
      print_type = [types[n],'All']

      if method == 'lr':
        print "Logistic Regression\n"
        i_train = y_train.index
        i_cv = y_cv.index
        i_test = y_test.index
        out = do_all_logistic_regression(x_test_ref,y_test_ref,i_train,i_cv,i_test)
      elif method == 'svm':
        kern = 'NonLin'
        print "SVM\n"
        out = implement_svm(x_train,x_test,y_train,y_test,opt.types,opt.opdict,kern=kern)

      CLASS_test = out['label_test']
      CLASS_train = out['label_train']

      # TRAINING SET
      print "\t *TRAINING SET"
      y_train_np = y_train.NumType.values.ravel()  
      from sklearn.metrics import confusion_matrix
      cmat_train = confusion_matrix(y_train_np,CLASS_train)
      p_tr = dic_percent(cmat_train,[types[n],'Rest'],verbose=True)
      out['rate_train'] = p_tr
      print "   Global : %.2f%%"%p_tr['global']
      if opt.opdict['plot_confusion'] or opt.opdict['save_confusion']:
        plot_confusion_mat(cmat_train,opt.types,'Training',opt.opdict['method'].upper())
        if opt.opdict['save_confusion']:
          savefig = '%s/training_%s.png'%(opt.opdict['fig_path'],opt.opdict['result_file'])
          print "Confusion matrix saved in %s"%savefig
          plt.savefig(savefig)

      # TEST SET
      print "\t *TEST SET"
      y_test_np = y_test.NumType.values.ravel()
      cmat_test = confusion_matrix(y_test_np,CLASS_test)
      p_test = dic_percent(cmat_test,[types[n],'Rest'],verbose=True)
      out['rate_test'] = p_test
      print "   Global : %.2f%%"%p_test['global']
      if opt.opdict['plot_confusion'] or opt.opdict['save_confusion']:
        plot_confusion_mat(cmat_test,opt.types,'Test',opt.opdict['method'].upper())
        if opt.opdict['save_confusion']:
          savefig = '%s/test_%s.png'%(opt.opdict['fig_path'],opt.opdict['result_file'])
          print "Confusion matrix saved in %s"%savefig
          plt.savefig(savefig)
        if opt.opdict['plot_confusion']:
          plt.show()
        else:
          plt.close()


      # Fill the dictionary
      sub_dic={}
      i_com = np.where((y_test.NumType.values.ravel()-CLASS_test)==0)[0]
      i_lr = np.where(CLASS_test==0)[0]
      i_ok_class = np.intersect1d(i_com,i_lr) # events classified in the class of interest by the LR and identical to the manual classification
      sub_dic["nb"] = len(i_lr) # total number of events classified in the class of interest
      sub_dic["nb_common"] = len(i_ok_class) # total number of well classified events
      sub_dic["index_ok"] = otimes[i_ok_class] # index of well classified events
      sub_dic["nb_other"],sub_dic["i_other"] = [],[]
      for k in range(len_numt):
        if k != n:
          i_other_man = list(y_test_tir[y_test_tir.NumType==k].index)
          ii = np.intersect1d(i_lr,i_other_man)
          sub_dic["nb_other"].append((types[k],len(ii))) # number of events belonging to another class
          sub_dic["i_other"].append((types[k],otimes[ii])) # index of events belonging to another class
      sub_dic["rate_%s"%types[n]] = (out['rate_train'][('%s'%types[n], 0)], out['rate_test'][('%s'%types[n], 0)]) # % success rate of the extracted class
      sub_dic["rate_rest"] = (out['rate_train'][('Rest', 1)], out['rate_test'][('Rest', 1)]) # % success rate of the rest
      sub_dic["nb_manuals"] = ((types[n],len(y_test[y_test.NumType==0])),('Rest',len(y_test[y_test.NumType==1])))
      dic[types[n]] = sub_dic

    DIC[b] = dic

  file = opt.opdict['result_path']
  print "One-vs-All results stored in %s"%file
  write_binary_file(file,DIC)
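
# `dic_percent` is not defined in this listing. Judging from how its output is used above
# (a 'global' key plus keys such as (types[n], 0) and ('Rest', 1)), a plausible sketch is
# the per-class success rate computed from the confusion matrix, as below. This is an
# assumption, not the original implementation.
def demo_dic_percent(cmat, class_names):
  import numpy as np
  cmat = np.asarray(cmat, dtype=float)
  rates = {'global': round(100.0 * np.trace(cmat) / cmat.sum(), 2)}
  for i, name in enumerate(class_names):
    row_total = cmat[i, :].sum()
    rates[(name, i)] = round(100.0 * cmat[i, i] / row_total, 2) if row_total else 0.0
  return rates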
def one_by_one(opt,x_test_ref0,y_test_ref0,otimes_ref,boot=1,method='lr'):

  """
  Per-class extractor.
  Extract the classes one after the other, in order of importance. The events that
  have been classified are removed from the data set before the next extraction.
  boot = number of training sets to generate
  method = 'lr' for logistic regression / 'svm' for SVM
  """

  from LR_functions import do_all_logistic_regression

  types = opt.types
  numt = opt.numt

  len_numt = len(numt)
  # Dictionary for results
  DIC = {}
  DIC['features'] = x_test_ref0.columns 

  EXT = {}
  for num_ext in range(len_numt):
    EXT[num_ext] = {}
    EXT[num_ext]['nb_tot'] = []
    for t in numt:
      EXT[num_ext]['nb_%s'%types[t]] = []

  p_train, p_cv, p_test = opt.opdict['proportions']

  for b in range(boot):

    otimes = map(int,list(otimes_ref.values))
    otimes = np.array(otimes)

    x_test_ref = x_test_ref0.copy()
    y_test_ref = y_test_ref0.copy()

    print "\n\tONE BY ONE EXTRACTION ------ iteration %d"%b
    dic = {}

    inum = 0
    for n in range(len_numt):

      sub_dic={}

      ### Splitting of the whole set in training, CV and test sets ###
      y_train_ref, y_cv, y_test_ref = generate_datasets(opt.opdict['proportions'],opt.numt,y_test_ref)
      y_test_ref = pd.concat([y_cv,y_test_ref])
      i_train = y_train_ref.index
      i_cv = y_cv.index
      i_test = y_test_ref.index

      ### Defining the training set ###
      x_train = x_test_ref.reindex(index=y_train_ref.index)
      y_train_ref.index = range(y_train_ref.shape[0])
      x_train.index = range(x_train.shape[0])
      if inum == 0:
        list_i_train = [list(otimes[map(int,list(y_train_ref.index))])]
      else:
        list_i_train.append(list(otimes[map(int,list(y_train_ref.index))]))

      ### Defining the test set ###
      x_test = x_test_ref.reindex(index=y_test_ref.index)
      x_test.index = range(x_test.shape[0])
      y_test_ref.index = range(y_test_ref.shape[0])
      if inum == 0:
        list_i_test = [list(otimes[map(int,list(y_test_ref.index))])]
      else:
        list_i_test.append(list(otimes[map(int,list(y_test_ref.index))]))

      if x_train.shape[0] != y_train_ref.shape[0]:
        print "Training set: Incoherence in x and y dimensions"
        sys.exit()

      if x_test.shape[0] != y_test_ref.shape[0]:
        print "Test set: Incoherence in x and y dimensions"
        sys.exit()

      y_train = y_train_ref.copy()
      y_test = y_test_ref.copy()

      EXT[n]['nb_tot'].append(len(x_test))
      for t in numt:
        EXT[n]['nb_%s'%types[t]].append(len(y_test[y_test.NumType==t]))

      y_train[y_train_ref.NumType==n] = 0
      y_test[y_test_ref.NumType==n] = 0
      y_train[y_train_ref.NumType!=n] = 1
      y_test[y_test_ref.NumType!=n] = 1

      t = [types[n],'Rest']
      print y_train.shape[0], y_test.shape[0]

      print "----------- %s vs all -----------"%types[n]

      if method == 'lr':
        print "Logistic Regression\n"
        out = do_all_logistic_regression(x_test_ref0,y_test_ref0,i_train,i_cv,i_test,)
      elif method == 'svm':
        kern = 'NonLin'
        print "SVM\n"
        out = implement_svm(x_train,x_test,y_train,y_test,opt.types,opt.opdict,kern=kern)

      CLASS_test = out['label_test']
      CLASS_train = out['label_train']

      # TRAINING SET
      print "\t *TRAINING SET"
      y_train_np = y_train.NumType.values.ravel()  
      from sklearn.metrics import confusion_matrix
      cmat_train = confusion_matrix(y_train_np,CLASS_train)
      p_tr = dic_percent(cmat_train,[types[n],'Rest'],verbose=True)
      out['rate_train'] = p_tr
      print "   Global : %.2f%%"%p_tr['global']
      if opt.opdict['plot_confusion'] or opt.opdict['save_confusion']:
        plot_confusion_mat(cmat_train,opt.types,'Training',opt.opdict['method'].upper())
        if opt.opdict['save_confusion']:
          savefig = '%s/training_%s.png'%(opt.opdict['fig_path'],opt.opdict['result_file'])
          print "Confusion matrix saved in %s"%savefig
          plt.savefig(savefig)

      # TEST SET
      print "\t *TEST SET"
      y_test_np = y_test.NumType.values.ravel()
      cmat_test = confusion_matrix(y_test_np,CLASS_test)
      p_test = dic_percent(cmat_test,[types[n],'Rest'],verbose=True)
      out['rate_test'] = p_test
      print "   Global : %.2f%%"%p_test['global']
      if opt.opdict['plot_confusion'] or opt.opdict['save_confusion']:
        plot_confusion_mat(cmat_test,opt.types,'Test',opt.opdict['method'].upper())
        if opt.opdict['save_confusion']:
          savefig = '%s/test_%s.png'%(opt.opdict['fig_path'],opt.opdict['result_file'])
          print "Confusion matrix saved in %s"%savefig
          plt.savefig(savefig)
        if opt.opdict['plot_confusion']:
          plt.show()
        else:
          plt.close()

      # Fill the dictionary
      i_com = np.where((y_test.NumType.values.ravel()-CLASS_test)==0)[0]
      i_lr = np.where(CLASS_test==0)[0]
      i_ok_class = np.intersect1d(i_com,i_lr) # events classified in the class of interest by the LR and identical to the manual classification

      sub_dic["nb"] = len(i_lr) # total number of events classified in the class of interest
      sub_dic["nb_common"] = len(i_ok_class)
      sub_dic["index_ok"] = otimes[i_ok_class]
      sub_dic["nb_other"],sub_dic["i_other"] = [],[]
      for k in range(len_numt):
        if k != n:
          i_other_man = list(y_test_ref[y_test_ref.NumType==k].index)
          ii = np.intersect1d(i_lr,i_other_man)
          sub_dic["nb_other"].append((types[k],len(ii)))
          sub_dic["i_other"].append((types[k],otimes[ii]))
      sub_dic["rate_%s"%types[n]] = (out['rate_train'][('%s'%types[n], 0)], out['rate_test'][('%s'%types[n], 0)])
      sub_dic["rate_rest"] = (out['rate_train'][('Rest', 1)], out['rate_test'][('Rest', 1)])
      sub_dic["nb_manuals"] = ((types[n],len(y_test[y_test.NumType==0])),('Rest',len(y_test[y_test.NumType==1])))

      i_ok_test = i_test[np.where(CLASS_test!=0)[0]]
      i_ok_train = i_train[np.where(CLASS_train!=0)[0]]
      i_ok = np.concatenate([i_ok_test,i_ok_train])
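      # events not assigned to the extracted class (from both the test and training splits) remain for the next extraction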
      otimes = i_ok
      y_test_ref = y_test_ref0.reindex(index=map(int,list(i_ok)))

      dic[types[n]] = sub_dic
      inum = inum + 1

    dic['i_train'] = list_i_train
    dic['i_test'] = list_i_test
    DIC[b] = dic

  file = opt.opdict['result_path']
  print "One-by-One results stored in %s"%file
  write_binary_file(file,DIC)

  file = '%s/stats_OBO'%os.path.dirname(opt.opdict['result_path'])
  write_binary_file(file,EXT)
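
# `generate_datasets` is not shown in this listing. A minimal sketch with the same call
# signature is given below, assuming `proportions` is a (p_train, p_cv, p_test) tuple and
# `y_ref` carries a `NumType` column; the split is stratified by class. This is only an
# illustration of the expected behaviour, not the original implementation.
def demo_generate_datasets(proportions, numt, y_ref):
  import numpy as np
  p_train, p_cv, p_test = proportions
  i_train, i_cv, i_test = [], [], []
  for t in numt:
    idx = np.random.permutation(y_ref[y_ref.NumType == t].index.values)
    n_train = int(p_train * len(idx))
    n_cv = int(p_cv * len(idx))
    i_train.extend(idx[:n_train])
    i_cv.extend(idx[n_train:n_train + n_cv])
    i_test.extend(idx[n_train + n_cv:])
  return (y_ref.reindex(index=i_train),
          y_ref.reindex(index=i_cv),
          y_ref.reindex(index=i_test))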
def classifier(opt):
  """
  Classification of the different types of events.
  opt is an instance of the Options class.
  """

  list_attr = opt.__dict__.keys()
  if not 'x' in list_attr:
    opt.do_tri()

  X = opt.x
  Y = opt.y

  list_attr = opt.__dict__.keys()
  if 'train_x' in list_attr:
    X_TRAIN = opt.train_x
    Y_TRAIN = opt.train_y

  dic_results = {}
  for isc in sorted(opt.xs):

    print "==========",opt.trad[isc],"=========="
    subdic = {}

    if isc > 0:
      if opt.trad[isc][0] == sta_prev:
        marker_sta = 1
      else:
        marker_sta = 0
        sta_prev = opt.trad[isc][0]
    else:
      marker_sta = 0
      sta_prev = opt.trad[isc][0]

    if len(opt.xs[isc]) == 0:
      continue


    # About the training set
    if len(opt.opdict['stations']) == 1 and opt.opdict['boot'] > 1 and 'train_x' not in list_attr:
      if os.path.exists(opt.opdict['train_file']):
        print opt.opdict['train_file']
        TRAIN_Y = read_binary_file(opt.opdict['train_file'])
      else:
        TRAIN_Y = {}
        for tir in range(opt.opdict['boot']):
          TRAIN_Y[tir] = {}
    elif 'train_x' in list_attr:
      opt.x = opt.xs_train[isc]
      opt.y = opt.ys_train[isc]
      if opt.opdict['plot_pdf']:
        opt.compute_pdfs()
        g_train = opt.gaussians
        del opt.gaussians
      opt.classname2number()
      x_ref_train = opt.x
      y_ref_train = opt.y


    # About the test set
    opt.x = opt.xs[isc]
    opt.y = opt.ys[isc]
    if opt.opdict['plot_pdf']:
      opt.compute_pdfs()
 
    set = pd.DataFrame(index=opt.ys[isc].index,columns=['Otime'])
    set['Otime'] = opt.xs[isc].index

    opt.classname2number()
    x_test = opt.x
    y_ref = opt.y
    x_ref = opt.x

    if opt.opdict['plot_dataset']:
      opt.composition_dataset()

    #K = len(opt.types)

    ### ITERATE OVER TRAINING SET DRAWS ###
    for b in range(opt.opdict['boot']):
      print "\n-------------------- # iter: %d --------------------\n"%(b+1)

      subsubdic = {}
      print "WHOLE SET", x_ref.shape, y_ref.shape

      ### if there is no pre-defined training set ###
      if 'train_x' not in list_attr:
        x_train = x_test.copy()
        if len(opt.opdict['stations']) == 1 and opt.opdict['boot'] > 1:
          if len(TRAIN_Y[b]) > 0:
            y_train = y_ref.reindex(index=TRAIN_Y[b]['training_set'])
            y_train = y_train.dropna(how='any')
            y_cv = y_ref.reindex(index=TRAIN_Y[b]['cv_set'])
            y_cv = y_cv.dropna(how='any')
            y_test = y_ref.reindex(index=TRAIN_Y[b]['test_set'])
            y_test = y_test.dropna(how='any')
          else:
            y_train, y_cv, y_test = generate_datasets(opt.opdict['proportions'],opt.numt,y_ref)
            TRAIN_Y[b]['training_set'] = map(int,list(y_train.index))
            TRAIN_Y[b]['cv_set'] = map(int,list(y_cv.index))
            TRAIN_Y[b]['test_set'] = map(int,list(y_test.index))

        ### multi-stations case ###
        else:
          if marker_sta == 0:
            y_train, y_cv, y_test = generate_datasets(opt.opdict['proportions'],opt.numt,y_ref)
            list_ev_train = y_train.index
            list_ev_cv = y_cv.index
            list_ev_test = y_test.index
          else:
            y_train = y_ref.reindex(index=list_ev_train)
            y_train = y_train.dropna(how='any')
            y_cv = y_ref.reindex(index=list_ev_cv)
            y_cv = y_cv.dropna(how='any')
            y_test = y_ref.reindex(index=list_ev_test)
            y_test = y_test.dropna(how='any')

        x_train = x_ref.reindex(index=y_train.index)

      ### if a training set was pre-defined ###
      else:
        x_train = x_ref_train.copy()
        y_train = y_ref_train.copy()
        y_train, y_cv, y_test = generate_datasets(opt.opdict['proportions'],opt.numt,y_ref,y_train=y_train)

      x_cv = x_ref.reindex(index=y_cv.index)
      x_test = x_ref.reindex(index=y_test.index)

      i_train = y_train.index
      x_train.index = range(x_train.shape[0])
      y_train.index = range(y_train.shape[0])
      print "TRAINING SET", x_train.shape, y_train.shape
      if x_train.shape[0] != y_train.shape[0]:
        print "Training set: Incoherence in x and y dimensions"
        sys.exit()

      i_cv = y_cv.index
      x_cv.index = range(x_cv.shape[0])
      y_cv.index = range(y_cv.shape[0])
      print "CROSS-VALIDATION SET", x_cv.shape, y_cv.shape
      if x_cv.shape[0] != y_cv.shape[0]:
        print "Cross-validation set: Incoherence in x and y dimensions"
        sys.exit()

      subsubdic['list_ev'] = np.array(y_test.index)

      i_test = y_test.index
      x_test.index = range(x_test.shape[0])
      y_test.index = range(y_test.shape[0])
      print "TEST SET", x_test.shape, y_test.shape
      if x_test.shape[0] != y_test.shape[0]:
        print "Test set: Incoherence in x and y dimensions"
        sys.exit()

      opt.train_x = x_train
      opt.x = x_test
      opt.train_y = y_train
      opt.y = y_test

      if opt.opdict['plot_pdf']:
        opt.plot_all_pdfs(save=opt.opdict['save_pdf'])
        if 'train_x' in list_attr:
          opt.plot_superposed_pdfs(g_train,save=opt.opdict['save_pdf'])
        else:
          opt.plot_all_pdfs(save=opt.opdict['save_pdf'])

      if opt.opdict['method'] == '1b1':
        # EXTRACTORS
        print "********** EXTRACTION 1-BY-1 **********"
        opt.opdict['boot'] = 1
        one_by_one(opt,x_ref,y_ref,set['Otime'],boot=10,method='svm')
        continue

      elif opt.opdict['method'] == 'ova':
        print "********** EXTRACTION 1-VS-ALL **********"
        opt.opdict['boot'] = 1
        one_vs_all(opt,x_ref,y_ref,set['Otime'],boot=10,method='svm')
        continue

      elif opt.opdict['method'] in ['svm','svm_nl']:
        # SVM
        print "********** SVM **********"
        if opt.opdict['method'] == 'svm':
          kern = 'Lin'
        else:
          kern = 'NonLin'

        out = implement_svm(x_train,x_test,y_train,y_test,opt.types,opt.opdict,kern=kern,proba=opt.opdict['probas'])

        if 'map' in sorted(out):
          opt.map = out['map']

        if 'thetas' in sorted(out):
          theta_vec = out['thetas']
          theta,threshold = {},{}
          for it in range(len(theta_vec)):
            theta[it+1] = np.append(theta_vec[it][-1],theta_vec[it][:-1])
            threshold[it+1] = 0.5
          out['thetas'] = theta
          out['threshold'] = threshold

      elif opt.opdict['method'] == 'lrsk':
        # LOGISTIC REGRESSION (scikit learn)
        print "********* Logistic regression (sklearn) **********"
        out = implement_lr_sklearn(x_train,x_test,y_train,y_test)
        threshold, theta = {},{}
        for it in range(len(out['thetas'])):
          threshold[it+1] = 0.5
          theta[it+1] = np.append(out['thetas'][it][-1],out['thetas'][it][:-1])
        out['threshold'] = threshold
        out['thetas'] = theta

      elif opt.opdict['method'] == 'lr':
        # LOGISTIC REGRESSION
        print "********* Logistic regression **********"
        from LR_functions import do_all_logistic_regression
        out = do_all_logistic_regression(x_train,x_test,x_cv,y_train,y_test,y_cv)
        theta = out['thetas']
        threshold = out['threshold']
        if 'learn_file' in sorted(opt.opdict):
          learn_filename = opt.opdict['learn_file']
          if not os.path.exists(learn_filename):
            wtr = write_binary_file(learn_filename,i_train)

      CLASS_test = out['label_test']
      CLASS_train = out['label_train']

      # TRAINING SET
      print "\t *TRAINING SET"
      y_train_np = y_train.NumType.values.ravel()  
      from sklearn.metrics import confusion_matrix
      cmat_train = confusion_matrix(y_train_np,CLASS_train)
      p_tr = dic_percent(cmat_train,opt.types,verbose=True)
      out['rate_train'] = p_tr
      print "   Global : %.2f%%"%p_tr['global']
      if opt.opdict['plot_confusion'] or opt.opdict['save_confusion']:
        plot_confusion_mat(cmat_train,opt.types,'Training',opt.opdict['method'].upper())
        if opt.opdict['save_confusion']:
          savefig = '%s/training_%s.png'%(opt.opdict['fig_path'],opt.opdict['result_file'])
          print "Confusion matrix saved in %s"%savefig
          plt.savefig(savefig)

      # TEST SET
      print "\t *TEST SET"
      y_test_np = y_test.NumType.values.ravel()
      cmat_test = confusion_matrix(y_test_np,CLASS_test)
      p_test = dic_percent(cmat_test,opt.types,verbose=True)
      out['rate_test'] = p_test
      print "   Global : %.2f%%"%p_test['global']
      if opt.opdict['plot_confusion'] or opt.opdict['save_confusion']:
        plot_confusion_mat(cmat_test,opt.types,'Test',opt.opdict['method'].upper())
        if opt.opdict['save_confusion']:
          savefig = '%s/test_%s.png'%(opt.opdict['fig_path'],opt.opdict['result_file'])
          print "Confusion matrix saved in %s"%savefig
          plt.savefig(savefig)
        if opt.opdict['plot_confusion']:
          plt.show()
        else:
          plt.close()

      # PLOT PRECISION AND RECALL
      if opt.opdict['plot_prec_rec']:
        from LR_functions import normalize,plot_precision_recall
        x_train, x_test = normalize(x_train,x_test)
        plot_precision_recall(x_train,y_train.NumType,x_test,y_test.NumType,theta)

      pourcentages = (p_tr['global'],p_test['global'])
      out['method'] = opt.opdict['method']
      out['types'] = opt.types
      opt.out = out

      # PLOT DECISION BOUNDARIES
      n_feat = x_train.shape[1] # number of features
      if n_feat < 4:
        if opt.opdict['plot_sep'] or opt.opdict['save_sep']:
          print "\nPLOTTING"
          print "Theta values:",theta
          print "Threshold:", threshold

          # COMPARE AND PLOT LR AND SVM RESULTS
          out_svm, out_nl = {},{}
          dir = '%s_SEP'%opt.opdict['method'].upper()
          if opt.opdict['method']=='lr' and opt.opdict['compare']:
            dir = 'LR_SVM_SEP'
            out_svm = implement_svm(x_train,x_test,y_train,y_test,opt.types,opt.opdict,kern='Lin')
            cmat_svm_tr = confusion_matrix(y_train_np,out_svm['label_train'])
            cmat_svm_test = confusion_matrix(y_test_np,out_svm['label_test'])
            svm_ptr = dic_percent(cmat_svm_tr,opt.types)
            svm_pt = dic_percent(cmat_svm_test,opt.types)
            theta_svm,t_svm = {},{}
            for it in range(len(out_svm['thetas'])):
              theta_svm[it+1] = np.append(out_svm['thetas'][it][-1],out_svm['thetas'][it][:-1])
              t_svm[it+1] = 0.5
            out_svm['thetas'] = theta_svm
            out_svm['threshold'] = t_svm
            out_svm['rate_test'] = svm_pt
            out_svm['rate_train'] = svm_ptr
            out_svm['method'] = 'SVM'

          if opt.opdict['method'] in ['lr','svm'] and opt.opdict['compare_nl']:
            dir = '%s_NL_SEP'%opt.opdict['method'].upper()
            out_nl = implement_svm(x_train,x_test,y_train,y_test,opt.types,opt.opdict,kern='NonLin')
            cmat_svm_tr = confusion_matrix(y_train_np,out_nl['label_train'])
            cmat_svm_test = confusion_matrix(y_test_np,out_nl['label_test'])
            svm_ptr = dic_percent(cmat_svm_tr,opt.types)
            svm_pt = dic_percent(cmat_svm_test,opt.types)
            out_nl['rate_test'] = svm_pt
            out_nl['rate_train'] = svm_ptr
            out_nl['method'] = 'SVM_NL'

          save_dir = os.path.join(opt.opdict['fig_path'],dir)
          opt.verify_and_create(save_dir)

          from LR_functions import normalize
          x_train, x_test = normalize(x_train,x_test)

          x_train_good = x_train.reindex(index=y_train[y_train.NumType.values==CLASS_train].index)
          x_train_bad = x_train.reindex(index=y_train[y_train.NumType.values!=CLASS_train].index)
          good_train = y_train.reindex(index=x_train_good.index)

          x_test_good = x_test.reindex(index=y_test[y_test.NumType.values==CLASS_test].index)
          x_test_bad = x_test.reindex(index=y_test[y_test.NumType.values!=CLASS_test].index)

          # PLOT FOR 1 ATTRIBUTE AND 2 CLASSES
          if n_feat == 1 and len(opt.opdict['types']) == 2:
            name = opt.opdict['feat_list'][0]
            from plot_functions import plot_hyp_func_1f, histo_pdfs
            if opt.opdict['method']=='lr' and opt.opdict['compare']:
              plot_hyp_func_1f(x_train,y_train,theta,opt.opdict['method'],threshold=threshold,x_ok=x_test_good,x_bad=x_test_bad,th_comp=theta_svm,cmat_test=cmat_test,cmat_svm=cmat_svm_test,cmat_train=cmat_train)
            else:
              #histo_pdfs(x_test,y_test,x_train=x_train,y_train=y_train)
              plot_hyp_func_1f(x_train,y_train,theta,opt.opdict['method'],threshold=threshold,x_ok=x_test_good,x_bad=x_test_bad,cmat_test=cmat_test,cmat_train=cmat_train)

          # PLOT FOR 2 ATTRIBUTES AND 2 to 3 CLASSES
          elif n_feat == 2:
            name = '%s_%s'%(opt.opdict['feat_list'][0],opt.opdict['feat_list'][1])
            if opt.opdict['method']=='lr' and opt.opdict['compare']:
              from plot_2features import plot_2f_all
              plot_2f_all(out,x_train,y_train,x_test,y_test,x_test_bad,out_comp=out_svm,map_nl=out_nl)
            elif opt.opdict['method'] in ['lr','svm']:
              from plot_2features import plot_2f_all
              plot_2f_all(out,x_train,y_train,x_test,y_test,x_test_bad)
            elif opt.opdict['method'] == 'svm_nl':
              from plot_2features import plot_2f_nonlinear
              plot_2f_nonlinear(out,x_train,y_train,x_test,y_test,y_train=y_train)

          # PLOT FOR 3 ATTRIBUTES
          elif n_feat == 3:
            from plot_functions import plot_db_3d
            plot_db_3d(x_train,y_train.NumType,theta[1],title='Training set')
            plot_db_3d(x_test,y_test.NumType,theta[1],title='Test set')
            name = '%s_%s_%s'%(opt.opdict['feat_list'][0],opt.opdict['feat_list'][1],opt.opdict['feat_list'][2])

          if opt.opdict['save_sep']:
            savename = '%s/CL_sep_%s.png'%(save_dir,name)
            print "Figure saved in %s"%savename
            plt.savefig(savename)
          if opt.opdict['plot_sep']:
            plt.show()
          else:
            plt.close()

      # WRITE RESULTS INTO A DICTIONARY
      subsubdic['%'] = pourcentages
      trad_CLASS_test = []
      for i in CLASS_test:
        i = int(i)
        trad_CLASS_test.append(opt.types[i])
      subsubdic['classification'] = trad_CLASS_test
      if opt.opdict['probas']:
        subsubdic['proba'] = out['probas']
      if opt.opdict['plot_var']:
        subsubdic['out'] = out
      subdic[b] = subsubdic

    if opt.opdict['plot_var'] and opt.opdict['method'] in ['lr','svm','lrsk'] and n_feat==2 and len(opt.opdict['types'])==2:
      from plot_2features import plot_2f_variability
      plot_2f_variability(subdic,x_train,y_train,x_test,y_test)
      plt.savefig('%s/%s_variability_pas.png'%(opt.opdict['fig_path'],opt.opdict['method'].upper()))
      plt.show()


    dic_results[opt.trad[isc]] = subdic

  dic_results['header'] = {}
  dic_results['header']['features'] = opt.opdict['feat_list']
  dic_results['header']['types'] = opt.opdict['types']
  dic_results['header']['catalog'] = opt.opdict['label_test']

  if opt.opdict['method'] in ['lr','lrsk','svm','svm_nl']:
    print "Save results in file %s"%opt.opdict['result_path']
    write_binary_file(opt.opdict['result_path'],dic_results)

  if 'train_file' in sorted(opt.opdict):
    if not os.path.exists(opt.opdict['train_file']) and opt.opdict['boot'] > 1:
      write_binary_file(opt.opdict['train_file'],TRAIN_Y)
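
# `read_binary_file` and `write_binary_file` are not shown in this listing; they are used
# to dump and reload the result dictionaries. A minimal pickle-based sketch with the same
# call pattern is given below as an assumption, not as the original code.
def demo_write_binary_file(filename, obj):
  import pickle
  with open(filename, 'wb') as fh:
    pickle.dump(obj, fh)

def demo_read_binary_file(filename):
  import pickle
  with open(filename, 'rb') as fh:
    return pickle.load(fh)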