Example #1
# NOTE: These examples are Python 2 code. They assume the imports below, plus
# project-local helpers (implement_kmean, plot_diagrams, results_diagrams,
# all_diagrams, plot_and_compare_pdfs, generate_datasets, implement_svm,
# implement_lr_sklearn, dic_percent, plot_confusion_mat, read_binary_file,
# write_binary_file) available in the module namespace.
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def classifier(opt):

  """
  Classification of the different types of events.
  opt is an object of the class Options()
  By default, the number of classes K that is searched in the dataset is equal to 
  the number of classes in the catalog, but it can be modified directly in the 
  code.
  """
  opt.do_tri()

  X = opt.x
  Y = opt.y

  dic_results = {}
  for isc in sorted(opt.xs):

    print "==========",opt.trad[isc],"=========="
    subdic = {}

    if isc > 0 and opt.trad[isc][0] == sta_prev:
      marker_sta = 1
    else:
      marker_sta = 0
      sta_prev = opt.trad[isc][0]

    if len(opt.xs[isc]) == 0:
      continue

    x_test = opt.xs[isc]
    y_test = opt.ys[isc]

    opt.classname2number()
    K = len(opt.types)

    for b in range(opt.opdict['boot']):
      print "\n-------------------- # iter: %d --------------------\n"%(b+1)

      subsubdic = {}

      print "# types in the test set:",len(opt.types)

      subsubdic['list_ev'] = np.array(y_test.index)

      x_test.index = range(x_test.shape[0])
      y_test.index = range(y_test.shape[0])
      print x_test.shape, y_test.shape
      if x_test.shape[0] != y_test.shape[0]:
        print "Test set: Incoherence in x and y dimensions"
        sys.exit()

      if opt.opdict['method'] == 'kmeans':
        # K-means
        print "********** KMean **********"
        K = 2  # overrides K = len(opt.types); see the docstring
        CLASS_test = implement_kmean(x_test,K)
      else:
        # Guard: CLASS_test would otherwise be undefined below
        print "Unknown unsupervised method: %s"%opt.opdict['method']
        sys.exit()


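      # Match each k-means cluster to the manual catalog classes: for every
      # cluster i, count how many events of each manual type it contains.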
      trad,dicocl = {},{}
      for t in opt.types:
        dicocl[t] = []
      for i in range(K):
        auto = CLASS_test[CLASS_test.values.ravel()==i]
        man = y_test.reindex(index=auto.index,columns=['Type'])
        print "Size of class %d : %d"%(i,len(auto))

        nbs = [len(man[man.values.ravel()==j]) for j in opt.types]
        trad[i] = np.array(opt.types)[np.argsort(nbs)]
        for j in range(len(opt.types)):
          print "\tNumber of %s : %d"%(trad[i][j],np.sort(nbs)[j])
          dicocl[trad[i][j]].append(np.sort(nbs)[j])

      if K == len(opt.types):
        types_trad = np.array(opt.types).copy()
        for key in sorted(dicocl):
          types_trad[np.argmax(dicocl[key])] = key
      else:
        types_trad=[]

      ### PLOT DIAGRAMS ###
      if opt.opdict['plot_confusion'] or opt.opdict['save_confusion']:
        if K > 4:
          plot_diagrams(CLASS_test,y_test)
          #results_histo(CLASS_test,y_test)
          results_diagrams(CLASS_test,y_test)
        else:
          all_diagrams(CLASS_test,y_test,trad=types_trad)
        if opt.opdict['save_confusion']:
          savefig = '%s/unsup_diagrams_%df_ini.png'%(opt.opdict['fig_path'],len(opt.opdict['feat_list']))
          plt.savefig(savefig)
          print "Figure saved in %s"%savefig
        if opt.opdict['plot_confusion']:
          plt.show()
        else:
          plt.close()

      opt.x = x_test
      opt.y = y_test

      if opt.opdict['plot_pdf']:
        opt.compute_pdfs()
        g_test = opt.gaussians
      
        opt.y = CLASS_test
        opt.compute_pdfs()
        g_unsup = opt.gaussians

        plot_and_compare_pdfs(g_test,g_unsup)
      
      subsubdic['NumClass'] = CLASS_test.values.ravel()
      if list(types_trad):
        trad_CLASS_test = []
        for i in CLASS_test.values:
          i = int(i)
          trad_CLASS_test.append(types_trad[i])
        subsubdic['StrClass'] = trad_CLASS_test
        subsubdic['Equivalence'] = types_trad
      subdic[b] = subsubdic

    dic_results[opt.trad[isc]] = subdic

  dic_results['header'] = {}
  dic_results['header']['features'] = opt.opdict['feat_list']
  dic_results['header']['types'] = opt.opdict['types']
  dic_results['header']['catalog'] = opt.opdict['label_test']

  print "Save results in file %s"%opt.opdict['result_path']
  write_binary_file(opt.opdict['result_path'],dic_results)
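
# Usage sketch (hypothetical; assumes an Options() object configured by the
# surrounding project, with opdict keys such as 'method', 'boot' and
# 'result_path' filled in):
#
#   opt = Options()
#   opt.opdict['method'] = 'kmeans'
#   classifier(opt)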
Example #2
def one_by_one(opt,x_test_ref0,y_test_ref0,otimes_ref,boot=1,method='lr'):

  """
  Per class extractor.
  Extract one class after each other by order of importance. The events which are 
  classified are deleted from the next extraction.
  boot = number of training sets to be generated
  method = 'lr' for Logistic Regression / 'svm' for SVM
  """

  from LR_functions import do_all_logistic_regression

  types = opt.types
  numt = opt.numt

  len_numt = len(numt)
  # Dictionary for results
  DIC = {}
  DIC['features'] = x_test_ref0.columns 

  EXT = {}
  for num_ext in range(len_numt):
    EXT[num_ext] = {}
    EXT[num_ext]['nb_tot'] = []
    for t in numt:
      EXT[num_ext]['nb_%s'%types[t]] = []

  p_train, p_cv, p_test = opt.opdict['proportions']

  for b in range(boot):

    otimes = map(int,list(otimes_ref.values))
    otimes = np.array(otimes)

    x_test_ref = x_test_ref0.copy()
    y_test_ref = y_test_ref0.copy()

    print "\n\tONE BY ONE EXTRACTION ------ iteration %d"%b
    dic = {}

    inum = 0
    for n in range(len_numt):

      sub_dic={}

      ### Splitting of the whole set in training, CV and test sets ###
      y_train_ref, y_cv, y_test_ref = generate_datasets(opt.opdict['proportions'],opt.numt,y_test_ref)
      y_test_ref = pd.concat([y_cv,y_test_ref])
      i_train = y_train_ref.index
      i_cv = y_cv.index
      i_test = y_test_ref.index

      ### Defining the training set ###
      x_train = x_test_ref.reindex(index=y_train_ref.index)
      y_train_ref.index = range(y_train_ref.shape[0])
      x_train.index = range(x_train.shape[0])
      if inum == 0:
        list_i_train = [list(otimes[map(int,list(y_train_ref.index))])]
      else:
        list_i_train.append(list(otimes[map(int,list(y_train_ref.index))]))

      ### Defining the test set ###
      x_test = x_test_ref.reindex(index=y_test_ref.index)
      x_test.index = range(x_test.shape[0])
      y_test_ref.index = range(y_test_ref.shape[0])
      if inum == 0:
        list_i_test = [list(otimes[map(int,list(y_test_ref.index))])]
      else:
        list_i_test.append(list(otimes[map(int,list(y_test_ref.index))]))

      if x_train.shape[0] != y_train_ref.shape[0]:
        print "Training set: Incoherence in x and y dimensions"
        sys.exit()

      if x_test.shape[0] != y_test_ref.shape[0]:
        print "Test set: Incoherence in x and y dimensions"
        sys.exit()

      y_train = y_train_ref.copy()
      y_test = y_test_ref.copy()

      EXT[n]['nb_tot'].append(len(x_test))
      for t in numt:
        EXT[n]['nb_%s'%types[t]].append(len(y_test[y_test.NumType==t]))

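      # Binarize the labels: the class being extracted becomes 0, everything
      # else becomes 1 ("Rest").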
      y_train[y_train_ref.NumType==n] = 0
      y_test[y_test_ref.NumType==n] = 0
      y_train[y_train_ref.NumType!=n] = 1
      y_test[y_test_ref.NumType!=n] = 1

      t = [types[n],'Rest']
      print y_train.shape[0], y_test.shape[0]

      print "----------- %s vs all -----------"%types[n]

      if method == 'lr':
        print "Logistic Regression\n"
        out = do_all_logistic_regression(x_test_ref0,y_test_ref0,i_train,i_cv,i_test)
      elif method == 'svm':
        kern = 'NonLin'
        print "SVM\n"
        out = implement_svm(x_train,x_test,y_train,y_test,opt.types,opt.opdict,kern=kern)

      CLASS_test = out['label_test']
      CLASS_train = out['label_train']

      # TRAINING SET
      print "\t *TRAINING SET"
      y_train_np = y_train.NumType.values.ravel()  
      from sklearn.metrics import confusion_matrix
      cmat_train = confusion_matrix(y_train_np,CLASS_train)
      p_tr = dic_percent(cmat_train,[types[n],'Rest'],verbose=True)
      out['rate_train'] = p_tr
      print "   Global : %.2f%%"%p_tr['global']
      if opt.opdict['plot_confusion'] or opt.opdict['save_confusion']:
        plot_confusion_mat(cmat_train,opt.types,'Training',opt.opdict['method'].upper())
        if opt.opdict['save_confusion']:
          savefig = '%s/training_%s.png'%(opt.opdict['fig_path'],opt.opdict['result_file'])
          print "Confusion matrix saved in %s"%savefig
          plt.savefig(savefig)

      # TEST SET
      print "\t *TEST SET"
      y_test_np = y_test.NumType.values.ravel()
      cmat_test = confusion_matrix(y_test_np,CLASS_test)
      p_test = dic_percent(cmat_test,[types[n],'Rest'],verbose=True)
      out['rate_test'] = p_test
      print "   Global : %.2f%%"%p_test['global']
      if opt.opdict['plot_confusion'] or opt.opdict['save_confusion']:
        plot_confusion_mat(cmat_test,opt.types,'Test',opt.opdict['method'].upper())
        if opt.opdict['save_confusion']:
          savefig = '%s/test_%s.png'%(opt.opdict['fig_path'],opt.opdict['result_file'])
          print "Confusion matrix saved in %s"%savefig
          plt.savefig(savefig)
        if opt.opdict['plot_confusion']:
          plt.show()
        else:
          plt.close()

      # Fill the dictionary
      i_com = np.where((y_test.NumType.values.ravel()-CLASS_test)==0)[0]
      i_lr = np.where(CLASS_test==0)[0]
      i_ok_class = np.intersect1d(i_com,i_lr) # events classified in the class of interest by the LR and identical to the manual classification

      sub_dic["nb"] = len(i_lr) # total number of events classified in the class of interest
      sub_dic["nb_common"] = len(i_ok_class)
      sub_dic["index_ok"] = otimes[i_ok_class]
      sub_dic["nb_other"],sub_dic["i_other"] = [],[]
      for k in range(len_numt):
        if k != n:
          i_other_man = list(y_test_ref[y_test_ref.NumType==k].index)
          ii = np.intersect1d(i_lr,i_other_man)
          sub_dic["nb_other"].append((types[k],len(ii)))
          sub_dic["i_other"].append((types[k],otimes[ii]))
      sub_dic["rate_%s"%types[n]] = (out['rate_train'][('%s'%types[n], 0)], out['rate_test'][('%s'%types[n], 0)])
      sub_dic["rate_rest"] = (out['rate_train'][('Rest', 1)], out['rate_test'][('Rest', 1)])
      sub_dic["nb_manuals"] = ((types[n],len(y_test[y_test.NumType==0])),('Rest',len(y_test[y_test.NumType==1])))

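      # Events not assigned to the extracted class (label != 0) stay in the
      # pool for the next extraction round.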
      i_ok_test = i_test[np.where(CLASS_test!=0)[0]]
      i_ok_train = i_train[np.where(CLASS_train!=0)[0]]
      i_ok = np.concatenate([i_ok_test,i_ok_train])
      otimes = i_ok
      y_test_ref = y_test_ref0.reindex(index=map(int,list(i_ok)))

      dic[types[n]] = sub_dic
      inum = inum + 1

    dic['i_train'] = list_i_train
    dic['i_test'] = list_i_test
    DIC[b] = dic

  filename = opt.opdict['result_path']
  print "One-by-One results stored in %s"%filename
  write_binary_file(filename,DIC)

  filename = '%s/stats_OBO'%os.path.dirname(opt.opdict['result_path'])
  write_binary_file(filename,EXT)
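
# Usage sketch (hypothetical; mirrors the call made from the supervised
# classifier() further below):
#
#   one_by_one(opt, x_ref, y_ref, otimes, boot=10, method='svm')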
def one_vs_all(opt,x_test_ref,y_test_ref,otimes_ref,boot=1,method='lr'):

  """
  Extract one class among the whole data.
  One vs All extractor.
  """

  from LR_functions import do_all_logistic_regression

  types = opt.types
  numt = opt.numt
  len_numt = len(numt)

  DIC = {}
  DIC['features'] = x_test_ref.columns
  for b in range(boot):

    print "\n\tONE VS ALL EXTRACTION ------ iteration %d"%b

    dic = {}
    otimes = map(str,list(otimes_ref.values))
    otimes = np.array(otimes)

    ### Splitting of the whole set in training, CV and test sets ###
    y_train, y_cv, y_test = generate_datasets(opt.opdict['proportions'],opt.numt,y_test_ref)
    i_train = y_train.index
    i_cv = y_cv.index
    i_test = y_test.index

    ### Defining the training set ###
    x_train = x_test_ref.reindex(index=y_train.index)
    y_train.index = range(y_train.shape[0])
    x_train.index = range(x_train.shape[0])
    dic["i_train"] = otimes[map(int,list(y_train.index))]

    ### Defining the test set ###
    x_test = x_test_ref.reindex(index=y_test.index)
    x_test.index = range(x_test.shape[0])
    y_test.index = range(y_test.shape[0])
    dic["i_test"] = otimes[map(int,list(y_test.index))]

    y_train_tir = y_train.copy()
    y_test_tir = y_test.copy()

    for n in range(len_numt):

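      # Binarize the labels for the one-vs-all round: class n -> 0, rest -> 1.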
      y_train[y_train_tir.NumType==n] = 0
      y_test[y_test_tir.NumType==n] = 0
      y_train[y_train_tir.NumType!=n] = 1
      y_test[y_test_tir.NumType!=n] = 1

      print y_train.shape[0], y_test.shape[0]

      print "----------- %s vs all -----------"%types[n]
      print_type = [types[n],'All']

      if method == 'lr':
        print "Logistic Regression\n"
        i_train = y_train.index
        i_cv = y_cv.index
        i_test = y_test.index
        out = do_all_logistic_regression(x_test_ref,y_test_ref,i_train,i_cv,i_test)
      elif method == 'svm':
        kern = 'NonLin'
        print "SVM\n"
        out = implement_svm(x_train,x_test,y_train,y_test,opt.types,opt.opdict,kern=kern)

      CLASS_test = out['label_test']
      CLASS_train = out['label_train']

      # TRAINING SET
      print "\t *TRAINING SET"
      y_train_np = y_train.NumType.values.ravel()  
      from sklearn.metrics import confusion_matrix
      cmat_train = confusion_matrix(y_train_np,CLASS_train)
      p_tr = dic_percent(cmat_train,[types[n],'Rest'],verbose=True)
      out['rate_train'] = p_tr
      print "   Global : %.2f%%"%p_tr['global']
      if opt.opdict['plot_confusion'] or opt.opdict['save_confusion']:
        plot_confusion_mat(cmat_train,opt.types,'Training',opt.opdict['method'].upper())
        if opt.opdict['save_confusion']:
          savefig = '%s/training_%s.png'%(opt.opdict['fig_path'],opt.opdict['result_file'])
          print "Confusion matrix saved in %s"%savefig
          plt.savefig(savefig)

      # TEST SET
      print "\t *TEST SET"
      y_test_np = y_test.NumType.values.ravel()
      cmat_test = confusion_matrix(y_test_np,CLASS_test)
      p_test = dic_percent(cmat_test,[types[n],'Rest'],verbose=True)
      out['rate_test'] = p_test
      print "   Global : %.2f%%"%p_test['global']
      if opt.opdict['plot_confusion'] or opt.opdict['save_confusion']:
        plot_confusion_mat(cmat_test,opt.types,'Test',opt.opdict['method'].upper())
        if opt.opdict['save_confusion']:
          savefig = '%s/test_%s.png'%(opt.opdict['fig_path'],opt.opdict['result_file'])
          print "Confusion matrix saved in %s"%savefig
          plt.savefig(savefig)
        if opt.opdict['plot_confusion']:
          plt.show()
        else:
          plt.close()


      # Fill the dictionary
      sub_dic={}
      i_com = np.where((y_test.NumType.values.ravel()-CLASS_test)==0)[0]
      i_lr = np.where(CLASS_test==0)[0]
      i_ok_class = np.intersect1d(i_com,i_lr) # events classified in the class of interest by the LR and identical to the manual classification
      sub_dic["nb"] = len(i_lr) # total number of events classified in the class of interest
      sub_dic["nb_common"] = len(i_ok_class) # total number of well classified events
      sub_dic["index_ok"] = otimes[i_ok_class] # index of well classified events
      sub_dic["nb_other"],sub_dic["i_other"] = [],[]
      for k in range(len_numt):
        if k != n:
          i_other_man = list(y_test_tir[y_test_tir.NumType==k].index)
          ii = np.intersect1d(i_lr,i_other_man)
          sub_dic["nb_other"].append((types[k],len(ii))) # number of events belonging to another class
          sub_dic["i_other"].append((types[k],otimes[ii])) # index of events belonging to another class
      sub_dic["rate_%s"%types[n]] = (out['rate_train'][('%s'%types[n], 0)], out['rate_test'][('%s'%types[n], 0)]) # % success rate of the extracted class
      sub_dic["rate_rest"] = (out['rate_train'][('Rest', 1)], out['rate_test'][('Rest', 1)]) # % success rate of the rest
      sub_dic["nb_manuals"] = ((types[n],len(y_test[y_test.NumType==0])),('Rest',len(y_test[y_test.NumType==1])))
      dic[types[n]] = sub_dic

    DIC[b] = dic

  filename = opt.opdict['result_path']
  print "One-vs-All results stored in %s"%filename
  write_binary_file(filename,DIC)
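
# Usage sketch (hypothetical; mirrors the call made from classifier() below):
#
#   one_vs_all(opt, x_ref, y_ref, otimes, boot=10, method='svm')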
def classifier(opt):
  """
  Classification of the different types of events.
  opt is an object of the class Options()
  """

  list_attr = opt.__dict__.keys()
  if 'x' not in list_attr:
    opt.do_tri()

  X = opt.x
  Y = opt.y

  list_attr = opt.__dict__.keys()
  if 'train_x' in list_attr:
    X_TRAIN = opt.train_x
    Y_TRAIN = opt.train_y

  dic_results = {}
  for isc in sorted(opt.xs):

    print "==========",opt.trad[isc],"=========="
    subdic = {}

    if isc > 0 and opt.trad[isc][0] == sta_prev:
      marker_sta = 1
    else:
      marker_sta = 0
      sta_prev = opt.trad[isc][0]

    if len(opt.xs[isc]) == 0:
      continue


    # About the training set
    if len(opt.opdict['stations']) == 1 and opt.opdict['boot'] > 1 and 'train_x' not in list_attr:
      if os.path.exists(opt.opdict['train_file']):
        print opt.opdict['train_file']
        TRAIN_Y = read_binary_file(opt.opdict['train_file'])
      else:
        TRAIN_Y = {}
        for tir in range(opt.opdict['boot']):
          TRAIN_Y[tir] = {}
    elif 'train_x' in list_attr:
      opt.x = opt.xs_train[isc]
      opt.y = opt.ys_train[isc]
      if opt.opdict['plot_pdf']:
        opt.compute_pdfs()
        g_train = opt.gaussians
        del opt.gaussians
      opt.classname2number()
      x_ref_train = opt.x
      y_ref_train = opt.y


    # About the test set
    opt.x = opt.xs[isc]
    opt.y = opt.ys[isc]
    if opt.opdict['plot_pdf']:
      opt.compute_pdfs()
 
    otimes_df = pd.DataFrame(index=opt.ys[isc].index,columns=['Otime'])
    otimes_df['Otime'] = opt.xs[isc].index

    opt.classname2number()
    x_test = opt.x
    y_ref = opt.y
    x_ref = opt.x

    if opt.opdict['plot_dataset']:
      opt.composition_dataset()

    #K = len(opt.types)

    ### ITERATE OVER TRAINING SET DRAWS ###
    for b in range(opt.opdict['boot']):
      print "\n-------------------- # iter: %d --------------------\n"%(b+1)

      subsubdic = {}
      print "WHOLE SET", x_ref.shape, y_ref.shape

      ### if there is no pre-defined training set ###
      if 'train_x' not in list_attr:
        x_train = x_test.copy()
        if len(opt.opdict['stations']) == 1 and opt.opdict['boot'] > 1:
          if len(TRAIN_Y[b]) > 0:
            y_train = y_ref.reindex(index=TRAIN_Y[b]['training_set'])
            y_train = y_train.dropna(how='any')
            y_cv = y_ref.reindex(index=TRAIN_Y[b]['cv_set'])
            y_cv = y_cv.dropna(how='any')
            y_test = y_ref.reindex(index=TRAIN_Y[b]['test_set'])
            y_test = y_test.dropna(how='any')
          else:
            y_train, y_cv, y_test = generate_datasets(opt.opdict['proportions'],opt.numt,y_ref)
            TRAIN_Y[b]['training_set'] = map(int,list(y_train.index))
            TRAIN_Y[b]['cv_set'] = map(int,list(y_cv.index))
            TRAIN_Y[b]['test_set'] = map(int,list(y_test.index))

        ### multi-stations case ###
        else:
          if marker_sta == 0:
            y_train, y_cv, y_test = generate_datasets(opt.opdict['proportions'],opt.numt,y_ref)
            list_ev_train = y_train.index
            list_ev_cv = y_cv.index
            list_ev_test = y_test.index
          else:
            y_train = y_ref.reindex(index=list_ev_train)
            y_train = y_train.dropna(how='any')
            y_cv = y_ref.reindex(index=list_ev_cv)
            y_cv = y_cv.dropna(how='any')
            y_test = y_ref.reindex(index=list_ev_test)
            y_test = y_test.dropna(how='any')

        x_train = x_ref.reindex(index=y_train.index)

      ### if a training set was pre-defined ###
      else:
        x_train = x_ref_train.copy()
        y_train = y_ref_train.copy()
        y_train, y_cv, y_test = generate_datasets(opt.opdict['proportions'],opt.numt,y_ref,y_train=y_train)

      x_cv = x_ref.reindex(index=y_cv.index)
      x_test = x_ref.reindex(index=y_test.index)

      i_train = y_train.index
      x_train.index = range(x_train.shape[0])
      y_train.index = range(y_train.shape[0])
      print "TRAINING SET", x_train.shape, y_train.shape
      if x_train.shape[0] != y_train.shape[0]:
        print "Training set: Incoherence in x and y dimensions"
        sys.exit()

      i_cv = y_cv.index
      x_cv.index = range(x_cv.shape[0])
      y_cv.index = range(y_cv.shape[0])
      print "CROSS-VALIDATION SET", x_cv.shape, y_cv.shape
      if x_cv.shape[0] != y_cv.shape[0]:
        print "Cross-validation set: Incoherence in x and y dimensions"
        sys.exit()

      subsubdic['list_ev'] = np.array(y_test.index)

      i_test = y_test.index
      x_test.index = range(x_test.shape[0])
      y_test.index = range(y_test.shape[0])
      print "TEST SET", x_test.shape, y_test.shape
      if x_test.shape[0] != y_test.shape[0]:
        print "Test set: Incoherence in x and y dimensions"
        sys.exit()

      opt.train_x = x_train
      opt.x = x_test
      opt.train_y = y_train
      opt.y = y_test

      if opt.opdict['plot_pdf']:
        opt.plot_all_pdfs(save=opt.opdict['save_pdf'])
        if 'train_x' in list_attr:
          opt.plot_superposed_pdfs(g_train,save=opt.opdict['save_pdf'])

      if opt.opdict['method'] == '1b1':
        # EXTRACTORS
        print "********** EXTRACTION 1-BY-1 **********"
        opt.opdict['boot'] = 1
        one_by_one(opt,x_ref,y_ref,otimes_df['Otime'],boot=10,method='svm')
        continue

      elif opt.opdict['method'] == 'ova':
        print "********** EXTRACTION 1-VS-ALL **********"
        opt.opdict['boot'] = 1
        one_vs_all(opt,x_ref,y_ref,otimes_df['Otime'],boot=10,method='svm')
        continue

      elif opt.opdict['method'] in ['svm','svm_nl']:
        # SVM
        print "********** SVM **********"
        if opt.opdict['method'] == 'svm':
          kern = 'Lin'
        else:
          kern = 'NonLin'

        out = implement_svm(x_train,x_test,y_train,y_test,opt.types,opt.opdict,kern=kern,proba=opt.opdict['probas'])

        if 'map' in out:
          opt.map = out['map']

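        # Reorder each coefficient vector so that the intercept (stored last)
        # comes first, matching the theta layout used by the 'lr'/'lrsk'
        # branches, and use a fixed 0.5 decision threshold.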
        if 'thetas' in out:
          theta_vec = out['thetas']
          theta,threshold = {},{}
          for it in range(len(theta_vec)):
            theta[it+1] = np.append(theta_vec[it][-1],theta_vec[it][:-1])
            threshold[it+1] = 0.5
          out['thetas'] = theta
          out['threshold'] = threshold

      elif opt.opdict['method'] == 'lrsk':
        # LOGISTIC REGRESSION (scikit learn)
        print "********* Logistic regression (sklearn) **********"
        out = implement_lr_sklearn(x_train,x_test,y_train,y_test)
        threshold, theta = {},{}
        for it in range(len(out['thetas'])):
          threshold[it+1] = 0.5
          theta[it+1] = np.append(out['thetas'][it][-1],out['thetas'][it][:-1])
        out['threshold'] = threshold
        out['thetas'] = theta

      elif opt.opdict['method'] == 'lr':
        # LOGISTIC REGRESSION
        print "********* Logistic regression **********"
        from LR_functions import do_all_logistic_regression
        out = do_all_logistic_regression(x_train,x_test,x_cv,y_train,y_test,y_cv)
        theta = out['thetas']
        threshold = out['threshold']
        if 'learn_file' in opt.opdict:
          learn_filename = opt.opdict['learn_file']
          if not os.path.exists(learn_filename):
            wtr = write_binary_file(learn_filename,i_train)

      CLASS_test = out['label_test']
      CLASS_train = out['label_train']

      # TRAINING SET
      print "\t *TRAINING SET"
      y_train_np = y_train.NumType.values.ravel()  
      from sklearn.metrics import confusion_matrix
      cmat_train = confusion_matrix(y_train_np,CLASS_train)
      p_tr = dic_percent(cmat_train,opt.types,verbose=True)
      out['rate_train'] = p_tr
      print "   Global : %.2f%%"%p_tr['global']
      if opt.opdict['plot_confusion'] or opt.opdict['save_confusion']:
        plot_confusion_mat(cmat_train,opt.types,'Training',opt.opdict['method'].upper())
        if opt.opdict['save_confusion']:
          savefig = '%s/training_%s.png'%(opt.opdict['fig_path'],opt.opdict['result_file'])
          print "Confusion matrix saved in %s"%savefig
          plt.savefig(savefig)

      # TEST SET
      print "\t *TEST SET"
      y_test_np = y_test.NumType.values.ravel()
      cmat_test = confusion_matrix(y_test_np,CLASS_test)
      p_test = dic_percent(cmat_test,opt.types,verbose=True)
      out['rate_test'] = p_test
      print "   Global : %.2f%%"%p_test['global']
      if opt.opdict['plot_confusion'] or opt.opdict['save_confusion']:
        plot_confusion_mat(cmat_test,opt.types,'Test',opt.opdict['method'].upper())
        if opt.opdict['save_confusion']:
          savefig = '%s/test_%s.png'%(opt.opdict['fig_path'],opt.opdict['result_file'])
          print "Confusion matrix saved in %s"%savefig
          plt.savefig(savefig)
        if opt.opdict['plot_confusion']:
          plt.show()
        else:
          plt.close()

      # PLOT PRECISION AND RECALL
      if opt.opdict['plot_prec_rec']:
        from LR_functions import normalize,plot_precision_recall
        x_train, x_test = normalize(x_train,x_test)
        plot_precision_recall(x_train,y_train.NumType,x_test,y_test.NumType,theta)

      pourcentages = (p_tr['global'],p_test['global'])
      out['method'] = opt.opdict['method']
      out['types'] = opt.types
      opt.out = out

      # PLOT DECISION BOUNDARIES
      n_feat = x_train.shape[1] # number of features
      if n_feat < 4:
        if opt.opdict['plot_sep'] or opt.opdict['save_sep']:
          print "\nPLOTTING"
          print "Theta values:",theta
          print "Threshold:", threshold

          # COMPARE AND PLOT LR AND SVM RESULTS
          out_svm, out_nl = {},{}
          fig_dir = '%s_SEP'%opt.opdict['method'].upper()
          if opt.opdict['method']=='lr' and opt.opdict['compare']:
            fig_dir = 'LR_SVM_SEP'
            out_svm = implement_svm(x_train,x_test,y_train,y_test,opt.types,opt.opdict,kern='Lin')
            cmat_svm_tr = confusion_matrix(y_train_np,out_svm['label_train'])
            cmat_svm_test = confusion_matrix(y_test_np,out_svm['label_test'])
            svm_ptr = dic_percent(cmat_svm_tr,opt.types)
            svm_pt = dic_percent(cmat_svm_test,opt.types)
            theta_svm,t_svm = {},{}
            for it in range(len(out_svm['thetas'])):
              theta_svm[it+1] = np.append(out_svm['thetas'][it][-1],out_svm['thetas'][it][:-1])
              t_svm[it+1] = 0.5
            out_svm['thetas'] = theta_svm
            out_svm['threshold'] = t_svm
            out_svm['rate_test'] = svm_pt
            out_svm['rate_train'] = svm_ptr
            out_svm['method'] = 'SVM'

          if opt.opdict['method'] in ['lr','svm'] and opt.opdict['compare_nl']:
            fig_dir = '%s_NL_SEP'%opt.opdict['method'].upper()
            out_nl = implement_svm(x_train,x_test,y_train,y_test,opt.types,opt.opdict,kern='NonLin')
            cmat_svm_tr = confusion_matrix(y_train_np,out_nl['label_train'])
            cmat_svm_test = confusion_matrix(y_test_np,out_nl['label_test'])
            svm_ptr = dic_percent(cmat_svm_tr,opt.types)
            svm_pt = dic_percent(cmat_svm_test,opt.types)
            out_nl['rate_test'] = svm_pt
            out_nl['rate_train'] = svm_ptr
            out_nl['method'] = 'SVM_NL'

          save_dir = os.path.join(opt.opdict['fig_path'],fig_dir)
          opt.verify_and_create(save_dir)

          from LR_functions import normalize
          x_train, x_test = normalize(x_train,x_test)

          x_train_good = x_train.reindex(index=y_train[y_train.NumType.values==CLASS_train].index)
          x_train_bad = x_train.reindex(index=y_train[y_train.NumType.values!=CLASS_train].index)
          good_train = y_train.reindex(index=x_train_good.index)

          x_test_good = x_test.reindex(index=y_test[y_test.NumType.values==CLASS_test].index)
          x_test_bad = x_test.reindex(index=y_test[y_test.NumType.values!=CLASS_test].index)

          # PLOT FOR 1 ATTRIBUTE AND 2 CLASSES
          # Default figure-name suffix; guards against cases where none of the
          # branches below assigns it.
          name = '_'.join(opt.opdict['feat_list'])
          if n_feat == 1 and len(opt.opdict['types']) == 2:
            name = opt.opdict['feat_list'][0]
            from plot_functions import plot_hyp_func_1f, histo_pdfs
            if opt.opdict['method']=='lr' and opt.opdict['compare']:
              plot_hyp_func_1f(x_train,y_train,theta,opt.opdict['method'],threshold=threshold,x_ok=x_test_good,x_bad=x_test_bad,th_comp=theta_svm,cmat_test=cmat_test,cmat_svm=cmat_svm_test,cmat_train=cmat_train)
            else:
              #histo_pdfs(x_test,y_test,x_train=x_train,y_train=y_train)
              plot_hyp_func_1f(x_train,y_train,theta,opt.opdict['method'],threshold=threshold,x_ok=x_test_good,x_bad=x_test_bad,cmat_test=cmat_test,cmat_train=cmat_train)

          # PLOT FOR 2 ATTRIBUTES AND 2 to 3 CLASSES
          elif n_feat == 2:
            name = '%s_%s'%(opt.opdict['feat_list'][0],opt.opdict['feat_list'][1])
            if opt.opdict['method']=='lr' and opt.opdict['compare']:
              # This more specific case must be tested before the generic
              # 'lr'/'svm' branch, which would otherwise always catch it.
              from plot_2features import plot_2f_all
              plot_2f_all(out,x_train,y_train,x_test,y_test,x_test_bad,out_comp=out_svm,map_nl=out_nl)
            elif opt.opdict['method'] in ['lr','svm']:
              from plot_2features import plot_2f_all
              plot_2f_all(out,x_train,y_train,x_test,y_test,x_test_bad)
            elif opt.opdict['method'] == 'svm_nl':
              from plot_2features import plot_2f_nonlinear
              plot_2f_nonlinear(out,x_train,y_train,x_test,y_test)

          # PLOT FOR 3 ATTRIBUTES
          elif n_feat == 3:
            from plot_functions import plot_db_3d
            plot_db_3d(x_train,y_train.NumType,theta[1],title='Training set')
            plot_db_3d(x_test,y_test.NumType,theta[1],title='Test set')
            name = '%s_%s_%s'%(opt.opdict['feat_list'][0],opt.opdict['feat_list'][1],opt.opdict['feat_list'][2])

          if opt.opdict['save_sep']:
            savename = '%s/CL_sep_%s.png'%(save_dir,name)
            print "Figure saved in %s"%savename
            plt.savefig(savename)
          if opt.opdict['plot_sep']:
            plt.show()
          else:
            plt.close()

      # WRITE RESULTS INTO A DICTIONARY
      subsubdic['%'] = pourcentages
      trad_CLASS_test = []
      for i in CLASS_test:
        i = int(i)
        trad_CLASS_test.append(opt.types[i])
      subsubdic['classification'] = trad_CLASS_test
      if opt.opdict['probas']:
        subsubdic['proba'] = out['probas']
      if opt.opdict['plot_var']:
        subsubdic['out'] = out
      subdic[b] = subsubdic

    if opt.opdict['plot_var'] and opt.opdict['method'] in ['lr','svm','lrsk'] and n_feat==2 and len(opt.opdict['types'])==2:
      from plot_2features import plot_2f_variability
      plot_2f_variability(subdic,x_train,y_train,x_test,y_test)
      plt.savefig('%s/%s_variability_pas.png'%(opt.opdict['fig_path'],opt.opdict['method'].upper()))
      plt.show()


    dic_results[opt.trad[isc]] = subdic

  dic_results['header'] = {}
  dic_results['header']['features'] = opt.opdict['feat_list']
  dic_results['header']['types'] = opt.opdict['types']
  dic_results['header']['catalog'] = opt.opdict['label_test']

  if opt.opdict['method'] in ['lr','lrsk','svm','svm_nl']:
    print "Save results in file %s"%opt.opdict['result_path']
    write_binary_file(opt.opdict['result_path'],dic_results)

  if 'train_file' in opt.opdict:
    if not os.path.exists(opt.opdict['train_file']) and opt.opdict['boot'] > 1:
      write_binary_file(opt.opdict['train_file'],TRAIN_Y)
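
# Usage sketch (hypothetical; assumes an Options() object whose opdict selects
# one of the supervised methods handled above):
#
#   opt = Options()
#   opt.opdict['method'] = 'lr'   # or 'svm', 'svm_nl', 'lrsk', '1b1', 'ova'
#   classifier(opt)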