# Example #1
# 0
def plot_sep(opt):

    opt.set_params()
    opt.opdict['fig_path'] = os.path.join(opt.opdict['outdir'],'figures')

    from do_classification import classifier
    ### LINEAR SVM ###
    opt.opdict['method'] = 'svm'
    classifier(opt)
    #opt.plot_PDFs()
    out_svm = opt.out
    print "SVM", out_svm['thetas']

    x_train = opt.train_x
    x_test = opt.x
    y_train = opt.train_y
    y_test = opt.y

    # *** Plot ***
    plot_2f_synthetics(out_svm,x_train,x_test,y_test,y_train=y_train)
    #plt.savefig('%s/Test_%dc_%s_SVM.png'%(opt.opdict['fig_path'],len(opt.types),opt.sep))
    plt.show()

    ### LOGISTIC REGRESSION ###
    opt.opdict['method'] = 'lr'
    out_lr = {}
    for b in range(1):
      opt.opdict['learn_file'] = os.path.join(opt.opdict['libdir'],'LR_%d'%b)
      #os.remove(opt.opdict['learn_file'])
      classifier(opt)
      out_lr[b] = opt.out

    print "LR", out_lr[0]['thetas']

    # *** Plot ***
    if b == 0:
      print sorted(out_lr[0])
      plot_2f_synthetics(out_lr[0],x_train,x_test,y_test,y_train=y_train)
      #plt.savefig('%s/Test_%dc_%s_LR.png'%(opt.opdict['fig_path'],len(opt.types),opt.sep))
      plt.show()

    else:
      plot_2f_synth_var(out_lr,x_train,x_test,y_test,opt.NB_test)
      #plt.savefig('%s/Test_%dc_LR_%s.png'%(opt.opdict['fig_path'],len(opt.types),opt.sep))
      plt.show()

    if opt.opdict['plot_prec_rec']:
      plot_2f_synth_var(out_lr,x_train,x_test,y_test,opt.NB_test)
      #plt.savefig('%s/Test_%dc_bad_threshold.png'%(opt.opdict['fig_path'],len(opt.types)))
      plt.show()


    ### NON LINEAR SVM ###
    opt.opdict['method'] = 'svm_nl'
    classifier(opt)
    out_svm_nl = opt.out
    plot_2f_nonlinear(out_svm_nl,x_train,x_test,y_test,y_train=y_train,synth=True)
    #plt.savefig('%s/Test_%dc_%s_SVM_NL.png'%(opt.opdict['fig_path'],len(opt.types),opt.sep))
    plt.show()

    ### COMPARE ALL 3 METHODS ON THE SAME PLOT ###
    plot_2f_synthetics(out_lr[0],x_train,x_test,y_test,out_comp=out_svm,y_train=y_train,map_nl=out_svm_nl)
    #plt.savefig('%s/Test_%dc_%s.png'%(opt.opdict['fig_path'],len(opt.types),opt.sep))
    plt.show()
# Example #2
# 0
def plot_sep(opt):

    opt.set_params()
    opt.opdict['fig_path'] = os.path.join(opt.opdict['outdir'], 'figures')

    from do_classification import classifier
    ### LINEAR SVM ###
    opt.opdict['method'] = 'svm'
    classifier(opt)
    #opt.plot_PDFs()
    out_svm = opt.out
    print "SVM", out_svm['thetas']

    x_train = opt.train_x
    x_test = opt.x
    y_train = opt.train_y
    y_test = opt.y

    # *** Plot ***
    plot_2f_synthetics(out_svm, x_train, x_test, y_test, y_train=y_train)
    #plt.savefig('%s/Test_%dc_%s_SVM.png'%(opt.opdict['fig_path'],len(opt.types),opt.sep))
    plt.show()

    ### LOGISTIC REGRESSION ###
    opt.opdict['method'] = 'lr'
    out_lr = {}
    for b in range(1):
        opt.opdict['learn_file'] = os.path.join(opt.opdict['libdir'],
                                                'LR_%d' % b)
        #os.remove(opt.opdict['learn_file'])
        classifier(opt)
        out_lr[b] = opt.out

    print "LR", out_lr[0]['thetas']

    # *** Plot ***
    if b == 0:
        print sorted(out_lr[0])
        plot_2f_synthetics(out_lr[0], x_train, x_test, y_test, y_train=y_train)
        #plt.savefig('%s/Test_%dc_%s_LR.png'%(opt.opdict['fig_path'],len(opt.types),opt.sep))
        plt.show()

    else:
        plot_2f_synth_var(out_lr, x_train, x_test, y_test, opt.NB_test)
        #plt.savefig('%s/Test_%dc_LR_%s.png'%(opt.opdict['fig_path'],len(opt.types),opt.sep))
        plt.show()

    if opt.opdict['plot_prec_rec']:
        plot_2f_synth_var(out_lr, x_train, x_test, y_test, opt.NB_test)
        #plt.savefig('%s/Test_%dc_bad_threshold.png'%(opt.opdict['fig_path'],len(opt.types)))
        plt.show()

    ### NON LINEAR SVM ###
    opt.opdict['method'] = 'svm_nl'
    classifier(opt)
    out_svm_nl = opt.out
    plot_2f_nonlinear(out_svm_nl,
                      x_train,
                      x_test,
                      y_test,
                      y_train=y_train,
                      synth=True)
    #plt.savefig('%s/Test_%dc_%s_SVM_NL.png'%(opt.opdict['fig_path'],len(opt.types),opt.sep))
    plt.show()

    ### COMPARE ALL 3 METHODS ON THE SAME PLOT ###
    plot_2f_synthetics(out_lr[0],
                       x_train,
                       x_test,
                       y_test,
                       out_comp=out_svm,
                       y_train=y_train,
                       map_nl=out_svm_nl)
    #plt.savefig('%s/Test_%dc_%s.png'%(opt.opdict['fig_path'],len(opt.types),opt.sep))
    plt.show()
def classifier(opt):
  """
  Classification of the different types of events.
  opt is an object of the class Options()
  """

  list_attr = opt.__dict__.keys()
  if not 'x' in list_attr:
    opt.do_tri()

  X = opt.x
  Y = opt.y

  list_attr = opt.__dict__.keys()
  if 'train_x' in list_attr:
    X_TRAIN = opt.train_x
    Y_TRAIN = opt.train_y

  dic_results = {}
  for isc in sorted(opt.xs):

    print "==========",opt.trad[isc],"=========="
    subdic = {}

    if isc > 0:
      if opt.trad[isc][0] == sta_prev:
        marker_sta = 1
      else:
        marker_sta = 0
        sta_prev = opt.trad[isc][0]
    else:
      marker_sta = 0
      sta_prev = opt.trad[isc][0]

    if len(opt.xs[isc]) == 0:
      continue


    # About the training set
    if len(opt.opdict['stations']) == 1 and opt.opdict['boot'] > 1 and 'train_x' not in list_attr:
      if os.path.exists(opt.opdict['train_file']):
        print opt.opdict['train_file']
        TRAIN_Y = read_binary_file(opt.opdict['train_file'])
      else:
        TRAIN_Y = {}
        for tir in range(opt.opdict['boot']):
          TRAIN_Y[tir] = {}
    elif 'train_x' in list_attr:
      opt.x = opt.xs_train[isc]
      opt.y = opt.ys_train[isc]
      if opt.opdict['plot_pdf']:
        opt.compute_pdfs()
        g_train = opt.gaussians
        del opt.gaussians
      opt.classname2number()
      x_ref_train = opt.x
      y_ref_train = opt.y


    # About the test set
    opt.x = opt.xs[isc]
    opt.y = opt.ys[isc]
    if opt.opdict['plot_pdf']:
      opt.compute_pdfs()
 
    set = pd.DataFrame(index=opt.ys[isc].index,columns=['Otime'])
    set['Otime'] = opt.xs[isc].index

    opt.classname2number()
    x_test = opt.x
    y_ref = opt.y
    x_ref = opt.x

    if opt.opdict['plot_dataset']:
      opt.composition_dataset()

    #K = len(opt.types)

    ### ITERATE OVER TRAINING SET DRAWS ###
    for b in range(opt.opdict['boot']):
      print "\n-------------------- # iter: %d --------------------\n"%(b+1)

      subsubdic = {}
      print "WHOLE SET", x_ref.shape, y_ref.shape

      ### if there is no pre-defined training set ###
      if 'train_x' not in list_attr:
        x_train = x_test.copy()
        if len(opt.opdict['stations']) == 1 and opt.opdict['boot'] > 1:
          if len(TRAIN_Y[b]) > 0:
            y_train = y_ref.reindex(index=TRAIN_Y[b]['training_set'])
            y_train = y_train.dropna(how='any')
            y_cv = y_ref.reindex(index=TRAIN_Y[b]['cv_set'])
            y_cv = y_cv.dropna(how='any')
            y_test = y_ref.reindex(index=TRAIN_Y[b]['test_set'])
            y_test = y_test.dropna(how='any')
          else:
            y_train, y_cv, y_test = generate_datasets(opt.opdict['proportions'],opt.numt,y_ref)
            TRAIN_Y[b]['training_set'] = map(int,list(y_train.index))
            TRAIN_Y[b]['cv_set'] = map(int,list(y_cv.index))
            TRAIN_Y[b]['test_set'] = map(int,list(y_test.index))

        ### multi-stations case ###
        else:
          if marker_sta == 0:
            y_train, y_cv, y_test = generate_datasets(opt.opdict['proportions'],opt.numt,y_ref)
            list_ev_train = y_train.index
            list_ev_cv = y_cv.index
            list_ev_test = y_test.index
          else:
            y_train = y_ref.reindex(index=list_ev_train)
            y_train = y_train.dropna(how='any')
            y_cv = y_ref.reindex(index=list_ev_cv)
            y_cv = y_cv.dropna(how='any')
            y_test = y_ref.reindex(index=list_ev_test)
            y_test = y_test.dropna(how='any')

        x_train = x_ref.reindex(index=y_train.index)

      ### if a training set was pre-defined ###
      else:
        x_train = x_ref_train.copy()
        y_train = y_ref_train.copy()
        y_train, y_cv, y_test = generate_datasets(opt.opdict['proportions'],opt.numt,y_ref,y_train=y_train)

      x_cv = x_ref.reindex(index=y_cv.index)
      x_test = x_ref.reindex(index=y_test.index)

      i_train = y_train.index
      x_train.index = range(x_train.shape[0])
      y_train.index = range(y_train.shape[0])
      print "TRAINING SET", x_train.shape, y_train.shape
      if x_train.shape[0] != y_train.shape[0]:
        print "Training set: Incoherence in x and y dimensions"
        sys.exit()

      i_cv = y_cv.index
      x_cv.index = range(x_cv.shape[0])
      y_cv.index = range(y_cv.shape[0])
      print "CROSS-VALIDATION SET", x_cv.shape, y_cv.shape
      if x_cv.shape[0] != y_cv.shape[0]:
        print "Cross-validation set: Incoherence in x and y dimensions"
        sys.exit()

      subsubdic['list_ev'] = np.array(y_test.index)

      i_test = y_test.index
      x_test.index = range(x_test.shape[0])
      y_test.index = range(y_test.shape[0])
      print "TEST SET", x_test.shape, y_test.shape
      if x_test.shape[0] != y_test.shape[0]:
        print "Test set: Incoherence in x and y dimensions"
        sys.exit()

      opt.train_x = x_train
      opt.x = x_test
      opt.train_y = y_train
      opt.y = y_test

      if opt.opdict['plot_pdf']:
        opt.plot_all_pdfs(save=opt.opdict['save_pdf'])
        if 'train_x' in list_attr:
          opt.plot_superposed_pdfs(g_train,save=opt.opdict['save_pdf'])
        else:
          opt.plot_all_pdfs(save=opt.opdict['save_pdf'])

      if opt.opdict['method'] == '1b1':
        # EXTRACTEURS
        print "********** EXTRACTION 1-BY-1 **********"
        opt.opdict['boot'] = 1
        one_by_one(opt,x_ref,y_ref,set['Otime'],boot=10,method='svm')
        continue

      elif opt.opdict['method'] == 'ova':
        print "********** EXTRACTION 1-VS-ALL **********"
        opt.opdict['boot'] = 1
        one_vs_all(opt,x_ref,y_ref,set['Otime'],boot=10,method='svm')
        continue

      elif opt.opdict['method'] in ['svm','svm_nl']:
        # SVM
        print "********** SVM **********"
        if opt.opdict['method'] == 'svm':
          kern = 'Lin'
        else:
          kern = 'NonLin'

        out = implement_svm(x_train,x_test,y_train,y_test,opt.types,opt.opdict,kern=kern,proba=opt.opdict['probas'])

        if 'map' in sorted(out):
          opt.map = out['map']

        if 'thetas' in sorted(out):
          theta_vec = out['thetas']
          theta,threshold = {},{}
          for it in range(len(theta_vec)):
            theta[it+1] = np.append(theta_vec[it][-1],theta_vec[it][:-1])
            threshold[it+1] = 0.5
          out['thetas'] = theta
          out['threshold'] = threshold

      elif opt.opdict['method'] == 'lrsk':
        # LOGISTIC REGRESSION (scikit learn)
        print "********* Logistic regression (sklearn) **********"
        out = implement_lr_sklearn(x_train,x_test,y_train,y_test)
        threshold, theta = {},{}
        for it in range(len(out['thetas'])):
          threshold[it+1] = 0.5
          theta[it+1] = np.append(out['thetas'][it][-1],out['thetas'][it][:-1])
        out['threshold'] = threshold
        out['thetas'] = theta

      elif opt.opdict['method'] == 'lr':
        # LOGISTIC REGRESSION
        print "********* Logistic regression **********"
        from LR_functions import do_all_logistic_regression
        out = do_all_logistic_regression(x_train,x_test,x_cv,y_train,y_test,y_cv)
        theta = out['thetas']
        threshold = out['threshold']
        if 'learn_file' in sorted(opt.opdict):
          learn_filename = opt.opdict['learn_file']
          if not os.path.exists(learn_filename):
            wtr = write_binary_file(learn_filename,i_train)

      CLASS_test = out['label_test']
      CLASS_train = out['label_train']

      # TRAINING SET
      print "\t *TRAINING SET"
      y_train_np = y_train.NumType.values.ravel()  
      from sklearn.metrics import confusion_matrix
      cmat_train = confusion_matrix(y_train_np,CLASS_train)
      p_tr = dic_percent(cmat_train,opt.types,verbose=True)
      out['rate_train'] = p_tr
      print "   Global : %.2f%%"%p_tr['global']
      if opt.opdict['plot_confusion'] or opt.opdict['save_confusion']:
        plot_confusion_mat(cmat_train,opt.types,'Training',opt.opdict['method'].upper())
        if opt.opdict['save_confusion']:
          savefig = '%s/training_%s.png'%(opt.opdict['fig_path'],opt.opdict['result_file'])
          print "Confusion matrix saved in %s"%savefig
          plt.savefig(savefig)

      # TEST SET
      print "\t *TEST SET"
      y_test_np = y_test.NumType.values.ravel()
      cmat_test = confusion_matrix(y_test_np,CLASS_test)
      p_test = dic_percent(cmat_test,opt.types,verbose=True)
      out['rate_test'] = p_test
      print "   Global : %.2f%%"%p_test['global']
      if opt.opdict['plot_confusion'] or opt.opdict['save_confusion']:
        plot_confusion_mat(cmat_test,opt.types,'Test',opt.opdict['method'].upper())
        if opt.opdict['save_confusion']:
          savefig = '%s/test_%s.png'%(opt.opdict['fig_path'],opt.opdict['result_file'])
          print "Confusion matrix saved in %s"%savefig
          plt.savefig(savefig)
        if opt.opdict['plot_confusion']:
          plt.show()
        else:
          plt.close()

      # PLOT PRECISION AND RECALL
      if opt.opdict['plot_prec_rec']:
        from LR_functions import normalize,plot_precision_recall
        x_train, x_test = normalize(x_train,x_test)
        plot_precision_recall(x_train,y_train.NumType,x_test,y_test.NumType,theta)

      pourcentages = (p_tr['global'],p_test['global'])
      out['method'] = opt.opdict['method']
      out['types'] = opt.types
      opt.out = out

      # PLOT DECISION BOUNDARIES
      n_feat = x_train.shape[1] # number of features
      if n_feat < 4:
        if opt.opdict['plot_sep'] or opt.opdict['save_sep']:
          print "\nPLOTTING"
          print "Theta values:",theta
          print "Threshold:", threshold

          # COMPARE AND PLOT LR AND SVM RESULTS
          out_svm, out_nl = {},{}
          dir = '%s_SEP'%opt.opdict['method'].upper()
          if opt.opdict['method']=='lr' and opt.opdict['compare']:
            dir = 'LR_SVM_SEP'
            out_svm = implement_svm(x_train,x_test,y_train,y_test,opt.types,opt.opdict,kern='Lin')
            cmat_svm_tr = confusion_matrix(y_train_np,out_svm['label_train'])
            cmat_svm_test = confusion_matrix(y_test_np,out_svm['label_test'])
            svm_ptr = dic_percent(cmat_svm_tr,opt.types)
            svm_pt = dic_percent(cmat_svm_test,opt.types)
            theta_svm,t_svm = {},{}
            for it in range(len(out_svm['thetas'])):
              theta_svm[it+1] = np.append(out_svm['thetas'][it][-1],out_svm['thetas'][it][:-1])
              t_svm[it+1] = 0.5
            out_svm['thetas'] = theta_svm
            out_svm['threshold'] = t_svm
            out_svm['rate_test'] = svm_pt
            out_svm['rate_train'] = svm_ptr
            out_svm['method'] = 'SVM'

          if opt.opdict['method'] in ['lr','svm'] and opt.opdict['compare_nl']:
            dir = '%s_NL_SEP'%opt.opdict['method'].upper()
            out_nl = implement_svm(x_train,x_test,y_train,y_test,opt.types,opt.opdict,kern='NonLin')
            cmat_svm_tr = confusion_matrix(y_train_np,out_nl['label_train'])
            cmat_svm_test = confusion_matrix(y_test_np,out_nl['label_test'])
            svm_ptr = dic_percent(cmat_svm_tr,opt.types)
            svm_pt = dic_percent(cmat_svm_test,opt.types)
            out_nl['rate_test'] = svm_pt
            out_nl['rate_train'] = svm_ptr
            out_nl['method'] = 'SVM_NL'

          save_dir = os.path.join(opt.opdict['fig_path'],dir)
          opt.verify_and_create(save_dir)

          from LR_functions import normalize
          x_train, x_test = normalize(x_train,x_test)

          x_train_good = x_train.reindex(index=y_train[y_train.NumType.values==CLASS_train].index)
          x_train_bad = x_train.reindex(index=y_train[y_train.NumType.values!=CLASS_train].index)
          good_train = y_train.reindex(index=x_train_good.index)

          x_test_good = x_test.reindex(index=y_test[y_test.NumType.values==CLASS_test].index)
          x_test_bad = x_test.reindex(index=y_test[y_test.NumType.values!=CLASS_test].index)

          # PLOT FOR 1 ATTRIBUTE AND 2 CLASSES
          if n_feat == 1 and len(opt.opdict['types']) == 2:
            name = opt.opdict['feat_list'][0]
            from plot_functions import plot_hyp_func_1f, histo_pdfs
            if opt.opdict['method']=='lr' and opt.opdict['compare']:
              plot_hyp_func_1f(x_train,y_train,theta,opt.opdict['method'],threshold=threshold,x_ok=x_test_good,x_bad=x_test_bad,th_comp=theta_svm,cmat_test=cmat_test,cmat_svm=cmat_svm_test,cmat_train=cmat_train)
            else:
              #histo_pdfs(x_test,y_test,x_train=x_train,y_train=y_train)
              plot_hyp_func_1f(x_train,y_train,theta,opt.opdict['method'],threshold=threshold,x_ok=x_test_good,x_bad=x_test_bad,cmat_test=cmat_test,cmat_train=cmat_train)

          # PLOT FOR 2 ATTRIBUTES AND 2 to 3 CLASSES
          elif n_feat == 2:
            name = '%s_%s'%(opt.opdict['feat_list'][0],opt.opdict['feat_list'][1])
            if opt.opdict['method'] in ['lr','svm']:
              from plot_2features import plot_2f_all
              plot_2f_all(out,x_train,y_train,x_test,y_test,x_test_bad)
            elif opt.opdict['method']=='lr' and opt.opdict['compare']:
              from plot_2features import plot_2f_all
              plot_2f_all(out,x_train,y_train,x_test,y_test,x_test_bad,out_comp=out_svm,map_nl=out_nl)
            elif opt.opdict['method'] == 'svm_nl':
              from plot_2features import plot_2f_nonlinear
              plot_2f_nonlinear(out,x_train,y_train,x_test,y_test,y_train=y_train)

          # PLOT FOR 3 ATTRIBUTES
          elif n_feat == 3:
            from plot_functions import plot_db_3d
            plot_db_3d(x_train,y_train.NumType,theta[1],title='Training set')
            plot_db_3d(x_test,y_test.NumType,theta[1],title='Test set')
            name = '%s_%s_%s'%(opt.opdict['feat_list'][0],opt.opdict['feat_list'][1],opt.opdict['feat_list'][2])

          if opt.opdict['save_sep']:
            savename = '%s/CL_sep_%s.png'%(save_dir,name)
            print "Figure saved in %s"%savename
            plt.savefig(savename)
          if opt.opdict['plot_sep']:
            plt.show()
          else:
            plt.close()

      # WRITE RESULTS INTO A DICTIONARY
      subsubdic['%'] = pourcentages
      trad_CLASS_test = []
      for i in CLASS_test:
        i = int(i)
        trad_CLASS_test.append(opt.types[i])
      subsubdic['classification'] = trad_CLASS_test
      if opt.opdict['probas']:
        subsubdic['proba'] = out['probas']
      if opt.opdict['plot_var']:
        subsubdic['out'] = out
      subdic[b] = subsubdic

    if opt.opdict['plot_var'] and opt.opdict['method'] in ['lr','svm','lrsk'] and n_feat==2 and len(opt.opdict['types'])==2:
      from plot_2features import plot_2f_variability
      plot_2f_variability(subdic,x_train,y_train,x_test,y_test)
      plt.savefig('%s/%s_variability_pas.png'%(opt.opdict['fig_path'],opt.opdict['method'].upper()))
      plt.show()


    dic_results[opt.trad[isc]] = subdic

  dic_results['header'] = {}
  dic_results['header']['features'] = opt.opdict['feat_list']
  dic_results['header']['types'] = opt.opdict['types']
  dic_results['header']['catalog'] = opt.opdict['label_test']

  if opt.opdict['method'] in ['lr','lrsk','svm','svm_nl']:
    print "Save results in file %s"%opt.opdict['result_path']
    write_binary_file(opt.opdict['result_path'],dic_results)

  if 'train_file' in sorted(opt.opdict):
    if not os.path.exists(opt.opdict['train_file']) and opt.opdict['boot'] > 1:
      write_binary_file(opt.opdict['train_file'],TRAIN_Y)