Ejemplo n.º 1
0
def test3():
    """Run a SmartSearch over a text-classification pipeline.

    Loads the training subset of two 20-newsgroups categories, builds a
    CountVectorizer -> TfidfTransformer -> SGDClassifier pipeline and
    launches a 30-iteration SmartSearch over a handful of its parameters.
    """
    # Display progress logs on stdout
    log_format = '%(asctime)s %(levelname)s %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_format)

    # Categories loaded from the training set; use None instead to run the
    # analysis on all the categories.
    categories = ['alt.atheism', 'talk.religion.misc']

    print("Loading 20 newsgroups dataset for categories:")
    print(categories)

    newsgroups = fetch_20newsgroups(subset='train', categories=categories)
    print("%d documents" % len(newsgroups.filenames))
    print("%d categories" % len(newsgroups.target_names))
    print()

    # Text feature extraction followed by a simple linear classifier.
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier()),
    ]
    text_clf = Pipeline(steps)

    # Each entry is [kind, values]; presumably 'float' takes [low, high]
    # bounds and 'cat' takes the candidate choices -- confirm against
    # SmartSearch. Adding more parameters gives better exploring power but
    # increases processing time in a combinatorial way. Other candidates:
    # 'vect__max_features', 'tfidf__use_idf', 'clf__n_iter'.
    parameters = {
        'vect__max_df': ['float', [0.5, 1.]],
        # unigrams or bigrams
        'vect__ngram_range': ['cat', [(1, 1), (1, 2)]],
        'tfidf__norm': ['cat', ('l1', 'l2')],
        'clf__alpha': ['float', [0.000001, 0.0001]],
        'clf__penalty': ['cat', ['l2', 'elasticnet']],
    }

    search = SmartSearch(
        parameters,
        estimator=text_clf,
        X=newsgroups.data,
        y=newsgroups.target,
        n_iter=30,
    )
    search._fit()
Ejemplo n.º 2
0
def test2():
    """Run a SmartSearch with model='GP' against a constant scoring function."""

    def scoring_function(x):
        # The argument is ignored: every candidate scores [0.5].
        return [0.5]

    # One categorical, one integer and one float parameter.
    parameters = dict(
        kernel=['cat', ['rbf', 'poly']],
        d=['int', [1, 3]],
        C=['float', [1, 10]],
    )

    search = SmartSearch(
        parameters,
        model='GP',
        estimator=scoring_function,
        n_iter=15,
        n_init=10,
        n_final_iter=3,
    )
    search._fit()
Ejemplo n.º 3
0
def test1():
    """Run a 20-iteration SmartSearch over a RandomForestClassifier.

    The data comes from ``load_digits()``; the forest uses 20 estimators.
    """
    digits = load_digits()
    features = digits.data
    labels = digits.target
    forest = RandomForestClassifier(n_estimators=20)

    # Search space, one [kind, values] entry per estimator parameter.
    # NOTE(review): max_depth has identical lower and upper values, so it is
    # presumably held fixed at 3 -- confirm this is intended.
    # NOTE(review): recent scikit-learn releases reject min_samples_split=1;
    # confirm the lower bound against the installed version.
    parameters = {
        "max_depth": ['int', [3, 3]],
        "max_features": ['int', [1, 11]],
        "min_samples_split": ['int', [1, 11]],
        "min_samples_leaf": ['int', [1, 11]],
        "bootstrap": ['cat', [True, False]],
        "criterion": ['cat', ["gini", "entropy"]],
    }

    search = SmartSearch(
        parameters,
        estimator=forest,
        X=features,
        y=labels,
        n_iter=20,
    )
    search._fit()
Ejemplo n.º 4
0
def runExperiment(first_exp,
                  n_exp,
                  parameters,
                  model='GCP',
                  n_random_init=10,
                  n_total_iter=30,
                  n_candidates=500,
                  corr_kernel='squared_exponential',
                  acquisition_function='UCB',
                  n_clusters=1,
                  cluster_evol='constant',
                  GCP_mapWithNoise=False,
                  GCP_useAllNoisyY=False,
                  model_noise=None):
    """Replay SmartSearch experiments against scores stored on disk.

    Stored results are read from ``scoring_function/output.csv`` (one list of
    floats per line) and ``scoring_function/params.csv`` (one parameter
    vector per row). Instead of evaluating a real model, the estimator
    handed to SmartSearch looks up the stored row whose parameters are
    nearest to the queried ones and returns five of its stored outputs.

    For every experiment index in ``range(first_exp, first_exp + n_exp)`` a
    directory ``exp_results/exp<index>`` is used and, for each entry ``i``
    returned by the search, the files ``output_<i>.csv``, ``param_<i>.csv``
    and ``param_path_<i>.csv`` are written into it.

    Parameters
    ----------
    first_exp : int
        Index of the first experiment to run.
    n_exp : int
        Number of consecutive experiments to run.
    parameters : dict
        Search space forwarded to SmartSearch. Its length fixes the size of
        the query vector, and the keys of the dicts received by the
        estimator are converted with ``int()`` to index that vector.
    model, n_random_init, n_total_iter, n_candidates, corr_kernel,
    acquisition_function, n_clusters, cluster_evol, GCP_mapWithNoise,
    GCP_useAllNoisyY, model_noise
        Forwarded unchanged to SmartSearch (``n_random_init`` as ``n_init``
        and ``n_total_iter`` as ``n_iter``).
    """
    last_exp = first_exp + n_exp
    # Single-argument print calls behave identically under Python 2 and 3.
    print('Run experiment %s to %s' % (first_exp, last_exp))

    # Load data
    output = []
    with open("scoring_function/output.csv", 'r') as f:
        for raw_line in f:
            # Drop the first character and the last three; presumably the
            # enclosing brackets plus the line terminator -- TODO confirm
            # against the file format.
            fields = raw_line[1:-3].split(',')
            output.append([float(v) for v in fields])
    print('Loaded output file, %s rows' % len(output))

    params = np.genfromtxt("scoring_function/params.csv", delimiter=',')
    print('Loaded parameters file, shape : %s' % (params.shape,))

    KNN = NearestNeighbors()
    KNN.fit(params)

    # function that retrieves a performance evaluation from the stored results
    def get_cv_res(p_dict):
        p = np.zeros(len(parameters))
        for k, v in p_dict.items():
            p[int(k)] = v
        # Query with an explicit (1, n_features) array and pull out a plain
        # int: a list cannot be reliably indexed with a 1-element ndarray.
        neighbors = KNN.kneighbors(p.reshape(1, -1), 1, return_distance=False)
        idx = int(neighbors[0][0])
        all_o = output[idx]
        # Floor division keeps the bound an integer under Python 3 as well.
        r = np.random.randint(len(all_o) // 5)
        return all_o[(5 * r):(5 * r + 5)]

    ###  Run experiment  ###

    # A dedicated loop variable avoids shadowing the n_exp argument.
    for exp_idx in range(first_exp, last_exp):
        print(' ****   Run exp %s   ****' % exp_idx)
        ### set directory
        exp_dir = os.path.join("exp_results", "exp" + str(exp_idx))
        if not os.path.exists(exp_dir):
            # makedirs also creates "exp_results" when it is missing.
            os.makedirs(exp_dir)
        else:
            print('Warning : directory already exists')

        search = SmartSearch(parameters,
                             estimator=get_cv_res,
                             corr_kernel=corr_kernel,
                             GCP_mapWithNoise=GCP_mapWithNoise,
                             GCP_useAllNoisyY=GCP_useAllNoisyY,
                             model_noise=model_noise,
                             model=model,
                             n_candidates=n_candidates,
                             n_iter=n_total_iter,
                             n_init=n_random_init,
                             n_clusters=n_clusters,
                             cluster_evol=cluster_evol,
                             verbose=2,
                             acquisition_function=acquisition_function,
                             detailed_res=2)

        # The fourth returned value (mean outputs) is not saved.
        all_parameters, all_search_path, all_raw_outputs, _ = search._fit()

        ## save experiment's data
        for i, raw_outputs in enumerate(all_raw_outputs):
            out_path = os.path.join(exp_dir, "output_" + str(i) + ".csv")
            with open(out_path, 'w') as f:
                for line in raw_outputs:
                    f.write(str(line) + '\n')
            np.savetxt(os.path.join(exp_dir, "param_" + str(i) + ".csv"),
                       all_parameters[i], delimiter=",")
            np.savetxt(os.path.join(exp_dir, "param_path_" + str(i) + ".csv"),
                       all_search_path[i], delimiter=",")

        print(' ****   End experiment %s   ****\n' % exp_idx)
Ejemplo n.º 5
0
# Search settings consumed by the SmartSearch call in this script.
acquisition_function = 'UCB'  # passed as acquisition_function
n_iter = 100                  # passed as n_iter
nb_iter_final = 0             # passed as n_final_iter


def scoring_function(p_dict):
    """Score a candidate by evaluating ``branin`` at (x - 5, y).

    ``p_dict`` must provide the keys 'x' and 'y'.
    """
    raw_x = p_dict['x']
    y_value = p_dict['y']
    # Only x is shifted before the evaluation; y is passed through as-is.
    shifted_x = raw_x - 5.
    return branin(shifted_x, y_value)


# Configure the search over `parameters`, scored by scoring_function.
search = SmartSearch(
    parameters,
    estimator=scoring_function,
    # surrogate model settings
    model=sampling_model,
    model_noise=model_noise,
    corr_kernel=corr_kernel,
    GCP_mapWithNoise=mapWithNoise,
    # acquisition and candidate sampling
    acquisition_function=acquisition_function,
    n_candidates=n_candidates,
    # iteration budget
    n_init=n_random_init,
    n_iter=n_iter,
    n_final_iter=nb_iter_final,
    # clustering
    n_clusters=n_clusters,
    cluster_evol=cluster_evol,
    # reporting
    verbose=2,
    detailed_res=0,
)

# Launch the search.
search._fit()

Ejemplo n.º 6
0
# Search settings consumed by the SmartSearch call in this script.
acquisition_function = 'UCB'  # passed as acquisition_function
n_random_init = 15            # passed as n_init
n_iter = 100                  # passed as n_iter
nb_iter_final = 0             # passed as n_final_iter


def scoring_function(p_dict):
    """Return ``branin`` evaluated at the candidate's (x - 5, y).

    ``p_dict`` must provide the keys 'x' and 'y'.
    """
    x_in = p_dict['x']
    y_in = p_dict['y']
    # x is offset by 5 before the call; y is used unchanged.
    x_offset = x_in - 5.
    return branin(x_offset, y_in)


# Build the search over `parameters`, scored by scoring_function.
search = SmartSearch(
    parameters,
    estimator=scoring_function,
    # model configuration
    model=sampling_model,
    corr_kernel=corr_kernel,
    model_noise=model_noise,
    GCP_mapWithNoise=mapWithNoise,
    # exploration strategy
    acquisition_function=acquisition_function,
    n_candidates=n_candidates,
    n_clusters=n_clusters,
    cluster_evol=cluster_evol,
    # iteration budget
    n_init=n_random_init,
    n_iter=n_iter,
    n_final_iter=nb_iter_final,
    # output verbosity
    verbose=2,
    detailed_res=0,
)

# Run it.
search._fit()
Ejemplo n.º 7
0
def gp_vs_random_search(test_name, n_tests, search_lenght, save_data=False):
    """Compare a GP-UCB SmartSearch against a purely random one and plot both.

    For each strategy, ``n_tests`` independent searches of ``search_lenght``
    iterations are run. The best score seen so far is tracked per iteration,
    passed through ``extend_result`` and averaged over the trials for the
    plot.

    Parameters
    ----------
    test_name : str
        Either 'iris' or 'text'. Note that 'iris' actually loads the data
        returned by ``load_digits()`` and tunes a RandomForestClassifier;
        'text' tunes a text pipeline on two 20-newsgroups categories.
    n_tests : int
        Number of trials per strategy.
    search_lenght : int
        Number of iterations of each search (parameter name kept as-is for
        backward compatibility).
    save_data : bool
        When true, the per-trial curves are written to 'gp_ucb_scores.csv'
        and 'rand_scores.csv'.

    Raises
    ------
    ValueError
        If ``test_name`` is not one of the supported names. Previously this
        only printed a message and then failed later on an unbound local.
    """
    if test_name not in ('iris', 'text'):
        raise ValueError(
            "Dataset not available for test: %r (expected 'iris' or 'text')"
            % (test_name,))

    n_iter_search = search_lenght
    X, y, pipeline, parameters = _gp_vs_random_setup(test_name)

    # GP UCB search: 20 random initial points, then UCB-driven iterations.
    print('GP_ucb search')
    all_gp_ucb_results = _gp_vs_random_trials(
        n_tests, n_iter_search, parameters, pipeline, X, y,
        acquisition_function='UCB', n_init=20)
    print(all_gp_ucb_results.shape)
    if save_data:
        np.savetxt('gp_ucb_scores.csv', all_gp_ucb_results, delimiter=',')

    # A GP-EI comparison can be produced the same way by running the trials
    # with acquisition_function='EI' and plotting the resulting curve.

    # Randomized search: n_init equals the iteration budget.
    print('Random search')
    all_random_results = _gp_vs_random_trials(
        n_tests, n_iter_search, parameters, pipeline, X, y,
        n_init=n_iter_search)
    if save_data:
        np.savetxt('rand_scores.csv', all_random_results, delimiter=',')

    plt.figure()
    plt.plot(range(n_iter_search),
             np.mean(all_gp_ucb_results, axis=0),
             'b',
             label='GP-UCB')
    plt.plot(range(n_iter_search),
             np.mean(all_random_results, axis=0),
             'g',
             label='Random')
    plt.legend(loc=4)
    plt.title('Test GP vs Random on ' + test_name + ' dataset - Average on ' +
              str(n_tests) + ' trials')
    plt.xlabel('Iterations')
    plt.ylabel('Max CV performance')
    plt.show()


def _gp_vs_random_setup(test_name):
    """Return (X, y, estimator, search space) for 'iris' or 'text'."""
    if test_name == 'iris':
        digits = load_digits()
        X, y = digits.data, digits.target
        pipeline = RandomForestClassifier()

        # specify parameters and distributions to sample from
        # NOTE(review): max_depth has identical bounds, so it is presumably
        # fixed at 3; recent scikit-learn releases reject
        # min_samples_split=1 -- confirm both against the installed version.
        parameters = {
            "max_depth": ['int', [3, 3]],
            "max_features": ['int', [1, 11]],
            "min_samples_split": ['int', [1, 11]],
            "min_samples_leaf": ['int', [1, 11]],
            "bootstrap": ['cat', [True, False]],
            "criterion": ['cat', ["gini", "entropy"]],
        }
        return X, y, pipeline, parameters

    # test_name == 'text'
    # Display progress logs on stdout
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')

    # Categories loaded from the training set; use None instead to run the
    # analysis on all the categories.
    categories = [
        'alt.atheism',
        'talk.religion.misc',
    ]
    print("Loading 20 newsgroups dataset for categories:")
    print(categories)

    data = fetch_20newsgroups(subset='train', categories=categories)
    print("%d documents" % len(data.filenames))
    print("%d categories" % len(data.target_names))

    # Text feature extractor combined with a simple classifier.
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier()),
    ])

    # Adding more parameters gives better exploring power but increases
    # processing time in a combinatorial way. Other candidates:
    # 'vect__max_features', 'tfidf__use_idf', 'tfidf__norm', 'clf__n_iter'.
    parameters = {
        'vect__max_df': ['float', [0.5, 1.]],
        # unigrams or bigrams
        'vect__ngram_range': ['cat', [(1, 1), (1, 2)]],
        'clf__alpha': ['float', [0.000001, 0.00001]],
        'clf__penalty': ['cat', ['l2', 'elasticnet']],
    }
    return data.data, data.target, pipeline, parameters


def _gp_vs_random_best_so_far(scores):
    """Return the running maximum of ``scores`` as a list."""
    best = [scores[0]]
    for j in range(1, len(scores)):
        best.append(max(best[j - 1], scores[j]))
    return best


def _gp_vs_random_trials(n_tests, n_iter_search, parameters, pipeline, X, y,
                         **search_kwargs):
    """Run ``n_tests`` searches and stack their extended best-so-far curves."""
    results = []
    for i in range(n_tests):
        search = SmartSearch(parameters,
                             estimator=pipeline,
                             X=X,
                             y=y,
                             n_iter=n_iter_search,
                             verbose=False,
                             **search_kwargs)
        _, scores = search._fit()

        max_scores = _gp_vs_random_best_so_far(scores)
        # Single-argument print behaves identically under Python 2 and 3.
        print('Test %s - %s parameters tested' % (i, len(scores)))
        results.append(extend_result(n_iter_search, max_scores))
    return np.asarray(results)