forked from jamesrobertlloyd/automl-phase-2
/
run.py
executable file
·424 lines (375 loc) · 18.8 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
#!/usr/bin/python
from __future__ import division
#############################
# ChaLearn AutoML challenge #
#############################
# Usage: python run.py input_dir output_dir
# This sample code can be used either
# - to submit RESULTS deposited in the res/ subdirectory or
# - as a template for CODE submission.
#
# The input directory input_dir contains 5 subdirectories named by dataset,
# including:
# dataname/dataname_feat.type -- the feature type "Numerical", "Binary", or "Categorical" (Note: if this file is absent, get the feature type from the dataname.info file)
# dataname/dataname_public.info -- parameters of the data and task, including metric and time_budget
# dataname/dataname_test.data -- training, validation and test data (solutions/target values are given for training data only)
# dataname/dataname_train.data
# dataname/dataname_train.solution
# dataname/dataname_valid.data
#
# The output directory will receive the predicted values (no subdirectories):
# dataname_test_000.predict -- Provide predictions at regular intervals to make sure you get some results even if the program crashes
# dataname_test_001.predict
# dataname_test_002.predict
# ...
# dataname_valid_000.predict
# dataname_valid_001.predict
# dataname_valid_002.predict
# ...
#
# Result submission:
# =================
# Search for @RESULT to locate that part of the code.
# ** Always keep this code. **
# If the subdirectory res/ contains result files (predicted values)
# the code just copies them to the output and does not train/test models.
# If no results are found, a model is trained and tested (see code submission).
#
# Code submission:
# ===============
# Search for @CODE to locate that part of the code.
# ** You may keep or modify this template or substitute your own code. **
# The program saves predictions regularly. This way the program produces
# at least some results if it dies (or is terminated) prematurely.
# This also allows us to plot learning curves. The last result is used by the
# scoring program.
# We implemented 2 classes:
# 1) DATA LOADING:
# ------------
# Use/modify
# D = DataManager(basename, input_dir, ...)
# to load and preprocess data.
# Missing values --
# Our default method for replacing missing values is trivial: they are replaced by 0.
# We also add extra indicator features where missing values occurred. This doubles the number of features.
# Categorical variables --
# The location of potential Categorical variable is indicated in D.feat_type.
# NOTHING special is done about them in this sample code.
# Feature selection --
# We only implemented an ad hoc feature selection filter efficient for the
# dorothea dataset to show that performance improves significantly
# with that filter. It takes effect only for binary classification problems with sparse
# matrices as input and unbalanced classes.
# 2) LEARNING MACHINE:
# ----------------
# Use/modify
# M = MyAutoML(D.info, ...)
# to create a model.
# Number of base estimators --
# Our models are ensembles. Adding more estimators may improve their accuracy.
# Use M.model.n_estimators = num
# Training --
# M.fit(D.data['X_train'], D.data['Y_train'])
# Fit the parameters and hyper-parameters (all inclusive!)
# What we implemented hard-codes hyper-parameters, you probably want to
# optimize them. Also, we made a somewhat arbitrary choice of models
# for the various types of data, just to give some baseline results.
# You probably want to do better model selection and/or add your own models.
# Testing --
# Y_valid = M.predict(D.data['X_valid'])
# Y_test = M.predict(D.data['X_test'])
#
# ALL INFORMATION, SOFTWARE, DOCUMENTATION, AND DATA ARE PROVIDED "AS-IS".
# ISABELLE GUYON, CHALEARN, AND/OR OTHER ORGANIZERS OR CODE AUTHORS DISCLAIM
# ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ANY PARTICULAR PURPOSE, AND THE
# WARRANTY OF NON-INFRINGEMENT OF ANY THIRD PARTY'S INTELLECTUAL PROPERTY RIGHTS.
# IN NO EVENT SHALL ISABELLE GUYON AND/OR OTHER ORGANIZERS BE LIABLE FOR ANY SPECIAL,
# INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF SOFTWARE, DOCUMENTS, MATERIALS,
# PUBLICATIONS, OR INFORMATION MADE AVAILABLE FOR THE CHALLENGE.
#
# Main contributors: Isabelle Guyon and Arthur Pesah, March-October 2014
# Originally inspired by code from: Ben Hamner, Kaggle, March 2013
# Modified by Ivan Judson and Christophe Poulain, Microsoft, December 2013
import glob
import logging
import subprocess
import sys
import signal
import os
import datetime
# HACK: work around the "lack of output" bug by sending stdout and stderr
# directly to the relevant files.
#
# The open descriptors of this process are listed with `ls -l /proc/<pid>/fd`.
# For every listed entry whose last field (the link target) mentions "stderr"
# or "stdout", the third-from-last field is taken as the descriptor number.
# When such a descriptor is found and is not already 1 (resp. 2), it is
# duplicated onto 1 (resp. 2).
pid = os.getpid()
fds = subprocess.check_output("ls -l /proc/%d/fd" % pid, shell=True)
lines = fds.splitlines()
stderr_fd = stdout_fd = None
for line in lines:
    fields = line.split()
    target = fields[-1]
    # fields[-3] is only read for matching lines: the "total" summary line
    # has fewer than three fields.
    if 'stderr' in target:
        stderr_fd = int(fields[-3])
    if 'stdout' in target:
        stdout_fd = int(fields[-3])

# Flush each stream before its descriptor is replaced; stdout first, then stderr.
for stream, found_fd, wanted_fd in ((sys.stdout, stdout_fd, 1),
                                    (sys.stderr, stderr_fd, 2)):
    stream.flush()
    if found_fd and found_fd != wanted_fd:
        os.dup2(found_fd, wanted_fd)

print("Check open file descriptors")
print(fds)
# end hack
# =========================== BEGIN USER OPTIONS ==============================

# Verbose mode
##############
# Keeping verbose = True is recommended: progress messages are written to
# stdout and stderr, which helps when debugging.
verbose = True

# ZIP your code
###############
# With zipme = True a code submission archive, ready to submit, is created.
# This is meant to be used on your LOCAL server.
# Set zipme = False to disable zipping of the code submission.
zipme = True
# Timestamp shared by the archive name and by the backup of old results.
the_date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
submission_filename = 'phase2_submission_%s' % the_date

# I/O defaults
##############
# When run.py is started without arguments, data is read from
# default_input_dir and results are written to default_output_dir.
# Both are expressed relative to root_dir (the directory of this file);
# change them to suit your local layout.
root_dir = os.path.dirname(__file__)
# Other inputs used during development:
#   <root_dir>/../data/dsss_bin_class_fold_01
#   <root_dir>/../data/phase_1
default_input_dir = os.path.join(root_dir, os.pardir, 'data', 'phase_0')
# Other outputs used during development lived under
#   <root_dir>/../predictions/dss_bin_class_fold_01/{cv_rf_msl, cv_rf_gbm}
#   <root_dir>/../predictions/dsss_bin_class_fold_01/{automl, automl_rf,
#       ft_cv_rf, ft_cv_rf_gbm, ft_cv_rf_gbm_v2}
default_output_dir = os.path.join(root_dir, 'output')

# =========================== END USER OPTIONS ================================

# Version of the sample code
# 1.1: time is measured by time.time(), not time.clock(): we keep track of wall time
# 1.2: James Robert Lloyd taking control of code
# 1.3: Prediction code run in separate process to keep an eye on memory and time
version = 1.3
# General purpose functions
from sys import argv, path
import time
# Wall-clock time at which the script started.
overall_start = time.time()

# Our directories
# Note: On codalab, there is an extra sub-directory called "program".
# Its presence under the current directory is what marks a Codalab run.
run_dir = os.path.abspath(".")
codalab_run_dir = os.path.join(run_dir, "program")
running_on_codalab = os.path.isdir(codalab_run_dir)
if running_on_codalab:
    run_dir = codalab_run_dir
    print("Running on Codalab!")
lib_dir = os.path.join(run_dir, "lib")
res_dir = os.path.join(run_dir, "res")

# Our libraries: make run_dir and lib_dir importable (in that order).
path.extend([run_dir, lib_dir])
from automl_lib import data_io # general purpose input/output functions
from automl_lib.data_io import vprint # print only in verbose mode
from multiprocessing import Process
import psutil
import constants
import managers
import util
import traceback
import experiment
# Define function to be called to start process
def run_automl(input_dir, output_dir, data_name, time_budget, running_on_codalab):
    """Process one dataset; meant to be the target of a separate process.

    Echoes its arguments, builds a ``managers.FixedLearnersStackingManager``
    configured with the default experiment parameters and the memory limits
    from ``constants``, then calls its ``communicate()`` method.

    Any ``Exception`` raised while doing so is reported with a full traceback
    and swallowed, so the function returns normally. ``SystemExit`` and
    ``KeyboardInterrupt`` are deliberately left to propagate.

    Args:
        input_dir: directory containing the dataset sub-directories.
        output_dir: directory handed to the manager as ``output_dir``.
        data_name: dataset name, handed to the manager as ``basename``.
        time_budget: time allowance forwarded to the manager (the caller
            derives it from the dataset's budget in seconds).
        running_on_codalab: when true the manager is created with
            ``plot=False``.

    Returns:
        None.
    """
    print('input_dir = "%s"' % input_dir)
    print('output_dir = "%s"' % output_dir)
    print('data_name = "%s"' % data_name)
    print('time_budget = %s' % time_budget)
    try:
        # Start from an empty dict and let the experiment module fill in defaults.
        exp = experiment.exp_param_defaults(dict())
        mgr = managers.FixedLearnersStackingManager(
            input_dir=input_dir,
            output_dir=output_dir,
            basename=data_name,
            time_budget=time_budget,
            compute_quantum=None,
            plot=not running_on_codalab,
            n_folds=5,
            overhead_memory=constants.OVERHEAD,
            cgroup_soft_limit=constants.CGROUP_SOFT_LIMIT,
            cgroup_hard_limit=constants.CGROUP_HARD_LIMIT,
            exp=exp,
        )
        mgr.communicate()
    except Exception:
        # Report the failure but let the process end normally.
        traceback.print_exc()
class ProcessFilter(logging.Filter):
    """Logging filter that only lets through records from the main or manager process."""

    def filter(self, record):
        """Returns True for messages that should be printed"""
        return record.processName in ("Manager", "MainProcess")


def read_time_budget(info_path):
    """Return the ``time_budget`` declared in a ``*_public.info`` file.

    Every line starting with ``time_budget`` is parsed by taking the text
    after the last ``=`` as an integer; if several such lines exist the last
    one wins.

    Args:
        info_path: path of the info file to read.

    Returns:
        The budget as an ``int``, or ``None`` when the file contains no
        ``time_budget`` line.

    Raises:
        ValueError: if the value after ``=`` is not an integer.
    """
    budget = None
    with open(info_path, 'r') as info_file:
        for line in info_file:
            if line.startswith('time_budget'):
                budget = int(line.split('=')[-1])
    return budget


def _terminate_manager(proc):
    """Stop the manager process: SIGTERM first, then SIGKILL its family after 10 s."""
    try:
        psutil.Process(pid=proc.pid).send_signal(sig=signal.SIGTERM)  # tidy up then die please
    except psutil.NoSuchProcess:
        pass  # the process exited between the liveness check and the signal
    proc.join(timeout=10)  # give it a while to respond to signal
    if proc.is_alive():
        util.murder_family(proc.pid, killall=True, sig=signal.SIGKILL)
    proc.join()
    print('Process %d terminated' % proc.pid)


# =========================== BEGIN PROGRAM ================================

if __name__ == "__main__":
    # RotatingFileHandler lives in the logging.handlers submodule, which a
    # plain "import logging" does not load.
    import logging.handlers

    # Show library version and directory structure
    if running_on_codalab:
        data_io.show_version()
        data_io.show_dir(run_dir)

    # ### Check whether everything went well (no time exceeded)
    execution_success = True

    # ### INPUT/OUTPUT: Get input and output directory names
    # Each directory falls back to its default when the matching command-line
    # argument is missing.
    input_dir = argv[1] if len(argv) > 1 else default_input_dir
    output_dir = os.path.abspath(argv[2]) if len(argv) > 2 else default_output_dir
    # Move old results and create a new output directory
    data_io.mvdir(output_dir, output_dir + '_' + the_date)
    data_io.mkdir(output_dir)

    # ### INVENTORY DATA (and sort dataset names alphabetically)
    datanames = data_io.inventory_data(input_dir)

    # ==================== @RESULT SUBMISSION (KEEP THIS) =====================
    # Always keep this code to enable result submission of pre-calculated results
    # deposited in the res/ subdirectory.
    if len(datanames) > 0:
        vprint(verbose, "************************************************************************")
        vprint(verbose, "****** Attempting to copy files (from res/) for RESULT submission ******")
        vprint(verbose, "************************************************************************")
        OK = data_io.copy_results(datanames, res_dir, output_dir, verbose)  # DO NOT REMOVE!
        if OK:
            vprint(verbose, "[+] Success")
            datanames = []  # Do not proceed with learning and testing
        else:
            vprint(verbose, "======== Some missing results on current datasets!")
            vprint(verbose, "======== Proceeding to train/test:\n")
    # =================== End @RESULT SUBMISSION (KEEP THIS) ==================

    if zipme and not running_on_codalab:
        vprint(verbose, "========= Zipping this directory to prepare for submit ==============")
        ignoredirs = [os.path.abspath(x) for x in glob.glob('./output_*')]
        data_io.zipdir(submission_filename + '.zip', ".",
                       ignoredirs=ignoredirs + [os.path.abspath('./compute_server')])

    # ================ @CODE SUBMISSION (SUBSTITUTE YOUR CODE) =================

    # DEBUG
    print("Check for python processes from other people")
    print(subprocess.check_output("ps -eo pid,ppid,pgid,stime,state,user,%mem,command | grep python",
                                  shell=True))
    available_mem = psutil.virtual_memory().available  # measured in bytes
    print('Available memory = %fMB' % (available_mem / (1024 * 1024)))
    print("Find server's id")
    print(subprocess.check_output("ip addr show", shell=True))

    # Set up logging
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    form = logging.Formatter("[%(levelname)s/%(processName)s] %(asctime)s %(message)s")

    # Log everything to a rotating file when constants.LOGFILE is set;
    # otherwise log warnings from the manager/main process to stderr.
    if constants.LOGFILE:
        util.move_make_file(constants.LOGFILE)
        fh = logging.handlers.RotatingFileHandler(constants.LOGFILE, maxBytes=512 * 1024 * 1024)
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(form)
        root_logger.addHandler(fh)
    else:
        sh = logging.StreamHandler(stream=sys.stderr)
        sh.setLevel(logging.WARN)  # set level here
        sh.addFilter(ProcessFilter())  # filter to show only logs from manager
        sh.setFormatter(form)
        root_logger.addHandler(sh)

    # Create control group to limit resources
    try:
        subprocess.check_call(['which', 'cgexec'])
    except subprocess.CalledProcessError:
        # Install cgroup-bin
        # NOTE(review): the password is interpolated into a shell command line,
        # so it is visible to other users via the process list and breaks on
        # quotes. The loop below also retries forever on failure. Both are left
        # as they were - confirm whether that is intended.
        password = os.environ.get('PW', '')  # Required to create and manage control groups
        installcg = "echo '{}' | sudo -S apt-get -y install cgroup-bin".format(password)
        while True:
            retcode = subprocess.call(installcg, shell=True)
            if retcode != 0:
                root_logger.error("Cgroup-bin installation failed")
                time.sleep(1)
            else:
                root_logger.info("Installed cgroup-bin")
                break

    # Abort a dataset once available memory drops below this many bytes.
    min_available_mem = constants.OVERHEAD / 2

    for basename in datanames:
        print('Processing dataset : ' + basename)
        # Keep track of time
        start_time = time.time()

        # Write a file to record start time
        open(os.path.join(output_dir, basename + '.firstpost'), 'wb').close()
        print('\nStarting\n')

        # Read time budget
        time_budget = read_time_budget(os.path.join(input_dir, basename, basename + '_public.info'))
        if time_budget is None:
            # Without this guard the name would be unbound for the first
            # dataset and silently reuse the previous dataset's budget later.
            print('No time_budget found for dataset %s - skipping' % basename)
            root_logger.error('No time_budget found for dataset %s - skipping', basename)
            execution_success = False
            continue
        # Debug code
        # time_budget = 120
        print('Time budget = %ds' % time_budget)
        root_logger.info('Time budget = %ds, dataset %s', time_budget, basename)

        # Start separate process to analyse file.
        # The child gets the budget minus the time already spent and a 20 s margin.
        p = Process(target=run_automl,
                    name='Manager',
                    args=(input_dir,
                          output_dir,
                          basename,
                          time_budget - (time.time() - start_time) - 20,
                          running_on_codalab))
        p.start()

        # Monitor the process, checking to see if it is complete or if total memory usage too high
        while True:
            time.sleep(0.2)
            if not p.is_alive():
                print('Process terminated of its own accord')
                break

            available_mem = psutil.virtual_memory().available  # measured in bytes
            print('Available memory = %fMB' % (available_mem / (1024 * 1024)))
            if available_mem < min_available_mem:
                # Report the threshold that was actually tested.
                print('Less than %.1f GB memory available - aborting process' %
                      (min_available_mem / float(2 ** 30)))
                _terminate_manager(p)
                break

            # Stop 15 s before the budget runs out.
            if (time.time() - start_time) > (time_budget - 15):
                print('Time limit approaching - terminating')
                _terminate_manager(p)
                break

            remaining = time_budget - (time.time() - start_time)
            print('Remaining time budget = %s' % remaining)
            root_logger.info('Remaining time = %ds, dataset %s', remaining, basename)

        print('\nFinished %s\n' % basename)
        sys.stderr.write('Finished %s\n\n' % basename)

        # DEBUG - want to see if there's any extra pythons running
        print("Check for python processes")
        print(subprocess.check_output("ps -eo pid,ppid,pgid,stime,state,user,%mem,command | grep python",
                                      shell=True))

    if running_on_codalab:
        sys.exit(0 if execution_success else 1)