def _sanity_check(self, all_tasks):
    """Dry-run every classifier configuration on a tiny dummy dataset.

    Each entry of *all_tasks* is a single-pair dict ``{clf_name: clf_params}``.
    The classifier class is looked up in ``globals()`` (i.e. it must have been
    imported at module level), instantiated, and fitted on 10 dummy samples so
    that bad names/parameters fail fast before the real benchmark runs.

    :param all_tasks: iterable of single-pair dicts mapping classifier name
        to its constructor keyword arguments
    :return: list of ``PredictJob`` for every configuration that could be
        constructed and fitted without raising
    """
    total_clf = 0
    failed_clf = 0
    # the t10k split is only throwaway fit data here, never used for scoring
    Xt, Yt = mnist_reader.load_mnist(path=DATA_DIR, kind='t10k')
    # cast to float before scaling to avoid integer-dtype issues
    Xt = preprocessing.StandardScaler().fit_transform(Xt.astype(float))
    Xs, Ys = shuffle(Xt, Yt)
    num_dummy = 10
    Xs = Xs[:num_dummy]
    # one sample per class so fit() sees all 10 labels (overwrites shuffled Ys)
    Ys = list(range(10))
    valid_jobs = []
    for v in all_tasks:
        clf_name = list(v.keys())[0]
        clf_par = list(v.values())[0]
        total_clf += 1
        try:
            globals()[clf_name](**clf_par).fit(Xs, Ys)
            valid_jobs.append(
                PredictJob(clf_name, clf_par, self.num_repeat))
        except Exception as e:
            failed_clf += 1
            LOGGER.error(
                'Can not create classifier "%s" with parameter "%s". Reason: %s'
                % (clf_name, clf_par, e))
    LOGGER.info('%d classifiers to test, %d fail to create!' %
                (total_clf, failed_clf))
    return valid_jobs
def run(self) -> None:
    """Worker loop: consume ``PredictJob`` items from the pending queue forever.

    For each job, run ``get_accuracy`` up to ``num_repeat`` times, stop early
    when the model appears invariant to data shuffling, then log a JSON summary
    of the scores and timing. Runs until the process is killed; never returns.
    """
    while True:
        cur_job = self.pending_q.get()  # type: PredictJob
        LOGGER.info(
            'job received! repeat: %d classifier: "%s" parameter: "%s"' %
            (cur_job.num_repeat, cur_job.clf_name, cur_job.clf_par))
        # classifier classes are resolved via module globals; a missing
        # import means the job can never run
        if cur_job.clf_name not in globals():
            LOGGER.error(
                'Can not find "%s" in scikit-learn, missing import?' %
                cur_job.clf_name)
            continue
        try:
            acc = []
            cur_job.start_time = now_int()
            for j in range(cur_job.num_repeat):
                cur_score = self.get_accuracy(cur_job.clf_name,
                                              cur_job.clf_par, j)
                acc.append(cur_score)
                # if the first two repeats agree within 1e-3 the model is
                # insensitive to training-data shuffling: skip the rest
                if len(acc) == 2 and abs(acc[0] - cur_score) < 1e-3:
                    LOGGER.info(
                        '%s is invariant to training data shuffling, '
                        'will stop repeating!' % cur_job.clf_name)
                    break
            cur_job.done_time = now_int()
            # build the score array once instead of four times
            acc_arr = np.array(acc)
            mean_acc = acc_arr.mean()
            std2_acc = acc_arr.std() * 2
            test_info = {
                'name': cur_job.clf_name,
                'parameter': cur_job.clf_par,
                'score': acc,
                'start_time': cur_job.start_time,
                'done_time': cur_job.done_time,
                'num_repeat': len(acc),
                'mean_accuracy': mean_acc,
                'std_accuracy': std2_acc,
                'time_per_repeat':
                    int((cur_job.done_time - cur_job.start_time) / len(acc))
            }
            JSON_LOGGER.info(json.dumps(test_info, sort_keys=True))
            LOGGER.info(
                'done! acc: %0.3f (+/- %0.3f) repeated: %d classifier: "%s" '
                'parameter: "%s" ' %
                (mean_acc, std2_acc, len(acc),
                 cur_job.clf_name, cur_job.clf_par))
        except Exception as e:
            LOGGER.error('%s with %s failed! reason: %s' %
                         (cur_job.clf_name, cur_job.clf_par, e))
def run(self) -> None:
    """Worker loop: consume ``PredictJob`` items from the pending queue forever.

    For each job, run ``get_accuracy`` up to ``num_repeat`` times, stop early
    when the model appears invariant to data shuffling, then log a JSON summary
    of the scores and timing. Runs until the process is killed; never returns.
    """
    while True:
        cur_job = self.pending_q.get()  # type: PredictJob
        LOGGER.info(
            'job received! repeat: %d classifier: "%s" parameter: "%s"' %
            (cur_job.num_repeat, cur_job.clf_name, cur_job.clf_par))
        # classifier classes are resolved via module globals; a missing
        # import means the job can never run
        if cur_job.clf_name not in globals():
            LOGGER.error(
                'Can not find "%s" in scikit-learn, missing import?' %
                cur_job.clf_name)
            continue
        try:
            acc = []
            cur_job.start_time = now_int()
            for j in range(cur_job.num_repeat):
                cur_score = self.get_accuracy(cur_job.clf_name,
                                              cur_job.clf_par, j)
                acc.append(cur_score)
                # if the first two repeats agree within 1e-3 the model is
                # insensitive to training-data shuffling: skip the rest
                if len(acc) == 2 and abs(acc[0] - cur_score) < 1e-3:
                    LOGGER.info(
                        '%s is invariant to training data shuffling, '
                        'will stop repeating!' % cur_job.clf_name)
                    break
            cur_job.done_time = now_int()
            # build the score array once instead of four times
            acc_arr = np.array(acc)
            mean_acc = acc_arr.mean()
            std2_acc = acc_arr.std() * 2
            test_info = {
                'name': cur_job.clf_name,
                'parameter': cur_job.clf_par,
                'score': acc,
                'start_time': cur_job.start_time,
                'done_time': cur_job.done_time,
                'num_repeat': len(acc),
                'mean_accuracy': mean_acc,
                'std_accuracy': std2_acc,
                'time_per_repeat':
                    int((cur_job.done_time - cur_job.start_time) / len(acc))
            }
            JSON_LOGGER.info(json.dumps(test_info, sort_keys=True))
            LOGGER.info(
                'done! acc: %0.3f (+/- %0.3f) repeated: %d classifier: "%s" '
                'parameter: "%s" ' %
                (mean_acc, std2_acc, len(acc),
                 cur_job.clf_name, cur_job.clf_par))
        except Exception as e:
            LOGGER.error('%s with %s failed! reason: %s' %
                         (cur_job.clf_name, cur_job.clf_par, e))
def upload_result_s3():
    """Sync RESULT_PATH to S3 by running SYNC_SCRIPT_PATH, logging to LOG_PATH.

    The sync script's stdout/stderr are appended (line-buffered) to the log
    file. The call blocks for at most SYNC_TIMEOUT seconds; on timeout the
    child process is killed and reaped.
    """
    LOGGER.info("Syncing data to S3...")
    # buffering=1 -> line-buffered, so script output appears promptly in the log
    with open(LOG_PATH, 'a', 1) as logfile:
        # NOTE(review): shell=True with interpolated paths is fragile/injection-
        # prone if the paths ever contain spaces or shell metacharacters;
        # prefer subprocess.run(['bash', SYNC_SCRIPT_PATH, RESULT_PATH], ...).
        proc = subprocess.Popen(
            "bash %s %s" % (SYNC_SCRIPT_PATH, RESULT_PATH),
            shell=True,
            stdin=subprocess.PIPE,
            stdout=logfile,
            stderr=logfile,
            cwd=ROOT_DIR,
            env=os.environ)
        # block until the sync finishes (or the timeout fires)
        try:
            # outs/errs will be None here because stdout/stderr are redirected
            # to the log file rather than PIPE; kept for safety
            outs, errs = proc.communicate(timeout=SYNC_TIMEOUT)
            if outs:
                LOGGER.info(outs)
            if errs:
                LOGGER.error(errs)
        except subprocess.TimeoutExpired:
            proc.kill()
            # reap the killed child to avoid a zombie process and release
            # its pipes (per the subprocess docs: kill, then communicate)
            proc.communicate()
def _sanity_check(self, all_tasks):
    """Dry-run every classifier configuration on a tiny dummy dataset.

    Each entry of *all_tasks* is a single-pair dict ``{clf_name: clf_params}``.
    The classifier class is looked up in ``globals()`` (i.e. it must have been
    imported at module level), instantiated, and fitted on 10 dummy samples so
    that bad names/parameters fail fast before the real benchmark runs.

    :param all_tasks: iterable of single-pair dicts mapping classifier name
        to its constructor keyword arguments
    :return: list of ``PredictJob`` for every configuration that could be
        constructed and fitted without raising
    """
    total_clf = 0
    failed_clf = 0
    # the t10k split is only throwaway fit data here, never used for scoring
    Xt, Yt = mnist_reader.load_mnist(path=DATA_DIR, kind='t10k')
    # cast to float before scaling, consistent with the other sanity check,
    # so StandardScaler never operates on an integer array
    Xt = preprocessing.StandardScaler().fit_transform(Xt.astype(float))
    Xs, Ys = shuffle(Xt, Yt)
    num_dummy = 10
    Xs = Xs[:num_dummy]
    # one sample per class so fit() sees all 10 labels (overwrites shuffled Ys)
    Ys = list(range(10))
    valid_jobs = []
    for v in all_tasks:
        clf_name = list(v.keys())[0]
        clf_par = list(v.values())[0]
        total_clf += 1
        try:
            globals()[clf_name](**clf_par).fit(Xs, Ys)
            valid_jobs.append(
                PredictJob(clf_name, clf_par, self.num_repeat))
        except Exception as e:
            failed_clf += 1
            LOGGER.error(
                'Can not create classifier "%s" with parameter "%s". Reason: %s'
                % (clf_name, clf_par, e))
    LOGGER.info('%d classifiers to test, %d fail to create!' %
                (total_clf, failed_clf))
    return valid_jobs
def _sanity_check(self, all_tasks):
    """Dry-run every (vectorizer, topic model, classifier) pipeline config.

    Each entry of *all_tasks* is a 3-item sequence of single-pair dicts:
    ``[{processor_name: params}, {clf_name: params}, {topic_name: params}]``.
    The named classes are looked up in ``globals()``, assembled into a
    scikit-learn pipeline, and fitted on 10 dummy newsgroup documents so that
    bad names/parameters fail fast before the real benchmark runs.

    :param all_tasks: iterable of 3-item task specs as described above
    :return: list of ``PredictJob`` for every configuration that could be
        constructed and fitted without raising
    """
    total_clf = 0
    failed_clf = 0
    newsgroups = fetch_20newsgroups(
        subset='train',
        shuffle=True,
        random_state=2019,
        remove=('headers', 'footers', 'quotes'))
    Xt, Yt = newsgroups.data, newsgroups.target
    Xs, Ys = shuffle(Xt, Yt)
    num_dummy = 10
    Xs = Xs[:num_dummy]
    Ys = Ys[:num_dummy]

    def first_item(d):
        # each spec dict holds exactly one {name: params} pair
        return next(iter(d.items()))

    valid_jobs = []
    for v in all_tasks:
        processor_name, processor_par = first_item(v[0])
        clf_name, clf_par = first_item(v[1])
        topic_name, topic_par = first_item(v[2])
        total_clf += 1
        try:
            # DenseTransformer bridges the sparse vectorizer output to the
            # topic model, which presumably needs a dense array
            make_pipeline(globals()[processor_name](**processor_par),
                          DenseTransformer(),
                          globals()[topic_name](**topic_par),
                          globals()[clf_name](**clf_par)).fit(Xs, Ys)
            valid_jobs.append(
                PredictJob(processor_name, processor_par, clf_name, clf_par,
                           topic_name, topic_par, self.num_repeat))
        except Exception as e:
            failed_clf += 1
            LOGGER.error(
                'Can not create classifier "%s" with parameter "%s". Reason: %s'
                % (clf_name, clf_par, e))
    LOGGER.info('%d classifiers to test, %d fail to create!' %
                (total_clf, failed_clf))
    return valid_jobs