def pre_run():
    global tb, m, n, T, open_lim, loss_lim, g, U, M, dataFrame, values

    params = utils.cli_init_params()
    tb = params['tb']
    m = params['m']
    n = params['n']
    T = params['T']
    open_lim = params['open_lim']
    loss_lim = params['loss_lim']
    g = params['g']
    U = params['U']
    M = params['M']

    # Debug overrides (disabled):
    # tb = '2018-08-20'
    # M = 2
    # m = 3
    # n = 5
    # T = '2019-02-20'
    # open_lim = 1
    # loss_lim = 3
    # g = 0.2
    # U = 1

    dataFrame = utils.csv_open(DATA_CSV_FILE, tb=T, before=True, ftype='data')
    values = utils.csv_open(VALUE_CSV_FILE, tb=T, before=True, ftype='value')
    utils.xprint(os.linesep + 'Parameters initialized successfully!')
def build_lstm4(embeddings, shape, settings):
    model = Sequential()
    model.add(
        Embedding(
            embeddings.shape[0],
            embeddings.shape[1],
            input_length=shape['max_length'],
            trainable=False,
            weights=[embeddings],
            mask_zero=False,
            name='eembed'
        )
    )
    model.add(TimeDistributed(Dense(shape['n_hidden'], use_bias=False, name='td4')))
    model.add(Bidirectional(LSTM(shape['n_hidden'], return_sequences=True,
                                 recurrent_dropout=settings['dropout'],
                                 dropout=settings['dropout'])))
    model.add(Flatten(name='flaaten'))
    model.add(BatchNormalization())
    n_dense = int(math.ceil(math.sqrt(shape['n_hidden'] * shape['n_class'])))
    model.add(Dense(n_dense, activation='relu'))
    # model.add(BatchNormalization())
    # x = Dropout(dropout)(x)
    model.add(Dense(shape['n_class'], activation='sigmoid'))
    xprint('build_lstm4: embeddings=%s shape=%s' % (dim(embeddings), shape))
    return model
def describe(y):
    """Print a table of per-label min, mean and max values of `y`."""
    MEASURES = ['min', 'mean', 'max']
    stats = np.zeros((len(MEASURES), len(LABEL_COLS)), dtype=np.float64)
    xprint('stats=%s' % dim(stats))
    for j, col in enumerate(LABEL_COLS):
        stats[0, j] = y[:, j].min()
        stats[1, j] = y[:, j].mean()
        stats[2, j] = y[:, j].max()

    def draw(name, vals, sep='|'):
        vals = ['%12s' % v for v in ([name] + vals)]
        xprint((' %s ' % sep).join(vals))

    def draw_bar():
        bar = '-' * 12
        draw(bar, [bar] * len(LABEL_COLS), sep='+')

    draw_bar()
    draw('', LABEL_COLS)
    draw_bar()
    for i, measure in enumerate(MEASURES):
        draw(measure, ['%10.4f' % z for z in stats[i, :]])
    draw_bar()
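# A hypothetical usage sketch for describe(): any (n_samples, n_labels) float
# array whose columns line up with LABEL_COLS works, e.g. predicted per-label
# probabilities from a classifier (assumes the module-level LABEL_COLS and
# numpy import that describe() itself uses).
def _demo_describe():
    y_pred = np.random.uniform(size=(100, len(LABEL_COLS)))  # fake predictions
    describe(y_pred)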
def run():
    global resDataFrame
    stk_pairs = strategy.filter_pairs(dataFrame, sequence=FILTER_SEQ, number=n)
    resDataFrame = trade.trade(tb, m, open_lim, loss_lim, g, U,
                               dataFrame, values, stk_pairs, M)
    utils.xprint(os.linesep + 'Trade processed successfully!')
def build_lstm9(embeddings, shape, settings):
    """2-layer bidirectional LSTM"""
    model = Sequential()
    model.add(
        Embedding(
            embeddings.shape[0],
            embeddings.shape[1],
            input_length=shape['max_length'],
            trainable=False,
            weights=[embeddings],
            mask_zero=False
        )
    )
    model.add(TimeDistributed(Dense(shape['n_hidden'], use_bias=False), name='td9a'))
    model.add(Bidirectional(LSTM(shape['n_hidden'], return_sequences=True,
                                 recurrent_dropout=settings['dropout'],
                                 dropout=settings['dropout']), name='bidi9a'))
    # model.add(GlobalMaxPool1D())
    # model.add(BatchNormalization())
    # model.add(Dropout(settings['dropout'] / 2.0))
    # model.add(TimeDistributed(Dense(shape['n_hidden'], use_bias=False), name='td9b'))
    model.add(Bidirectional(LSTM(shape['n_hidden'], return_sequences=True,
                                 recurrent_dropout=settings['dropout'],
                                 dropout=settings['dropout']), name='bidi9b'))
    model.add(GlobalMaxPool1D(name='mp9'))
    model.add(BatchNormalization(name='bn9'))
    model.add(Dropout(settings['dropout'] / 2.0, name='drop9b'))
    model.add(Dense(shape['n_class'], activation='sigmoid', name='den9b'))
    xprint('build_lstm9: embeddings=%s shape=%s' % (dim(embeddings), shape))
    return model
def filter_pairs(dataFrame, sequence=['coint', 'AR1', 'distance'], number=0):
    get_clsprc = lambda y: [x[0] for x in dataFrame[pair[y]]]

    stk_pairs = utils.get_stkcd_pairs(dataFrame)
    stk_pairs = [(pair[0], pair[1], {}) for pair in stk_pairs]
    utils.xprint(os.linesep + 'Originally, %d pairs are created...' % len(stk_pairs))
    for pair in stk_pairs:
        pair[2]['bval'] = utils.Liner_Regression(get_clsprc(0), get_clsprc(1))
        # time.sleep(SLEEP_DURATION)

    # Resolve each filter method by substring match against module globals.
    container = globals()
    for item in sequence:
        foo_name = None
        for var in container.keys():
            if item in var:
                foo_name = var
        try:
            foo = container[foo_name]
        except (KeyError, NameError):
            raise RuntimeError('Invalid filter method sequence')
        stk_pairs = foo(stk_pairs, dataFrame)
        utils.xprint('After %s, %d pairs left...' % (foo.__doc__, len(stk_pairs)))
        time.sleep(SLEEP_DURATION)
    return stk_pairs
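# Hypothetical call, assuming the screen functions named by `sequence` are
# defined elsewhere in this module (they are looked up in globals() by
# substring match above):
#
#     stk_pairs = filter_pairs(dataFrame, sequence=['coint', 'AR1', 'distance'],
#                              number=10)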
def fit(self, train, test_size=0.1):
    model_dir = get_model_dir(self.model_name, 0)
    # RocAucEvaluation saves the trainable part of the model
    model_path = os.path.join(model_dir, 'model')
    os.makedirs(model_dir, exist_ok=True)
    xprint('ClfCharLstm.fit: model_dir=%s' % model_dir)

    y_train = train[LABEL_COLS].values
    X_train = df_to_sentences(train)
    X_val, y_val = None, None
    if test_size > 0.0:
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                          test_size=test_size)

    lstm_shape = {'n_hidden': self.n_hidden,
                  'max_length': self.max_length,
                  'n_class': len(LABEL_COLS)}
    lstm_settings = {'dropout': self.dropout,
                     'lr': self.learn_rate}
    lstm, self.best_epochs = do_train(X_train, y_train, X_val, y_val,
                                      lstm_shape, lstm_settings, {},
                                      epochs=self.epochs, batch_size=self.batch_size,
                                      frozen=self.frozen, lstm_type=self.lstm_type,
                                      model_path=model_path)
    with open(os.path.join(model_dir, 'config.json'), 'wt') as f:
        f.write(lstm.to_json())
    print('****: best_epochs=%s - %s' % (self.best_epochs, self.description))
def evaluate(self, get_clf):
    auc = np.zeros((self.n, len(LABEL_COLS)), dtype=np.float64)
    for i in range(self.n):
        ok, auc[i, :] = self._evaluate(get_clf, i)
        if not ok:
            return ok, auc
        show_auc(auc[:i + 1, :])
    xprint('program=%s train=%s' % (sys.argv[0], dim(self.train)))
    return True, auc
def do_train(train_texts, train_labels, dev_texts, dev_labels, lstm_shape,
             lstm_settings, lstm_optimizer, batch_size=100, epochs=5,
             by_sentence=True, frozen=False, lstm_type=1, model_path=None):
    """Train a Keras model on the sentences in `train_texts`.
       All the sentences in a text share the text's labels.
    """
    print('do_train: train_texts=%s dev_texts=%s' % (dim(train_texts), dim(dev_texts)))

    embeddings, char_index, _ = get_char_embeddings()
    n_train_sents = count_sentences(char_index, train_texts, batch_size, 'train')
    X_train, y_train = make_char_sentences(char_index, lstm_shape['max_length'],
                                           batch_size, train_texts, train_labels,
                                           'train', n_train_sents)
    validation_data = None
    if dev_texts is not None:
        n_dev_sents = count_sentences(char_index, dev_texts, batch_size, 'dev')
        X_val, y_val = make_char_sentences(char_index, lstm_shape['max_length'],
                                           batch_size, dev_texts, dev_labels,
                                           'dev', n_dev_sents)
        validation_data = (X_val, y_val)
    sentence_cache.flush()

    model = build_lstm[lstm_type](embeddings, lstm_shape, lstm_settings)
    compile_lstm(model, lstm_settings['lr'])

    callback_list = None
    ra_val = None
    if validation_data is not None:
        ra_val = RocAucEvaluation(validation_data=validation_data, interval=1,
                                  frozen=frozen, model_path=model_path)
        early = EarlyStopping(monitor='val_auc', mode='max', patience=1, verbose=1)
        callback_list = [ra_val, early]
    model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
              validation_data=validation_data, callbacks=callback_list, verbose=1)
    # Guard against the no-validation case, where ra_val was never created.
    if ra_val is not None:
        best_epoch_frozen = ra_val.best_epoch
        ra_val.best_epoch = -1
    else:
        best_epoch_frozen = -1
    best_epoch_unfrozen = -1

    if not frozen:
        xprint("Unfreezing")
        for layer in model.layers:
            layer.trainable = True
        compile_lstm(model, lstm_settings['lr'] / 10)
        if validation_data is not None:
            # Reload the best model so far
            lstm_weights = [embeddings] + ra_val.top_weights
            model.set_weights(lstm_weights)
            # Reset early stopping
            early = EarlyStopping(monitor='val_auc', mode='max', patience=1, verbose=1)
            callback_list = [ra_val, early]
        model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
                  validation_data=validation_data, callbacks=callback_list, verbose=1)
        if ra_val is not None:
            best_epoch_unfrozen = ra_val.best_epoch

    return model, (best_epoch_frozen, best_epoch_unfrozen)
def split_data(df, indexes, frac):
    show_values('df', df)
    n = int(len(df) * frac)
    train = df.loc[indexes[:n]]
    test = df.loc[indexes[n:]]
    show_values('train', train)
    show_values('test', test)
    xprint('split_data: %.2f of %d: train=%d test=%d' % (frac, len(df), len(train), len(test)))
    return train, test
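# A minimal usage sketch (assumes a pandas DataFrame and a pre-shuffled copy of
# its index, as the evaluator code below prepares): an 80/20 split that follows
# the given shuffle order.
def _demo_split_data(df):
    import random
    indexes = list(df.index)
    random.shuffle(indexes)
    train, test = split_data(df, indexes, frac=0.8)
    return train, test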
@classmethod
def load(cls, path, char_index, max_length, frozen):
    xprint('SentimentAnalyser.load: path=%s max_length=%d' % (path, max_length))
    with open(os.path.join(path, 'config.json'), 'rt') as f:
        model = model_from_json(f.read())
    with open(os.path.join(path, 'model'), 'rb') as f:
        lstm_weights = pickle.load(f)
    if frozen:
        embeddings, char_index, index_char = get_char_embeddings()
        lstm_weights = [embeddings] + lstm_weights
    model.set_weights(lstm_weights)
    return cls(char_index, model, max_length=max_length)
def make_submission_reductions(get_clf, submission_name, predict_methods):
    seed_random()
    os.makedirs(SUBMISSION_DIR, exist_ok=True)

    train, test, subm = load_data()
    clf = get_clf()
    clf.fit(train, test_size=0.0)
    reductions = clf.predict_reductions(test, predict_methods)
    ok = True
    for method in predict_methods:
        submission_path = join(SUBMISSION_DIR, '%s.%s.%s.csv' % (
            submission_name, get_n_samples_str(), method))
        if os.path.exists(submission_path):
            xprint('make_submission_reductions: submission_path=%s already exists' % submission_path)
            ok = False
            break
        xprint('make_submission_reductions: method=%s' % method)
        pred = reductions[method]
        describe(pred)

        # Create the submission file.
        submid = pd.DataFrame({'id': subm['id']})
        submission = pd.concat([submid, pd.DataFrame(pred, columns=LABEL_COLS)], axis=1)
        submission.to_csv(submission_path, index=False)
        xprint('make_submission_reductions: Saved in %s' % submission_path)
        xprint('program=%s train=%s test=%s submission=%s' % (sys.argv[0],
               dim(train), dim(test), dim(submission)))

    if clf is not None:
        del clf
    return ok
def process_summary(path, n_rank):
    print('=' * 100)
    print('path=%s' % path)
    completed_tests = load_json(path)
    xprint('run_summary_path=%s' % path)
    best = {}
    try:
        best = display_results(completed_tests, do_max, n_rank)
        # display_results(completed_tests, True)
    except Exception as e:
        print('Bad summary: %s' % e)
    print('&' * 100)
    return best
def show_auc(auc):
    n = auc.shape[0]
    mean_auc = auc.mean(axis=0)   # per-label mean over runs
    auc_mean = auc.mean(axis=1)   # per-run mean over labels

    xprint('-' * 110, 'n=%d' % n)
    for i in range(n):
        xprint('%5d: auc=%.3f %s' % (i, auc[i, :].mean(), label_score(auc[i, :])))
    xprint('%5s: auc=%.3f %s' % ('Mean', mean_auc.mean(), label_score(mean_auc)))
    xprint('-' * 110)
    xprint('auc=%.3f +- %.3f (%.0f%%) range=%.3f (%.0f%%)' % (
        auc_mean.mean(), auc_mean.std(),
        100.0 * auc_mean.std() / auc_mean.mean(),
        auc_mean.max() - auc_mean.min(),
        100.0 * (auc_mean.max() - auc_mean.min()) / auc_mean.mean()))
def fit(self, train, test_size=0.1):
    print('ClfSpacy.fit', '-' * 80)
    (model1_path, config1_path), (model2_path, config2_path), epoch_path = self._get_paths(True)
    if not self.force_fit:
        if self.frozen:
            if (os.path.exists(model1_path) and os.path.exists(config1_path) and
                    SaveAllEpochs.epoch_dict(epoch_path)['epoch1'] == self.epochs):
                xprint('model1_path already exists. re-using')
                return
        else:
            if (os.path.exists(model2_path) and os.path.exists(config2_path) and
                    SaveAllEpochs.epoch_dict(epoch_path)['epoch2'] == self.epochs2):
                xprint('model2_path already exists. re-using')
                return

    do_fit1 = (not (os.path.exists(model1_path) and os.path.exists(config1_path)) or
               SaveAllEpochs.epoch_dict(epoch_path)['epoch1'] < self.epochs)
    do_fit2 = (not self.frozen and
               (not (os.path.exists(model2_path) and os.path.exists(config2_path)) or
                SaveAllEpochs.epoch_dict(epoch_path)['epoch2'] < self.epochs2))

    y_train = train[LABEL_COLS].values
    X_train = df_to_sentences(train)
    X_val, y_val = None, None
    if test_size > 0.0:
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                          test_size=test_size)

    lstm_shape = {'n_hidden': self.n_hidden,
                  'max_length': self.max_length,
                  'n_class': len(LABEL_COLS)}
    lstm_settings = {'dropout': self.dropout,
                     'lr': self.learn_rate}
    lstm, self.best_epochs = do_train(
        X_train, y_train, X_val, y_val, lstm_shape, lstm_settings, {},
        batch_size=self.batch_size, lstm_type=self.lstm_type,
        do_fit1=do_fit1, epochs1=self.epochs,
        model1_path=model1_path, config1_path=config1_path,
        do_fit2=do_fit2, epochs2=self.epochs2,
        model2_path=model2_path, config2_path=config2_path,
        epoch_path=epoch_path)

    assert do_fit1
    if do_fit1:
        assert os.path.exists(model1_path), model1_path
        assert os.path.exists(config1_path), config1_path
    if do_fit2:
        assert os.path.exists(model2_path), model2_path
        assert os.path.exists(config2_path), config2_path
    print('****: best_epochs=%s - %s Add 1 to these' % (self.best_epochs, self.description))
    del lstm
def evaluate_reductions(self, get_clf, predict_methods):
    predict_methods_all = predict_methods + ['BEST']
    auc_reductions = {method: np.zeros((self.n, len(LABEL_COLS)), dtype=np.float64)
                      for method in predict_methods_all}
    best_methods = []
    for i in range(self.n):
        ok, reductions, best = self._evaluate_reductions(get_clf, i, predict_methods)
        best_methods.append(best)
        if not ok:
            return ok, {}, best_methods
        for method in predict_methods_all:
            auc = auc_reductions[method]
            auc[i, :] = reductions[method]
            print('evaluate_reductions: method=%s' % method)
            show_auc(auc[:i + 1, :])
    xprint('program=%s train=%s' % (sys.argv[0], dim(self.train)))
    return True, auc_reductions, best_methods
def _get_paths(self, create_dir):
    model_dir = get_model_dir(self.model_name, 0)
    if create_dir:
        os.makedirs(model_dir, exist_ok=True)
    # RocAucEvaluation saves the trainable part of the model
    model1_path = os.path.join(model_dir, 'model')
    config1_path = os.path.join(model_dir, 'config.json')
    model2_path = os.path.join(model_dir, 'model2')
    config2_path = os.path.join(model_dir, 'config2.json')
    epoch_path = os.path.join(model_dir, 'epochs.json')
    if not self._shown_paths:
        xprint('model1_path=%s exists=%s' % (model1_path, os.path.exists(model1_path)))
        xprint('config1_path=%s exists=%s' % (config1_path, os.path.exists(config1_path)))
        xprint('model2_path=%s exists=%s' % (model2_path, os.path.exists(model2_path)))
        xprint('config2_path=%s exists=%s' % (config2_path, os.path.exists(config2_path)))
        xprint('epoch_path=%s exists=%s' % (epoch_path, os.path.exists(epoch_path)))
        self._shown_paths = True
    return (model1_path, config1_path), (model2_path, config2_path), epoch_path
def show_scores(scores, force=False):
    global scores_t0, scores_len

    if not force:
        if not scores or len(scores) == scores_len:
            return
        if time.clock() < scores_t0 + 60.0:
            return
    scores_t0 = time.clock()
    scores_len = len(scores)
    scores.sort(key=lambda x: (-x[0], x[2]))

    xprint('!' * 80)
    with open('all.results3.txt', 'wt') as f:
        for i, (score, col_scores, params, desc) in enumerate(scores):
            if i < 10:
                xprint('%4d: auc=%.3f %s %s %s' % (i, score, col_scores, params, desc))
            print('%4d: auc=%.3f %s %s %s' % (i, score, col_scores, params, desc), file=f)
def test():
    utils.xprint("Test arg processing ...", newline=True)
    try:
        DumpPrediction.process_command_line_args([
            "-f", "'[case]2018-01-18-20-53-10'",
            "-e", "1",
            "-n", "'demo'",
            "-t", "'res/dump/demo'"
        ])
        DumpPrediction.process_command_line_args([
            "-f", "'[case]2018-01-18-20-53-10'",
            "-e", "1",
            "-n", "'demo'"
        ])
    except:
        raise
    try:
        DumpPrediction.process_command_line_args(
            ["-e", "1", "-n", "'demo'", "-t", "'res/dump'"])
    except ValueError as e:
        utils.xprint("""Exception correctly caught: "%s"...""" % e, newline=True)
def evaluate_params(evaluator, trial, n_hidden, dropout, max_features,
                    learning_rate, maxlen, n_folds, embed_name, embed_size, n=1):

    def get_clf():
        return ClfLstmGlove(n_hidden=n_hidden, embed_name=embed_name,
                            embed_size=embed_size, maxlen=maxlen,
                            max_features=max_features, dropout=dropout,
                            epochs=epochs, learning_rate=learning_rate,
                            n_folds=1)

    xprint('#' * 80)
    xprint(get_clf())
    seed_random(seed=trial + 1000)
    xprint('evaluate_params(n_hidden=%d, dropout=%.3f, max_features=%d, learning_rate=%s)' % (
        n_hidden, dropout, max_features, learning_rate))
    xprint(get_clf())
    ok, auc = evaluator.evaluate(get_clf)
    xprint('=' * 80)
    return ok, auc, str(get_clf())
def build_lstm1(embeddings, shape, settings):
    model = Sequential()
    model.add(
        Embedding(
            embeddings.shape[0],
            embeddings.shape[1],
            input_length=shape['max_length'],
            trainable=False,
            weights=[embeddings],
            mask_zero=True
        )
    )
    model.add(TimeDistributed(Dense(shape['n_hidden'], use_bias=False)))
    model.add(Bidirectional(LSTM(shape['n_hidden'],
                                 recurrent_dropout=settings['dropout'],
                                 dropout=settings['dropout'])))
    model.add(Dense(shape['n_class'], activation='sigmoid'))
    xprint('build_lstm1: embeddings=%s shape=%s' % (dim(embeddings), shape))
    return model
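# compile_lstm is called from do_train above but is not shown in this section;
# a minimal sketch, assuming binary cross-entropy with an Adam optimizer (the
# usual pairing for multi-label sigmoid outputs). The actual implementation may
# differ, e.g. in its optimizer or metrics.
def compile_lstm_sketch(model, learn_rate):
    from keras.optimizers import Adam
    model.compile(optimizer=Adam(lr=learn_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model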
def predict_reductions(self, test, predict_methods):
    print('ClfSpacy.predict', '-' * 80)
    X_test = df_to_sentences(test)
    (model1_path, config1_path), (model2_path, config2_path), _ = self._get_paths(False)

    frozen = self.frozen
    if not frozen and not (os.path.exists(model2_path) and os.path.exists(config2_path)):
        xprint('unfrozen but no improvement over frozen. Using frozen')
        frozen = True
    if frozen:
        model_path, config_path = model1_path, config1_path
    else:
        model_path, config_path = model2_path, config2_path
    assert os.path.exists(model_path), model_path
    assert os.path.exists(config_path), config_path

    return predict_reductions(model_path, config_path, frozen, X_test,
                              methods=predict_methods, max_length=self.max_length)
def load_data():
    train = pd.read_csv(join(TOXIC_DATA_DIR, 'train.csv'))
    test = pd.read_csv(join(TOXIC_DATA_DIR, 'test.csv'))
    subm = pd.read_csv(join(TOXIC_DATA_DIR, 'sample_submission.csv'))
    xprint('train,test,subm:', train.shape, test.shape, subm.shape)
    n_samples = get_n_samples()
    if n_samples > 0:
        train = train[:n_samples]
        test = test[:n_samples]
    seed_random()
    xprint('train=%d test=%d (%.1f%%)' % (len(train), len(test),
                                          100.0 * len(test) / len(train)))
    # There are a few empty comments that we need to get rid of,
    # otherwise sklearn will complain.
    train[COMMENT].fillna('_na_', inplace=True)
    test[COMMENT].fillna('_na_', inplace=True)
    return train, test, subm
def make_submission(get_clf, submission_name):
    seed_random()
    submission_path = join(SUBMISSION_DIR, '%s.%s.csv' % (submission_name, get_n_samples_str()))
    assert not os.path.exists(submission_path), submission_path
    os.makedirs(SUBMISSION_DIR, exist_ok=True)

    train, test, subm = load_data()
    clf = get_clf()
    clf.fit(train, test_size=0.0)
    pred = clf.predict(test)
    describe(pred)

    # Create the submission file.
    submid = pd.DataFrame({'id': subm['id']})
    submission = pd.concat([submid, pd.DataFrame(pred, columns=LABEL_COLS)], axis=1)
    submission.to_csv(submission_path, index=False)
    xprint('Saved in %s' % submission_path)
    xprint('program=%s train=%s test=%s submission=%s' % (sys.argv[0],
           dim(train), dim(test), dim(submission)))
def build_lstm2(embeddings, shape, settings):
    # Functional-API version, kept for reference:
    # inp = Input(shape=(shape['max_length'],))
    # x = Embedding(
    #     embeddings.shape[0],
    #     embeddings.shape[1],
    #     input_length=shape['max_length'],
    #     trainable=False,
    #     weights=[embeddings],
    #     mask_zero=True
    # )(inp)
    # x = Bidirectional(LSTM(shape['n_hidden'],
    #                        recurrent_dropout=settings['dropout'],
    #                        dropout=settings['dropout']))(x)
    # x = GlobalMaxPool1D()(x)
    # x = BatchNormalization()(x)
    # x = Dense(50, activation="relu")(x)
    # # x = BatchNormalization()(x)
    # x = Dropout(dropout)(x)
    # x = Dense(shape['n_class'], activation='sigmoid')(x)
    # model = Model(inputs=inp, outputs=x)

    model = Sequential()
    model.add(
        Embedding(
            embeddings.shape[0],
            embeddings.shape[1],
            input_length=shape['max_length'],
            trainable=False,
            weights=[embeddings],
            mask_zero=False
        )
    )
    model.add(TimeDistributed(Dense(shape['n_hidden'], use_bias=False), name='td2'))
    model.add(Bidirectional(LSTM(shape['n_hidden'], return_sequences=True,
                                 recurrent_dropout=settings['dropout'],
                                 dropout=settings['dropout'])))
    model.add(GlobalMaxPool1D())
    model.add(BatchNormalization())
    model.add(Dense(shape['n_class'], activation='sigmoid'))
    xprint('build_lstm2: embeddings=%s shape=%s' % (dim(embeddings), shape))
    return model
def pipe(self, docs, batch_size=1000, n_threads=-1):
    interval = 10
    t0 = time.clock()
    i = 0
    k = 0
    for minibatch in cytoolz.partition_all(batch_size, docs):
        minibatch = list(minibatch)
        for doc in minibatch:
            Xs = get_features(doc.sents, self.max_length)
            ys = self._model.predict(Xs)
            if i >= interval:
                xprint('SentimentAnalyser.pipe: %4d docs %5d sents %.1f sec' % (
                    i, k, time.clock() - t0))
                interval *= 2
            for method in self.methods:
                y = reduce(ys, method=method)
                assert len(y.shape) == 1 and len(y) == ys.shape[1], (ys.shape, y.shape)
                doc.user_data[method] = y
            yield doc
            i += 1
            k += ys.shape[0]
    xprint('SentimentAnalyser.pipe: %4d docs %5d sents %.1f sec TOTAL' % (
        i, k, time.clock() - t0))
def dump_pan(frompath, nodename):
    try:
        filenames = []
        if nodename is not None:
            this = '%s.trace' % nodename
            filenames += [this]
        else:
            filenames = utils.filer.list_directory(frompath, r'.*\.trace')
        if len(filenames) == 0:
            raise ValueError("No trace file is found under '%s' with node name '%s'."
                             % (frompath, nodename))

        for filename in filenames:
            nodename, _ = utils.filer.split_extension(filename)
            sampler = Sampler(path=frompath, nodes=[nodename], keep_positive=False)
            pan = sampler.pan_to_positive()
            panfile = '%s.pan' % nodename
            panfile = utils.filer.format_subpath(frompath, panfile, isfile=True)
            utils.filer.write(panfile, '%s\t%s' % (pan[0], pan[1]), mode='w')
            utils.xprint("Pan information (%s) for node %s is dumped to '%s'."
                         % (pan, nodename, panfile), newline=True)
    except:
        raise
def build_lstm8(embeddings, shape, settings):
    """Flatten rather than pool"""
    model = Sequential()
    model.add(
        Embedding(
            embeddings.shape[0],
            embeddings.shape[1],
            input_length=shape['max_length'],
            trainable=False,
            weights=[embeddings],
            mask_zero=False,
            name='eembed'
        )
    )
    model.add(TimeDistributed(Dense(shape['n_hidden'], use_bias=False, name='td8')))
    model.add(Bidirectional(LSTM(shape['n_hidden'], return_sequences=True,
                                 recurrent_dropout=settings['dropout'],
                                 dropout=settings['dropout']), name='bidi'))
    model.add(Flatten(name='flaaten'))
    model.add(BatchNormalization())
    model.add(Dropout(settings['dropout'] / 2.0))
    model.add(Dense(shape['n_class'], activation='sigmoid'))
    xprint('build_lstm8: embeddings=%s shape=%s' % (dim(embeddings), shape))
    return model
def _evaluate(self, get_clf, i, do_clips=False):
    xprint('_evaluate %3d of %d %s' % (i, self.n, '-' * 66))
    assert 0 <= i < len(self.shuffled_indexes), (i, self.n, len(self.shuffled_indexes))
    train_part, test_part = split_data(self.train, self.shuffled_indexes[i], self.frac)
    CLIPS = [0.0, 1.0e-6, 1.0e-5, 1.0e-4, 1.0e-3, 1.0e-2, 0.1, 0.2, 0.3, 0.5, 0.8, 0.9]
    auc = np.zeros(len(LABEL_COLS), dtype=np.float64)
    clf = None
    try:
        clf = get_clf()
        t0 = time.clock()
        clf.fit(train_part)
        print('_evaluate %d fit duration=%.1f sec' % (i, time.clock() - t0))
        t0 = time.clock()
        pred = clf.predict(test_part)
        print('_evaluate %d predict duration=%.1f sec' % (i, time.clock() - t0))
        print('!!! _evaluate pred=%s' % dim(pred))
    except Exception as e:
        xprint('!!! _evaluate, exception=%s' % e)
        return False, auc

    if do_clips:
        for k, delta in enumerate(CLIPS):
            auc = np.zeros(len(LABEL_COLS), dtype=np.float64)
            for j, col in enumerate(LABEL_COLS):
                y_true = test_part[col]
                y_pred = np.clip(pred[:, j], 0.0, 1.0 - delta)
                auc[j] = roc_auc_score(y_true, y_pred)
            mean_auc = auc.mean()
            xprint('%5d: %d: delta=%6g auc=%.5f %s' % (i, k, delta, mean_auc, label_score(auc)))

    auc = np.zeros(len(LABEL_COLS), dtype=np.float64)
    for j, col in enumerate(LABEL_COLS):
        y_true = test_part[col]
        y_pred = pred[:, j]
        auc[j] = roc_auc_score(y_true, y_pred)
    mean_auc = auc.mean()
    xprint('%5d: auc=%.3f %s' % (i, mean_auc, label_score(auc)))
    describe(pred)
    show_best_worst(test_part, pred, n=3, do_best=False)
    if clf is not None:
        del clf
    return True, auc
def display_results(completed_tests, do_max, n_rank):
    n_completed = len(completed_tests)
    n_runs = min(len(v) for v in completed_tests.values())
    auc = np.zeros((n_completed, n_runs), dtype=np.float64)

    l1 = set(completed_tests)
    l2 = {simplify(clf_str) for clf_str in l1}  # simplify() must not merge keys
    assert len(l1) == len(l2), sorted(l1 - l2)

    clf_auc = {}
    for clf_str, runs in completed_tests.items():
        clf_str = simplify(clf_str)
        n_runs = min(len(v) for v in runs)
        runs = runs[:n_runs]
        # print('runs=%d %s' % (len(runs), clf_str))
        auc = np.zeros((n_runs, len(LABEL_COLS)), dtype=np.float64)
        for i, v in enumerate(runs):
            # print('v=%s' % v)
            auc[i, :] = np.array(v[1], dtype=np.float64)
        reduced_auc = auc.max(axis=0) if do_max else auc.mean(axis=0)

        # Skip entries whose scores are effectively identical to one already seen.
        duplicate = False
        if clf_auc:
            previous_auc = [v for _, v in clf_auc.values()]
            for p in previous_auc:
                d = reduced_auc - p
                if np.abs(d).max() < 1e-6:
                    duplicate = True
                    break
        if duplicate:
            continue
        clf_auc[clf_str] = (n_runs, reduced_auc)

    best = defaultdict(list)
    for j, col in enumerate(LABEL_COLS + ['ALL']):
        xprint('#' * 100)
        method = 'MAX' if do_max else 'MEAN'
        xprint('RESULTS SUMMARY: %d - %d:%s %s %d' % (len(clf_auc), j, col, method, n_runs))
        if col == 'ALL':
            clf_order = sorted(clf_auc, key=lambda k: -clf_auc[k][1].mean())
        else:
            clf_order = sorted(clf_auc, key=lambda k: -clf_auc[k][1][j])
        clf0 = clf_order[0]
        if col == 'ALL':
            best[clf0].append((col, clf_auc[clf0][1].mean()))
        else:
            best[clf0].append((col, clf_auc[clf0][1][j]))
        # q, p = [clf_auc[clf][1] for clf in clf_order[:2]]
        # d = q - p
        # assert d.any() > 1e-4, (q, p, clf_order[:2])
        for i, clf in enumerate(clf_order[:n_rank]):
            n_runs, auc = clf_auc[clf]
            xprint('auc=%.4f %3d: %s %s' % (auc.mean(), i, auc, clf))
    return best
def _init_config_():
    """Initialize global variables in the config module."""
    try:
        echo = config.update_config_from_file(PATH_CONFIG, group='default')
        utils.xprint(echo, newline=True)
        if __debug__:
            echo = config.update_config_from_file(PATH_CONFIG, group='debug')
            utils.xprint(echo, newline=True)
        else:
            echo = config.update_config_from_file(PATH_CONFIG, group='run')
            utils.xprint(echo, newline=True)
    except:
        raise
def beam_search(list_list, beam_size=3, n=1):
    xprint('-' * 80)
    xprint('beam_search:')
    evaluator = Evaluator(n=n)
    scores = []
    beam = [tuple()]
    params_auc = {}
    trial = 0
    t0 = time.clock()
    for k, klist in enumerate(list_list):
        for bval in beam:
            for kval in klist:
                params = blend(bval, k, kval)
                if params in params_auc:
                    continue
                if not valid_embedding_params(*params):
                    continue
                print('###', len(params), params)
                ok, auc, desc = get_auc(evaluator, trial, params)
                if not ok:
                    print('&&& Exception in classifier')
                    continue
                print('^^^ trial=%d duration=%.1f sec' % (trial, time.clock() - t0))
                score, col_scores = auc_score(auc)
                scores.append((score, col_scores, params, desc))
                params_auc[params] = col_scores
                trial += 1
                show_scores(scores)
        scores.sort(key=lambda x: (-x[0], x[2]))
        beam = [params for _, _, params, _ in scores[:beam_size]]
        show_scores(scores, force=True)
        xprint(k, '|' * 80)
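# Hypothetical invocation: each inner list enumerates the candidate values for
# one hyperparameter slot, and blend()/get_auc() (defined elsewhere in this
# module) turn partial tuples into classifiers. The beam keeps only the top
# `beam_size` settings at each level instead of expanding the full grid.
#
#     beam_search([[64, 128, 256],     # e.g. n_hidden
#                  [0.1, 0.3, 0.5],    # e.g. dropout
#                  [0.001, 0.0005]],   # e.g. learning rate
#                 beam_size=3, n=1)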
def get_clf46():
    return ClfSpacy(n_hidden=512, max_length=75,  # Shape
                    dropout=0.3, learn_rate=0.0005,  # General NN config
                    epochs=epochs, batch_size=300, frozen=frozen,
                    lstm_type=lstm_type, predict_method=predict_method)


clf_list = [get_clf45, get_clf40, get_clf43, get_clf44, get_clf46, get_clf41, get_clf42]
lstm_list = [10, 9]
frozen_list = [True]

xprint_init('%s.%s' % (submission_name, get_n_samples_str()), False)
auc_list = []
completed_tests = load_json(run_summary_path, {})
xprint('run_summary_path=%s' % run_summary_path)
n_completed0 = len(completed_tests)

for n_runs0 in range(3):
    print('n_completed0=%d n_runs0=%d' % (n_completed0, n_runs0))
    for get_clf in clf_list:
        for lstm_type in lstm_list:
            for frozen in frozen_list:
                xprint('#' * 80)
                predict_method = PREDICT_METHODS_GOOD[0]
                clf_str = str(get_clf())
                xprint(clf_str)
                runs = completed_tests.get(clf_str, [])
                if len(runs) > n_runs0:
                    xprint('skipping runs=%d n_runs0=%d' % (len(runs), n_runs0))
                    continue
def post_run():
    utils.csv_dump(resDataFrame, ftype='result')
    utils.xprint(os.linesep + 'Done!')
    ])
except:
    raise
try:
    DumpPrediction.process_command_line_args(
        ["-e", "1", "-n", "'demo'", "-t", "'res/dump'"])
except ValueError as e:
    utils.xprint("""Exception correctly caught: "%s"...""" % e, newline=True)
try:
    DumpPrediction.process_command_line_args([
        "-f", "'[case]2018-01-18-20-53-10'",
        "-e", "invalid-epoch",
        "-n", "'demo'"
    ])
except ValueError as e:
    utils.xprint("""Exception correctly caught: "%s"...""" % e, newline=True)
utils.xprint("Fine.", newline=True)

utils.xprint("Test dumping ...", newline=True)
try:
    DumpPrediction.dump_predictions('log/', 'log/[case]2018-01-18-20-53-10',
                                    0, 'demo0', 'res/dump/demo')
    DumpPrediction.dump_predictions('log/', '[case]2018-01-18-20-53-10',
                                    1, 'demo1', 'res/dump/demo')
    DumpPrediction.dump_predictions('log/', '2018-01-18-20-53-10',
                                    2, 'demo2')
except:
    raise
    return ClfSpacy(n_hidden=512, max_length=75,  # Shape
                    dropout=0.5, learn_rate=0.005,  # General NN config
                    epochs=epochs, batch_size=150, frozen=frozen,
                    lstm_type=lstm_type, predict_method=predict_method)


clf_list = [get_clf22, get_clf23, get_clf24, get_clf25]
lstm_list = [6, 7, 8, 9]
frozen_list = [True]

xprint_init('%s.%s' % (submission_name, get_n_samples_str()), False)
auc_list = []
completed_tests = load_json(run_summary_path, {})
xprint('run_summary_path=%s' % run_summary_path)
n_completed0 = len(completed_tests)

for n_runs0 in range(3):
    print('n_completed0=%d n_runs0=%d' % (n_completed0, n_runs0))
    for lstm_type in lstm_list:
        for get_clf in clf_list:
            for frozen in frozen_list:
                xprint('#' * 80)
                predict_method = PREDICT_METHODS_GOOD[0]
                clf_str = str(get_clf())
                xprint(clf_str)
                runs = completed_tests.get(clf_str, [])
                if len(runs) > n_runs0:
                    xprint('skipping runs=%d n_runs0=%d' % (len(runs), n_runs0))
                    continue
@staticmethod
def dump_predictions(rootpath, frompath, epoch, dumpname, topath=None):
    """
    Find predictions of a given epoch in the training & testing logs, and
    dump them to `topath`.
    :param rootpath: Root directory that contains the log folders
    :param frompath: Log folder / the identifier of an execution
    :param epoch: <int> epoch entry
    :param dumpname: Filename stem for the dumped files, e.g. a node id
    :param topath: Destination path for dumping
    """
    try:
        frompath = DumpPrediction.find_logpath(rootpath, frompath)
        frompath = utils.filer.format_subpath(
            frompath, subpath=utils.get_config('path_compare'), isfile=False)
        # utils.filer.create_path(topath)
        logname_train = 'train-epoch%d.log' % epoch
        logname_test = 'test-epoch%d.log' % epoch
        dumpname_train = '%s.train.trace' % dumpname
        dumpname_test = '%s.test.trace' % dumpname
        dumpname_full = '%s.full.trace' % dumpname
        dumpname_train = utils.filer.format_subpath(topath, dumpname_train)
        dumpname_test = utils.filer.format_subpath(topath, dumpname_test)
        dumpname_full = utils.filer.format_subpath(topath, dumpname_full)

        path_train = utils.filer.format_subpath(frompath, subpath=logname_train, isfile=True)
        pred_train = DumpPrediction.read_predictions_from_compare_file(path_train)
        if pred_train is not None:
            DumpPrediction.save_triples_to_file(pred_train, dumpname_train)
            utils.xprint("Training predictions in '%s' are dumped to '%s'."
                         % (path_train, dumpname_train), newline=True)
        else:
            utils.warn("Cannot find file '%s' (for training epoch %d)." % (path_train, epoch))

        path_test = utils.filer.format_subpath(frompath, subpath=logname_test, isfile=True)
        pred_test = DumpPrediction.read_predictions_from_compare_file(path_test)
        if pred_test is not None:
            DumpPrediction.save_triples_to_file(pred_test, dumpname_test)
            utils.xprint("Testing predictions in '%s' are dumped to '%s'."
                         % (path_test, dumpname_test), newline=True)
            # Both are available
            if pred_train is not None:
                utils.filer.write(dumpname_full, utils.filer.read(dumpname_train))
                utils.filer.write(dumpname_full, utils.filer.read(dumpname_test))
                utils.xprint("Full predictions in '%s' & '%s' are dumped to '%s'."
                             % (path_train, path_test, dumpname_full), newline=True)
        # Both are unavailable
        elif pred_train is None:
            raise IOError("Cannot find file '%s' & '%s'." % (path_train, path_test))
        else:
            utils.warn("Cannot find file '%s' (for testing epoch %d)." % (path_test, epoch))
    except:
        raise
"""
    SpaCy deep_learning_keras.py solution to Kaggle Toxic Comment challenge
"""
from utils import xprint_init, xprint
from framework import Evaluator, set_random_seed, make_submission, set_n_samples
from clf_spacy import ClfSpacy

submission_name = 'spacy_lstm10'
do_submission = True
epochs = 6


def get_clf():
    return ClfSpacy(n_hidden=128, max_length=100,  # Shape
                    dropout=0.5, learn_rate=0.001,  # General NN config
                    epochs=epochs, batch_size=150, frozen=True,
                    lstm_type=2)


xprint_init(submission_name, do_submission)
xprint('#' * 80)
xprint(get_clf())
set_random_seed(seed=1234)

if do_submission:
    make_submission(get_clf, submission_name)
else:
    evaluator = Evaluator(n=3)
    ok, auc = evaluator.evaluate(get_clf)
xprint('$' * 80)
def set_random_seed(seed):
    xprint('set_random_seed: seed=%d' % seed)
    _random_seed[0] = seed
    seed_random()
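# seed_random is used throughout this section but not shown; a minimal sketch,
# assuming it re-seeds Python's and numpy's RNGs from the module-level
# _random_seed holder that set_random_seed() writes to.
def seed_random_sketch():
    import random
    import numpy as np
    random.seed(_random_seed[0])
    np.random.seed(_random_seed[0])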