Esempio n. 1
0
File: main.py Progetto: mingsxs/lky
def pre_run():
    """Populate the module-level run parameters and load both CSV inputs.

    The values come from ``utils.cli_init_params()``; every key listed below
    is copied into the module global of the same name.  ``dataFrame`` and
    ``values`` are then loaded with ``utils.csv_open`` using ``T`` as the
    ``tb`` argument and ``before=True``.
    """
    global tb, m, n, T, open_lim, loss_lim, g, U, M, dataFrame, values

    cli_params = utils.cli_init_params()
    module_vars = globals()
    # Same keys, in the same order, as the individual assignments they replace.
    for key in ('tb', 'm', 'n', 'T', 'open_lim', 'loss_lim', 'g', 'U', 'M'):
        module_vars[key] = cli_params[key]

    # Hard-coded values once used instead of the CLI (kept for reference):
    #   tb='2018-08-20', M=2, m=3, n=5, T='2019-02-20',
    #   open_lim=1, loss_lim=3, g=0.2, U=1

    dataFrame = utils.csv_open(
        DATA_CSV_FILE, tb=T, before=True, ftype='data')
    values = utils.csv_open(
        VALUE_CSV_FILE, tb=T, before=True, ftype='value')

    done_msg = os.linesep + 'Parameters initialized successfully!'
    utils.xprint(done_msg)
def build_lstm4(embeddings, shape, settings):
    """Build a Sequential model that flattens the BiLSTM output into dense layers.

    Layer order: non-trainable Embedding initialised from `embeddings`
    ('eembed'), a bias-free per-timestep Dense, a bidirectional LSTM that
    returns sequences, Flatten, BatchNormalization, a ReLU Dense and a
    sigmoid Dense with shape['n_class'] units.

    Reads shape['max_length'], shape['n_hidden'], shape['n_class'] and
    settings['dropout'].  Returns the model; it is not compiled here.
    """
    model = Sequential()

    embedding = Embedding(
        embeddings.shape[0],
        embeddings.shape[1],
        input_length=shape['max_length'],
        trainable=False,
        weights=[embeddings],
        mask_zero=False,
        name='eembed',
    )
    model.add(embedding)

    projection = Dense(shape['n_hidden'], use_bias=False, name='td4')
    model.add(TimeDistributed(projection))

    n_hidden = shape['n_hidden']
    rate = settings['dropout']
    recurrent = LSTM(n_hidden, return_sequences=True,
                     recurrent_dropout=rate, dropout=rate)
    model.add(Bidirectional(recurrent))

    model.add(Flatten(name='flaaten'))
    model.add(BatchNormalization())

    # Hidden width: ceil of the geometric mean of n_hidden and n_class.
    geometric_mean = math.sqrt(shape['n_hidden'] * shape['n_class'])
    n_dense = int(math.ceil(geometric_mean))
    model.add(Dense(n_dense, activation='relu'))
    # A second BatchNormalization and a Dropout before the output layer are
    # intentionally left out.
    model.add(Dense(shape['n_class'], activation='sigmoid'))

    xprint('build_lstm4: embeddings=%s shape=%s' % (dim(embeddings), shape))
    return model
def describe(y):
    """Print a table with the min, mean and max of every label column of `y`.

    Column j of `y` is reported under LABEL_COLS[j].  All output goes
    through xprint; nothing is returned.
    """
    measures = ('min', 'mean', 'max')
    n_cols = len(LABEL_COLS)
    stats = np.zeros((len(measures), n_cols), dtype=np.float64)
    xprint('stats=%s' % dim(stats))
    for j in range(n_cols):
        column = y[:, j]
        stats[:, j] = [column.min(), column.mean(), column.max()]

    def emit_row(title, cells, sep='|'):
        # Every cell, including the row title, is right-aligned to 12 chars.
        padded = ['%12s' % cell for cell in [title] + cells]
        xprint((' %s ' % sep).join(padded))

    def emit_rule():
        dashes = '-' * 12
        emit_row(dashes, [dashes] * n_cols, sep='+')

    emit_rule()
    emit_row('', LABEL_COLS)
    emit_rule()
    for measure, row in zip(measures, stats):
        emit_row(measure, ['%10.4f' % value for value in row])
    emit_rule()
Esempio n. 4
0
File: main.py Progetto: mingsxs/lky
def run():
    """Filter the stock pairs and trade them, storing the result globally.

    Reads the module-level parameters and tables, and writes the value
    returned by ``trade.trade`` to the global ``resDataFrame``.
    """
    global resDataFrame

    pairs = strategy.filter_pairs(dataFrame, sequence=FILTER_SEQ, number=n)
    resDataFrame = trade.trade(
        tb, m, open_lim, loss_lim, g, U, dataFrame, values, pairs, M)

    done_msg = os.linesep + 'Trade processed successfully!'
    utils.xprint(done_msg)
def build_lstm9(embeddings, shape, settings):
    """Build a Sequential model with two stacked bidirectional LSTM layers.

    Layer order: non-trainable Embedding initialised from `embeddings`, a
    bias-free per-timestep Dense ('td9a'), BiLSTM 'bidi9a', BiLSTM 'bidi9b',
    global max pooling ('mp9'), batch normalisation ('bn9'), dropout at half
    of settings['dropout'] ('drop9b') and a sigmoid Dense with
    shape['n_class'] units ('den9b').

    Returns the model; it is not compiled here.
    """
    def bilstm(layer_name):
        # Both recurrent layers share one configuration and differ by name.
        n_hidden = shape['n_hidden']
        rate = settings['dropout']
        cell = LSTM(n_hidden, return_sequences=True,
                    recurrent_dropout=rate, dropout=rate)
        return Bidirectional(cell, name=layer_name)

    model = Sequential()

    embedding = Embedding(
        embeddings.shape[0],
        embeddings.shape[1],
        input_length=shape['max_length'],
        trainable=False,
        weights=[embeddings],
        mask_zero=False,
    )
    model.add(embedding)

    projection = Dense(shape['n_hidden'], use_bias=False)
    model.add(TimeDistributed(projection, name='td9a'))

    # No pooling, normalisation or extra projection between the two LSTMs.
    model.add(bilstm('bidi9a'))
    model.add(bilstm('bidi9b'))

    model.add(GlobalMaxPool1D(name='mp9'))
    model.add(BatchNormalization(name='bn9'))
    model.add(Dropout(settings['dropout'] / 2.0, name='drop9b'))
    model.add(Dense(shape['n_class'], activation='sigmoid', name='den9b'))

    xprint('build_lstm9: embeddings=%s shape=%s' % (dim(embeddings), shape))
    return model
Esempio n. 6
0
def filter_pairs(dataFrame, sequence=['coint', 'AR1', 'distance'], number=0):
    """Build every stock pair from `dataFrame` and run it through a filter chain.

    Each pair becomes a tuple ``(code_a, code_b, info)``.  Before any filter
    runs, ``info['bval']`` is set to ``utils.Liner_Regression`` applied to
    the two series made of the first element of every row of each stock.

    For every entry of `sequence`, the module-level global whose name
    contains that entry is looked up (when several names match, the last one
    in ``globals()`` iteration order wins) and called as
    ``filter(stk_pairs, dataFrame)``; its return value replaces the pair list.

    Args:
        dataFrame: mapping from stock code to a sequence of rows.
        sequence: name fragments of the filters to apply, in order.  The
            default list is never mutated.
        number: not used by this implementation.
            NOTE(review): presumably meant to cap the number of pairs
            returned -- confirm against the callers.

    Returns:
        The list of pairs left after the last filter.

    Raises:
        RuntimeError: if an entry of `sequence` matches no module-level name.
    """
    def first_column(pair, position):
        # `pair` is passed explicitly instead of being captured from the
        # enclosing loop, so the helper cannot read a stale loop variable.
        return [row[0] for row in dataFrame[pair[position]]]

    stk_pairs = [(pair[0], pair[1], {})
                 for pair in utils.get_stkcd_pairs(dataFrame)]
    utils.xprint(os.linesep + 'Originally, %d pairs are created...' %
                 (len(stk_pairs)))

    for pair in stk_pairs:
        pair[2]['bval'] = utils.Liner_Regression(first_column(pair, 0),
                                                 first_column(pair, 1))

    container = globals()
    for item in sequence:
        # Reset per item: an unmatched entry must fail loudly instead of
        # silently re-running the filter found for the previous entry.
        foo_name = None
        for var in container:
            if item in var:
                foo_name = var
        if foo_name is None:
            raise RuntimeError('Invalid filter method sequence')
        foo = container[foo_name]

        stk_pairs = foo(stk_pairs, dataFrame)
        utils.xprint('After %s, %d pairs left...' %
                     (foo.__doc__, len(stk_pairs)))
        time.sleep(SLEEP_DURATION)

    # Callers use the result (e.g. `stk_pairs = strategy.filter_pairs(...)`),
    # so the filtered list has to be handed back.
    return stk_pairs
    def fit(self, train, test_size=0.1):
        """Train the LSTM on `train` and save its JSON config in the model directory.

        When `test_size` is greater than zero, that fraction of the
        sentences is held out with train_test_split and passed to do_train
        as validation data; otherwise no validation data is passed.  The
        epochs reported by do_train are stored in self.best_epochs.
        """
        model_dir = get_model_dir(self.model_name, 0)
        # RocAucEvaluation saves the trainable part of the model
        model_path = os.path.join(model_dir, 'model')
        os.makedirs(model_dir, exist_ok=True)
        xprint('ClfCharLstm.fit: model_dir=%s' % model_dir)

        labels = train[LABEL_COLS].values
        sentences = df_to_sentences(train)
        if test_size > 0.0:
            sentences, val_sentences, labels, val_labels = train_test_split(
                sentences, labels, test_size=test_size)
        else:
            val_sentences = val_labels = None

        lstm_shape = {
            'n_hidden': self.n_hidden,
            'max_length': self.max_length,
            'n_class': len(LABEL_COLS),
        }
        lstm_settings = {
            'dropout': self.dropout,
            'lr': self.learn_rate,
        }
        trained = do_train(
            sentences, labels, val_sentences, val_labels,
            lstm_shape, lstm_settings, {},
            epochs=self.epochs, batch_size=self.batch_size, frozen=self.frozen,
            lstm_type=self.lstm_type, model_path=model_path)
        lstm, self.best_epochs = trained

        config_path = os.path.join(model_dir, 'config.json')
        with open(config_path, 'wt') as f:
            f.write(lstm.to_json())

        print('****: best_epochs=%s - %s' % (self.best_epochs, self.description))
 def evaluate(self, get_clf):
     """Run self._evaluate for each of the self.n splits and collect the AUCs.

     Returns (ok, auc) where `auc` has one row per split and one column per
     label.  Stops at the first split whose evaluation reports failure and
     returns that status together with the rows filled so far.
     """
     n_labels = len(LABEL_COLS)
     auc = np.zeros((self.n, n_labels), dtype=np.float64)
     for i in range(self.n):
         ok, row = self._evaluate(get_clf, i)
         # The row is stored even for a failed split.
         auc[i, :] = row
         if not ok:
             return ok, auc
         show_auc(auc[:i + 1, :])
     xprint('program=%s train=%s' % (sys.argv[0], dim(self.train)))
     return True, auc
def do_train(train_texts, train_labels, dev_texts, dev_labels,
    lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, epochs=5, by_sentence=True,
    frozen=False, lstm_type=1, model_path=None):
    """Train a Keras model on the sentences in `train_texts`
        All the sentences in a text have the text's label

    Training runs in one or two phases.  The first phase fits the model
    returned by ``build_lstm[lstm_type]``.  Unless `frozen` is true, a second
    phase marks every layer trainable, recompiles with a tenth of the
    learning rate and fits again.

    Args:
        train_texts, train_labels: training documents and their labels.
        dev_texts, dev_labels: validation documents and labels, or None to
            train without validation, ROC-AUC tracking or early stopping.
        lstm_shape: dict; 'max_length' is read here, the rest by the builder.
        lstm_settings: dict; 'lr' is read here, the rest by the builder.
        lstm_optimizer: accepted but not used by this function.
        batch_size, epochs: passed to ``model.fit`` for each phase.
        by_sentence: accepted but not used by this function.
        frozen: when true, skip the second (unfrozen) phase.
        lstm_type: key into ``build_lstm`` selecting the architecture.
        model_path: passed to RocAucEvaluation.

    Returns:
        ``(model, (best_epoch_frozen, best_epoch_unfrozen))``.  A best epoch
        is -1 when that phase did not run or had no validation data.
    """

    print('do_train: train_texts=%s dev_texts=%s' % (dim(train_texts), dim(dev_texts)))

    embeddings, char_index, _ = get_char_embeddings()

    n_train_sents = count_sentences(char_index, train_texts, batch_size, 'train')
    X_train, y_train = make_char_sentences(char_index, lstm_shape['max_length'], batch_size,
        train_texts, train_labels, 'train', n_train_sents)
    validation_data = None
    if dev_texts is not None:
        n_dev_sents = count_sentences(char_index, dev_texts, batch_size, 'dev')
        X_val, y_val = make_char_sentences(char_index, lstm_shape['max_length'], batch_size,
            dev_texts, dev_labels, 'dev', n_dev_sents)
        validation_data = (X_val, y_val)
    sentence_cache.flush()

    model = build_lstm[lstm_type](embeddings, lstm_shape, lstm_settings)
    compile_lstm(model, lstm_settings['lr'])

    # ra_val only exists when there is validation data.  It is defined
    # up front so the validation-free path does not hit an unbound name.
    ra_val = None
    callback_list = None
    if validation_data is not None:
        ra_val = RocAucEvaluation(validation_data=validation_data, interval=1, frozen=frozen,
            model_path=model_path)
        early = EarlyStopping(monitor='val_auc', mode='max', patience=1, verbose=1)
        callback_list = [ra_val, early]

    model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
              validation_data=validation_data, callbacks=callback_list, verbose=1)
    best_epoch_frozen = -1
    if ra_val is not None:
        best_epoch_frozen = ra_val.best_epoch
        # Reset so the second phase reports its own best epoch.
        ra_val.best_epoch = -1

    best_epoch_unfrozen = -1
    if not frozen:
        xprint("Unfreezing")
        for layer in model.layers:
            layer.trainable = True
        compile_lstm(model, lstm_settings['lr'] / 10)
        if ra_val is not None:
            # Reload the best model so far
            lstm_weights = [embeddings] + ra_val.top_weights
            model.set_weights(lstm_weights)
            # Reset early stopping
            early = EarlyStopping(monitor='val_auc', mode='max', patience=1, verbose=1)
            callback_list = [ra_val, early]
        model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
              validation_data=validation_data, callbacks=callback_list, verbose=1)
        if ra_val is not None:
            best_epoch_unfrozen = ra_val.best_epoch

    return model, (best_epoch_frozen, best_epoch_unfrozen)
def split_data(df, indexes, frac):
    """Split `df` into train and test parts by index label.

    The first ``int(len(df) * frac)`` entries of `indexes` select the train
    rows and the remaining entries select the test rows, both via ``df.loc``.
    Returns ``(train, test)``.
    """
    show_values('df', df)

    cut = int(len(df) * frac)
    train_idx, test_idx = indexes[:cut], indexes[cut:]
    train, test = df.loc[train_idx], df.loc[test_idx]

    for label, part in (('train', train), ('test', test)):
        show_values(label, part)
    summary = (frac, len(df), len(train), len(test))
    xprint('split_data: %.2f of %d: train=%d test=%d' % summary)
    return train, test
 def load(cls, path, char_index, max_length, frozen):
     """Rebuild a model from `path` and wrap it in a new `cls` instance.

     The architecture is read from 'config.json' and the weights from the
     pickled file 'model', both inside `path`.  When `frozen` is true the
     character embeddings are fetched again and prepended to the weights,
     and the freshly fetched char_index replaces the `char_index` argument.
     """
     xprint('SentimentAnalyser.load: path=%s max_length=%d' % (path, max_length))
     config_path = os.path.join(path, 'config.json')
     weights_path = os.path.join(path, 'model')

     with open(config_path, 'rt') as f:
         model = model_from_json(f.read())
     # SECURITY: pickle.load executes arbitrary code from the file; only
     # load model files that come from a trusted source.
     with open(weights_path, 'rb') as f:
         lstm_weights = pickle.load(f)

     if frozen:
         embeddings, char_index, index_char = get_char_embeddings()
         lstm_weights = [embeddings] + lstm_weights
     model.set_weights(lstm_weights)
     return cls(char_index, model, max_length=max_length)
def make_submission_reductions(get_clf, submission_name, predict_methods):
    """Fit one classifier and write one submission CSV per prediction method.

    The classifier is trained on the full training set (no validation
    split).  For each method the file
    ``<submission_name>.<n_samples>.<method>.csv`` is written to
    SUBMISSION_DIR.  Existing files are never overwritten: the first one
    found stops the loop.

    Returns True when every method was written, False when the loop stopped
    on an existing file.
    """
    seed_random()
    os.makedirs(SUBMISSION_DIR, exist_ok=True)

    train, test, subm = load_data()
    clf = get_clf()
    clf.fit(train, test_size=0.0)
    reductions = clf.predict_reductions(test, predict_methods)

    all_written = True
    for method in predict_methods:
        file_name = '%s.%s.%s.csv' % (
            submission_name, get_n_samples_str(), method)
        submission_path = join(SUBMISSION_DIR, file_name)
        if os.path.exists(submission_path):
            xprint('make_submission_reductions: submission_path=%s already exists' % submission_path)
            all_written = False
            break

        xprint('make_submission_reduction: method=%s' % method)
        pred = reductions[method]
        describe(pred)

        # Create the submission file: the id column followed by one
        # prediction column per label.
        id_frame = pd.DataFrame({'id': subm['id']})
        pred_frame = pd.DataFrame(pred, columns=LABEL_COLS)
        submission = pd.concat([id_frame, pred_frame], axis=1)
        submission.to_csv(submission_path, index=False)
        xprint('make_submission_reductions: Saved in %s' % submission_path)

        shapes = (sys.argv[0], dim(train), dim(test), dim(submission))
        xprint('program=%s train=%s test=%s submission=%s' % shapes)

    if clf is not None:
        del clf
    return all_written
def process_summary(path, n_rank):
    """Load the run summary at `path` and display its ranked results.

    Returns whatever display_results returns, or an empty dict when it
    raises; in that case the exception is printed and not propagated.
    """
    banner_width = 100
    print('=' * banner_width)
    print('path=%s' % path)
    completed_tests = load_json(path)
    xprint('run_summary_path=%s' % path)
    try:
        best = display_results(completed_tests, do_max, n_rank)
    except Exception as e:
        best = {}
        print('Bad summary: %s' % e)
    print('&' * banner_width)
    return best
def show_auc(auc):
    """Print per-row AUCs, the column-wise mean and a spread summary.

    `auc` is indexed as auc[row, label].  The last line reports the mean,
    standard deviation and range of the per-row means, each also as a
    percentage of the mean.
    """
    n_rows = auc.shape[0]
    per_label = auc.mean(axis=0)
    per_row = auc.mean(axis=1)

    rule = '-' * 110
    xprint(rule, 'n=%d' % n_rows)
    for i in range(n_rows):
        row = auc[i, :]
        xprint('%5d: auc=%.3f %s' % (i, row.mean(), label_score(row)))
    xprint('%5s: auc=%.3f %s' % ('Mean', per_label.mean(), label_score(per_label)))
    xprint(rule)

    centre = per_row.mean()
    spread = per_row.std()
    extent = per_row.max() - per_row.min()
    xprint('auc=%.3f +- %.3f (%.0f%%) range=%.3f (%.0f%%)' % (
        centre, spread,
        100.0 * spread / centre,
        extent,
        100.0 * extent / centre,
    ))
    def fit(self, train, test_size=0.1):
        """Train the classifier on `train` unless a finished model can be re-used.

        Two artefact sets are tracked: (model1_path, config1_path) and
        (model2_path, config2_path).  Epoch counts are read from the record
        at epoch_path under the keys 'epoch1' and 'epoch2'.

        Args:
            train: frame indexed with LABEL_COLS for labels and passed to
                df_to_sentences for the inputs.
            test_size: fraction held out by train_test_split as validation
                data; with a value of 0.0 or less no validation data is used.

        Returns None.  Stores the epochs reported by do_train in
        self.best_epochs.
        """
        print('ClfSpacy.fit', '-' * 80)
        (model1_path, config1_path), (model2_path, config2_path), epoch_path = self._get_paths(True)
        # Re-use check, skipped when self.force_fit is set: a frozen run needs
        # set 1 trained for exactly self.epochs, an unfrozen run needs set 2
        # trained for exactly self.epochs2.
        if not self.force_fit:
            if self.frozen:
                if (os.path.exists(model1_path) and os.path.exists(config1_path) and
                    SaveAllEpochs.epoch_dict(epoch_path)['epoch1'] == self.epochs):
                    xprint('model1_path already exists. re-using')
                    return
            else:
                if (os.path.exists(model2_path) and os.path.exists(config2_path) and
                    SaveAllEpochs.epoch_dict(epoch_path)['epoch2'] == self.epochs2):
                    xprint('model2_path already exists. re-using')
                    return
        # A phase has to run when its files are missing or its recorded epoch
        # count is below the target.  The epoch record is only read when the
        # files exist (short-circuit `or`).
        do_fit1 = (not (os.path.exists(model1_path) and os.path.exists(config1_path)) or
                   SaveAllEpochs.epoch_dict(epoch_path)['epoch1'] < self.epochs)
        # Phase 2 is never run for a frozen classifier.
        do_fit2 = (not self.frozen and (not (os.path.exists(model2_path) and
                                             os.path.exists(config2_path)) or
                   SaveAllEpochs.epoch_dict(epoch_path)['epoch2'] < self.epochs2))

        y_train = train[LABEL_COLS].values
        X_train = df_to_sentences(train)
        X_val, y_val = None, None
        if test_size > 0.0:
            X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=test_size)

        lstm_shape = {'n_hidden': self.n_hidden,
                      'max_length': self.max_length,
                      'n_class': len(LABEL_COLS)}
        lstm_settings = {'dropout': self.dropout,
                         'lr': self.learn_rate}
        lstm, self.best_epochs = do_train(X_train, y_train, X_val, y_val, lstm_shape, lstm_settings,
            {}, batch_size=self.batch_size, lstm_type=self.lstm_type,
            do_fit1=do_fit1, epochs1=self.epochs, model1_path=model1_path, config1_path=config1_path,
            do_fit2=do_fit2, epochs2=self.epochs2, model2_path=model2_path, config2_path=config2_path,
            epoch_path=epoch_path)

        # NOTE(review): this assert fails after training whenever do_fit1 is
        # False, i.e. set 1 already existed with enough epochs (reachable with
        # force_fit set, or for an unfrozen run that only needs phase 2).  It
        # also makes the `if do_fit1:` guard below redundant.  Confirm intent.
        assert do_fit1
        # Each phase that was requested must have produced its files.
        if do_fit1:
            assert os.path.exists(model1_path), model1_path
            assert os.path.exists(config1_path), config1_path
        if do_fit2:
            assert os.path.exists(model2_path), model2_path
            assert os.path.exists(config2_path), config2_path

        print('****: best_epochs=%s - %s Add 1 to these' % (self.best_epochs, self.description))
        del lstm
 def evaluate_reductions(self, get_clf, predict_methods):
     """Evaluate every prediction method on each of the self.n splits.

     One AUC table (rows are splits, columns are labels) is kept per method
     in `predict_methods` plus the extra key 'BEST'.

     Returns (ok, tables, best_methods).  On the first failed split the
     result is (ok, {}, best_methods), where best_methods already includes
     the entry for the failed split.
     """
     all_methods = predict_methods + ['BEST']
     n_labels = len(LABEL_COLS)
     auc_reductions = {}
     for method in all_methods:
         auc_reductions[method] = np.zeros((self.n, n_labels), dtype=np.float64)

     best_methods = []
     for i in range(self.n):
         ok, reductions, best = self._evaluate_reductions(get_clf, i, predict_methods)
         best_methods.append(best)
         if not ok:
             return ok, {}, best_methods
         for method in all_methods:
             table = auc_reductions[method]
             table[i, :] = reductions[method]
             print('evaluate_reductions: method=%s' % method)
             show_auc(table[:i + 1, :])
     xprint('program=%s train=%s' % (sys.argv[0], dim(self.train)))
     return True, auc_reductions, best_methods
 def _get_paths(self, create_dir):
     """Return the model, config and epoch-record paths of this classifier.

     Args:
         create_dir: when true, create the model directory if it is missing.

     Returns:
         ``((model1_path, config1_path), (model2_path, config2_path),
         epoch_path)``, all located in ``get_model_dir(self.model_name, 0)``.

     The first call on an instance (while ``self._shown_paths`` is false)
     also logs every path together with whether it exists.
     """
     model_dir = get_model_dir(self.model_name, 0)
     if create_dir:
         os.makedirs(model_dir, exist_ok=True)
     # RocAucEvaluation saves the trainable part of the model
     model1_path = os.path.join(model_dir, 'model')
     config1_path = os.path.join(model_dir, 'config.json')
     model2_path = os.path.join(model_dir, 'model2')
     config2_path = os.path.join(model_dir, 'config2.json')
     epoch_path = os.path.join(model_dir, 'epochs.json')
     if not self._shown_paths:
         xprint('model1_path=%s exists=%s' % (model1_path, os.path.exists(model1_path)))
         xprint('config1_path=%s exists=%s' % (config1_path, os.path.exists(config1_path)))
         xprint('model2_path=%s exists=%s' % (model2_path, os.path.exists(model2_path)))
         # Log config2_path itself (this line used to print config1_path).
         xprint('config2_path=%s exists=%s' % (config2_path, os.path.exists(config2_path)))
         xprint('epoch_path=%s exists=%s' % (epoch_path, os.path.exists(epoch_path)))
         self._shown_paths = True
     return (model1_path, config1_path), (model2_path, config2_path), epoch_path
def show_scores(scores, force=False):
    """Log the ten best scores and write every score to 'all.results3.txt'.

    Unless `force` is true the call does nothing when `scores` is empty,
    when its length equals the module-level ``scores_len``, or when fewer
    than 60 clock seconds have passed since ``scores_t0``.

    Args:
        scores: list of ``(score, col_scores, params, desc)`` tuples.  It is
            sorted in place by descending score, ties broken by params.
        force: skip the throttling checks.

    Updates the globals ``scores_t0`` and ``scores_len``.
    """
    global scores_t0, scores_len

    # time.clock() was removed in Python 3.8.  Keep using it where it still
    # exists so readings stay on the same clock as a scores_t0 set elsewhere,
    # and fall back to perf_counter() otherwise.
    clock = getattr(time, 'clock', None) or time.perf_counter

    if not force:
        if not scores or len(scores) == scores_len:
            return
        if clock() < scores_t0 + 60.0:
            return
    scores_t0 = clock()
    scores_len = len(scores)

    scores.sort(key=lambda x: (-x[0], x[2]))
    xprint('!' * 80)
    with open('all.results3.txt', 'wt') as f:
        for i, (score, col_scores, params, desc) in enumerate(scores):
            line = '%4d: auc=%.3f %s %s %s' % (i, score, col_scores, params, desc)
            if i < 10:
                xprint(line)
            print(line, file=f)
Esempio n. 19
0
 def test():
     """Exercise DumpPrediction.process_command_line_args.

     The first two argument lists are expected to be accepted, so any
     exception they raise propagates to the caller.  The third list has no
     "-f" option; a ValueError raised for it is reported as the expected
     outcome.
     NOTE(review): if the third call raises nothing, the test passes
     silently -- confirm whether that should count as a failure.
     """
     utils.xprint("Test arg processing ...", newline=True)
     DumpPrediction.process_command_line_args([
         "-f", "'[case]2018-01-18-20-53-10'", "-e", "1", "-n", "'demo'",
         "-t", "'res/dump/demo'"
     ])
     DumpPrediction.process_command_line_args([
         "-f", "'[case]2018-01-18-20-53-10'", "-e", "1", "-n", "'demo'"
     ])
     try:
         DumpPrediction.process_command_line_args(
             ["-e", "1", "-n", "'demo'", "-t", "'res/dump'"])
     except ValueError as e:
         # `except X, e` is Python 2 only syntax and exceptions have no
         # `.message` attribute in Python 3, so fall back to the exception.
         detail = getattr(e, 'message', e)
         utils.xprint("""Exception correctly caught: "%s"...""" % detail,
                      newline=True)
def evaluate_params(evaluator, trial, n_hidden, dropout, max_features, learning_rate, maxlen, n_folds, embed_name,
    embed_size, n=1):
    """Evaluate one ClfLstmGlove configuration with `evaluator`.

    The random seed is set to ``trial + 1000`` before evaluation.  The
    classifier is always built with ``n_folds=1`` and the module-level
    ``epochs``; the `n_folds` and `n` arguments are not used.

    Returns ``(ok, auc, description)`` where `description` is ``str()`` of a
    freshly built classifier.
    """
    def get_clf():
        # Builds a fresh classifier on every call.
        return ClfLstmGlove(
            n_hidden=n_hidden, embed_name=embed_name, embed_size=embed_size,
            maxlen=maxlen, max_features=max_features, dropout=dropout,
            epochs=epochs, learning_rate=learning_rate, n_folds=1)

    rule_width = 80
    xprint('#' * rule_width)
    xprint(get_clf())
    seed_random(seed=trial + 1000)

    headline = 'evaluate_params(n_hidden=%d, dropout=%.3f, max_features=%d, learning_rate=%s' % (
        n_hidden, dropout, max_features, learning_rate)
    xprint(headline)
    xprint(get_clf())

    ok, auc = evaluator.evaluate(get_clf)
    xprint('=' * rule_width)
    description = str(get_clf())
    return ok, auc, description
def build_lstm1(embeddings, shape, settings):
    """Build the baseline Sequential model: one BiLSTM feeding a sigmoid layer.

    Layer order: non-trainable Embedding initialised from `embeddings` with
    ``mask_zero=True``, a bias-free per-timestep Dense, a bidirectional LSTM
    without ``return_sequences`` and a sigmoid Dense with shape['n_class']
    units.

    Returns the model; it is not compiled here.
    """
    model = Sequential()

    embedding = Embedding(
        embeddings.shape[0],
        embeddings.shape[1],
        input_length=shape['max_length'],
        trainable=False,
        weights=[embeddings],
        mask_zero=True,
    )
    model.add(embedding)

    projection = Dense(shape['n_hidden'], use_bias=False)
    model.add(TimeDistributed(projection))

    n_hidden = shape['n_hidden']
    rate = settings['dropout']
    recurrent = LSTM(n_hidden, recurrent_dropout=rate, dropout=rate)
    model.add(Bidirectional(recurrent))

    model.add(Dense(shape['n_class'], activation='sigmoid'))
    xprint('build_lstm1: embeddings=%s shape=%s' % (dim(embeddings), shape))
    return model
    def predict_reductions(self, test, predict_methods):
        """Predict on `test` with the saved model, once per reduction method.

        An unfrozen classifier uses the second artefact set when both of its
        files exist and otherwise falls back to the first (frozen) set.  The
        chosen files must exist.  The work is delegated to the module-level
        predict_reductions function, whose result is returned.
        """
        print('ClfSpacy.predict', '-' * 80)
        X_test = df_to_sentences(test)
        frozen_paths, unfrozen_paths, _ = self._get_paths(False)

        use_frozen = self.frozen
        if not use_frozen:
            model2_path, config2_path = unfrozen_paths
            have_unfrozen = os.path.exists(model2_path) and os.path.exists(config2_path)
            if not have_unfrozen:
                xprint('unfrozen but no improvement over frozen. Using frozen')
                use_frozen = True

        model_path, config_path = frozen_paths if use_frozen else unfrozen_paths

        assert os.path.exists(model_path), model_path
        assert os.path.exists(config_path), config_path

        return predict_reductions(
            model_path, config_path, use_frozen, X_test,
            methods=predict_methods, max_length=self.max_length)
def load_data():
    """Read the train, test and sample-submission CSVs from TOXIC_DATA_DIR.

    When ``get_n_samples()`` is positive, train and test are truncated to
    that many leading rows.  Missing comments in the COMMENT column of both
    frames are replaced with '_na_'.

    Returns:
        ``(train, test, subm)`` as pandas DataFrames.
    """
    train = pd.read_csv(join(TOXIC_DATA_DIR, 'train.csv'))
    test = pd.read_csv(join(TOXIC_DATA_DIR, 'test.csv'))
    subm = pd.read_csv(join(TOXIC_DATA_DIR, 'sample_submission.csv'))
    xprint('train,test,subm:', train.shape, test.shape, subm.shape)

    n_samples = get_n_samples()
    if n_samples > 0:
        # Copy so the column assignment below acts on an independent frame
        # rather than on a slice of the full one.
        train = train[:n_samples].copy()
        test = test[:n_samples].copy()

    seed_random()

    xprint('train=%d test=%d (%.1f%%)' % (len(train), len(test), 100.0 * len(test) / len(train)))

    # There are a few empty comments that we need to get rid of, otherwise sklearn will complain.
    # Assign the result back: `frame[col].fillna(..., inplace=True)` is chained
    # assignment, which warns in pandas 2.x and no longer changes the frame
    # under Copy-on-Write.
    train[COMMENT] = train[COMMENT].fillna('_na_')
    test[COMMENT] = test[COMMENT].fillna('_na_')

    return train, test, subm
def make_submission(get_clf, submission_name):
    """Fit a classifier on all training data and write one submission CSV.

    The target file is ``<submission_name>.<n_samples>.csv`` inside
    SUBMISSION_DIR and must not exist yet (checked with an assert before
    any training happens).
    """
    seed_random()
    file_name = '%s.%s.csv' % (submission_name, get_n_samples_str())
    submission_path = join(SUBMISSION_DIR, file_name)
    assert not os.path.exists(submission_path), submission_path
    os.makedirs(SUBMISSION_DIR, exist_ok=True)

    train, test, subm = load_data()
    clf = get_clf()
    clf.fit(train, test_size=0.0)
    pred = clf.predict(test)

    describe(pred)

    # Create the submission file: the id column followed by one prediction
    # column per label.
    frames = [
        pd.DataFrame({'id': subm['id']}),
        pd.DataFrame(pred, columns=LABEL_COLS),
    ]
    submission = pd.concat(frames, axis=1)
    submission.to_csv(submission_path, index=False)
    xprint('Saved in %s' % submission_path)
    shapes = (sys.argv[0], dim(train), dim(test), dim(submission))
    xprint('program=%s train=%s test=%s submission=%s' % shapes)
def build_lstm2(embeddings, shape, settings):
    """Build a Sequential model: BiLSTM followed by global max pooling.

    Layer order: non-trainable Embedding initialised from `embeddings`, a
    bias-free per-timestep Dense ('td2'), a bidirectional LSTM that returns
    sequences, GlobalMaxPool1D, BatchNormalization and a sigmoid Dense with
    shape['n_class'] units.

    Returns the model; it is not compiled here.
    """
    # A functional-API draft of this network (Input/Model with an extra
    # Dense(50, relu) and Dropout before the output) is not used; the
    # Sequential form below is the one that is built.
    model = Sequential()

    embedding = Embedding(
        embeddings.shape[0],
        embeddings.shape[1],
        input_length=shape['max_length'],
        trainable=False,
        weights=[embeddings],
        mask_zero=False,
    )
    model.add(embedding)

    projection = Dense(shape['n_hidden'], use_bias=False)
    model.add(TimeDistributed(projection, name='td2'))

    n_hidden = shape['n_hidden']
    rate = settings['dropout']
    recurrent = LSTM(n_hidden, return_sequences=True,
                     recurrent_dropout=rate, dropout=rate)
    model.add(Bidirectional(recurrent))

    model.add(GlobalMaxPool1D())
    model.add(BatchNormalization())
    model.add(Dense(shape['n_class'], activation='sigmoid'))
    xprint('build_lstm2: embeddings=%s shape=%s' % (dim(embeddings), shape))
    return model
 def pipe(self, docs, batch_size=1000, n_threads=-1):
     """Yield every doc of `docs` after attaching its reduced predictions.

     For each doc, features are built from ``doc.sents``, the model predicts
     one row per feature row, and for every method in ``self.methods`` the
     reduced vector is stored in ``doc.user_data[method]``.

     Args:
         docs: iterable of documents, consumed in chunks of `batch_size`.
         batch_size: chunk size passed to ``cytoolz.partition_all``.
         n_threads: accepted but not used by this method.

     Progress is logged when the doc count reaches a threshold that starts
     at 10 and doubles after each message, plus a final total line.
     """
     # time.clock() was removed in Python 3.8; use it where it still exists
     # and fall back to perf_counter() otherwise.
     clock = getattr(time, 'clock', None) or time.perf_counter

     interval = 10
     t0 = clock()
     i = 0
     k = 0
     for minibatch in cytoolz.partition_all(batch_size, docs):
         minibatch = list(minibatch)
         for doc in minibatch:
             Xs = get_features(doc.sents, self.max_length)
             ys = self._model.predict(Xs)
             if i >= interval:
                 xprint('SentimentAnalyser.pipe: %4d docs %5d sents %.1f sec' % (i, k, clock() - t0))
                 interval *= 2
             for method in self.methods:
                 y = reduce(ys, method=method)
                 # Each reduction collapses the rows to one value per output column.
                 assert len(y.shape) == 1 and len(y) == ys.shape[1], (ys.shape, y.shape)
                 doc.user_data[method] = y
             yield doc
             i += 1
             k += ys.shape[0]
     xprint('SentimentAnalyser.pipe: %4d docs %5d sents %.1f sec TOTAL' % (i, k, clock() - t0))
Esempio n. 27
0
    def dump_pan(frompath, nodename):
        """Write a '<node>.pan' file for one node or for every trace in `frompath`.

        Args:
            frompath: directory holding the '<node>.trace' files; the .pan
                files are written under it as well.
            nodename: node to process, or None to process every file in
                `frompath` whose name matches ``.*\\.trace``.

        Raises:
            ValueError: if no trace file is selected.

        For each node a Sampler is built with ``keep_positive=False``, and
        the first two items of ``sampler.pan_to_positive()`` are written
        tab-separated to the .pan file.
        """
        if nodename is not None:
            filenames = ['%s.trace' % nodename]
        else:
            # Raw string: '\.' in a plain literal is an invalid escape sequence.
            filenames = utils.filer.list_directory(frompath, r'.*\.trace')

        if len(filenames) == 0:
            raise ValueError(
                "No trace file is found under '%s' with node name '%s'." %
                (frompath, nodename))

        for filename in filenames:
            # Separate local so the `nodename` argument is not overwritten.
            node, _ = utils.filer.split_extension(filename)

            sampler = Sampler(path=frompath,
                              nodes=[node],
                              keep_positive=False)
            pan = sampler.pan_to_positive()

            panfile = '%s.pan' % node
            panfile = utils.filer.format_subpath(frompath,
                                                 panfile,
                                                 isfile=True)
            utils.filer.write(panfile,
                              '%s\t%s' % (pan[0], pan[1]),
                              mode='w')

            utils.xprint(
                "Pan information (%s) for node %s is dumped to '%s'." %
                (pan, node, panfile),
                newline=True)
def build_lstm8(embeddings, shape, settings):
    """Build a Sequential model that flattens the BiLSTM output instead of pooling it.

    Layer order: non-trainable Embedding initialised from `embeddings`
    ('eembed'), a bias-free per-timestep Dense, a bidirectional LSTM that
    returns sequences ('bidi'), Flatten, BatchNormalization, dropout at half
    of settings['dropout'] and a sigmoid Dense with shape['n_class'] units.

    Returns the model; it is not compiled here.
    """
    model = Sequential()

    embedding = Embedding(
        embeddings.shape[0],
        embeddings.shape[1],
        input_length=shape['max_length'],
        trainable=False,
        weights=[embeddings],
        mask_zero=False,
        name='eembed',
    )
    model.add(embedding)

    projection = Dense(shape['n_hidden'], use_bias=False, name='td8')
    model.add(TimeDistributed(projection))

    n_hidden = shape['n_hidden']
    rate = settings['dropout']
    recurrent = LSTM(n_hidden, return_sequences=True,
                     recurrent_dropout=rate, dropout=rate)
    model.add(Bidirectional(recurrent, name='bidi'))

    model.add(Flatten(name='flaaten'))
    model.add(BatchNormalization())
    model.add(Dropout(settings['dropout'] / 2.0))
    model.add(Dense(shape['n_class'], activation='sigmoid'))
    xprint('build_lstm8: embeddings=%s shape=%s' % (dim(embeddings), shape))
    return model
    def _evaluate(self, get_clf, i, do_clips=False):
        """Fit a fresh classifier on split `i` and score it per label.

        Args:
            get_clf: zero-argument callable returning a new classifier.
            i: index into ``self.shuffled_indexes`` selecting the split.
            do_clips: when true, also log the AUC obtained after clipping
                the predictions to ``[0, 1 - delta]`` for each delta in
                CLIPS.

        Returns:
            ``(ok, auc)``.  `auc` holds one ROC-AUC per label.  When fitting
            or predicting raises, the exception is logged and the result is
            ``(False, zeros)``.
        """
        xprint('_evaluate %3d of %d  %s' % (i, self.n, '-' * 66))
        assert 0 <= i < len(self.shuffled_indexes), (i, self.n, len(self.shuffled_indexes))
        train_part, test_part = split_data(self.train, self.shuffled_indexes[i], self.frac)

        CLIPS = [0.0, 1.0e-6, 1.0e-5, 1.0e-4, 1.0e-3, 1.0e-2, 0.1, 0.2, 0.3, 0.5, 0.8, 0.9]
        auc = np.zeros(len(LABEL_COLS), dtype=np.float64)

        # time.clock() was removed in Python 3.8.  Called inside the try
        # below, the resulting AttributeError would be caught and reported as
        # a classifier failure, so the timer is resolved up front.
        clock = getattr(time, 'clock', None) or time.perf_counter

        clf = None
        try:
            clf = get_clf()
            t0 = clock()
            clf.fit(train_part)
            print('_evaluate %d fit duration=%.1f sec' % (i, clock() - t0))
            t0 = clock()
            pred = clf.predict(test_part)
            print('_evaluate %d predict duration=%.1f sec' % (i, clock() - t0))
            print('!!! _evaluate pred=%s' % dim(pred))
        except Exception as e:
            xprint('!!! _evaluate, exception=%s' % e)
            return False, auc

        if do_clips:
            # Diagnostic only: the AUC returned is recomputed below from the
            # unclipped predictions.
            for k, delta in enumerate(CLIPS):
                auc = np.zeros(len(LABEL_COLS), dtype=np.float64)
                for j, col in enumerate(LABEL_COLS):
                    y_true = test_part[col]
                    y_pred = np.clip(pred[:, j], 0.0, 1.0 - delta)
                    auc[j] = roc_auc_score(y_true, y_pred)
                mean_auc = auc.mean()
                xprint('%5d: %d: delta=%6g auc=%.5f %s' % (i, k, delta, mean_auc, label_score(auc)))

        auc = np.zeros(len(LABEL_COLS), dtype=np.float64)
        for j, col in enumerate(LABEL_COLS):
            y_true = test_part[col]
            y_pred = pred[:, j]
            auc[j] = roc_auc_score(y_true, y_pred)
        mean_auc = auc.mean()
        xprint('%5d: auc=%.3f %s' % (i, mean_auc, label_score(auc)))
        describe(pred)
        show_best_worst(test_part, pred, n=3, do_best=False)

        if clf is not None:
            del clf
        return True, auc
def display_results(completed_tests, do_max, n_rank):
    """Rank the classifiers in `completed_tests` per label and overall.

    Args:
        completed_tests: dict mapping a classifier description to its list
            of runs; element 1 of each run is read as that run's per-label
            AUC values.
        do_max: reduce the runs of a classifier with max when true, with
            mean otherwise.
        n_rank: number of top classifiers to print per ranking.

    Returns:
        defaultdict mapping the winning classifier description to a list of
        ``(column, auc)`` tuples, one per ranking it won.  Rankings are made
        for every label in LABEL_COLS plus 'ALL' (mean over the labels).

    Classifiers whose reduced AUC vector matches an earlier one to within
    1e-6 on every label are treated as duplicates and skipped.
    """
    # Raises ValueError for an empty `completed_tests`, same as before.
    n_runs = min(len(v) for v in completed_tests.values())

    l1 = set(completed_tests)
    # NOTE(review): this builds a copy of l1, so the assert can never fail.
    # Presumably simplify(clf_str) was intended, to detect keys that collide
    # after simplification -- confirm.
    l2 = {(clf_str) for clf_str in l1}
    assert len(l1) == len(l2), sorted(l1 - l2)

    clf_auc = {}
    for clf_str, runs in completed_tests.items():
        clf_str = simplify(clf_str)
        n_runs = min(len(v) for v in runs)
        runs = runs[:n_runs]
        auc = np.zeros((n_runs, len(LABEL_COLS)), dtype=np.float64)
        for i, v in enumerate(runs):
            auc[i, :] = np.array(v[1], dtype=np.float64)
            # Recomputed on every pass; the value left after the last run is
            # the one used below.
            reduced_auc = auc.max(axis=0) if do_max else auc.mean(axis=0)

        # The tolerance has to be applied element-wise.  The earlier form
        # `np.abs(d).any() > 1e-6` compared a bool with the tolerance, so
        # only bit-identical vectors were recognised as duplicates.
        duplicate = any(
            not (np.abs(reduced_auc - previous) > 1e-6).any()
            for _, previous in clf_auc.values()
        )
        if duplicate:
            continue
        clf_auc[clf_str] = (n_runs, reduced_auc)

    best = defaultdict(list)
    for j, col in enumerate(LABEL_COLS + ['ALL']):
        xprint('#' * 100)
        method = 'MAX' if do_max else 'MEAN'
        xprint('RESULTS SUMMARY: %d - %d:%s %s %d' % (len(clf_auc), j, col, method, n_runs))
        if col == 'ALL':
            clf_order = sorted(clf_auc, key=lambda k: -clf_auc[k][1].mean())
        else:
            clf_order = sorted(clf_auc, key=lambda k: -clf_auc[k][1][j])
        clf0 = clf_order[0]
        if col == 'ALL':
            best[clf0].append((col, clf_auc[clf0][1].mean()))
        else:
            best[clf0].append((col, clf_auc[clf0][1][j]))
        for i, clf in enumerate(clf_order[:n_rank]):
            n_runs, auc = clf_auc[clf]
            xprint('auc=%.4f %3d: %s %s' % (auc.mean(), i, auc, clf))

    return best
Esempio n. 31
0
def _init_config_():
    """
    Load the 'default' config group, then the 'debug' or 'run' group.

    The second group is 'debug' when __debug__ is true and 'run' otherwise.
    The text returned by each update is printed.  Exceptions propagate
    unchanged.
    """
    mode_group = 'debug' if __debug__ else 'run'
    for group in ('default', mode_group):
        echo = config.update_config_from_file(PATH_CONFIG, group=group)
        utils.xprint(echo, newline=True)
def beam_search(list_list, beam_size=3, n=1):
    """Run a beam search over parameter combinations, scoring each by AUC.

    For every position `k` in `list_list`, each entry currently in the beam is
    combined (via `blend`) with each candidate value in `list_list[k]`.
    Combinations that were already scored, or that `valid_embedding_params`
    rejects, are skipped. The remaining ones are evaluated with `get_auc`;
    successful evaluations are scored with `auc_score` and recorded. After each
    position the recorded scores are sorted best-first and the top `beam_size`
    parameter tuples become the new beam.

    Progress is reported through `print`, `xprint` and `show_scores`; nothing
    is returned.

    :param list_list: sequence of candidate-value lists, one list per position
    :param beam_size: number of best parameter tuples kept after each position
    :param n: forwarded to `Evaluator(n=n)`
    """
    xprint('-' * 80)
    xprint('beam_search:')

    evaluator = Evaluator(n=n)

    scores = []
    # Start from a single empty parameter tuple.
    beam = [tuple()]
    # Parameter tuples already evaluated -> their per-column scores.
    params_auc = {}

    trial = 0
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed time.
    t0 = time.perf_counter()

    for k, klist in enumerate(list_list):
        for bval in beam:
            for kval in klist:
                params = blend(bval, k, kval)
                if params in params_auc:
                    continue
                if not valid_embedding_params(*params):
                    continue
                print('###', len(params), params)
                ok, auc, desc = get_auc(evaluator, trial, params)
                if not ok:
                    print('&&& Exception in classifier')
                    continue
                print('^^^ trial=%d duration=%.1f sec' % (trial, time.perf_counter() - t0))
                score, col_scores = auc_score(auc)
                scores.append((score, col_scores, params, desc))
                params_auc[params] = col_scores
                # Only successful evaluations count as trials.
                trial += 1
                show_scores(scores)
        # Highest score first; ties are broken by the parameter tuple itself.
        scores.sort(key=lambda x: (-x[0], x[2]))
        beam = [params for _, _, params, _ in scores[:beam_size]]
        show_scores(scores, force=True)
        xprint(k, '|' * 80)
def get_clf46():
    """Build the ClfSpacy configuration #46.

    `epochs`, `frozen`, `lstm_type` and `predict_method` are read from module
    globals at call time; all other settings are fixed here.
    """
    settings = dict(
        # Shape
        n_hidden=512,
        max_length=75,
        # General NN config
        dropout=0.3,
        learn_rate=0.0005,
        epochs=epochs,
        batch_size=300,
        frozen=frozen,
        lstm_type=lstm_type,
        predict_method=predict_method,
    )
    return ClfSpacy(**settings)


# Sweep definition: classifier factories, LSTM type ids and `frozen` flags to combine.
clf_list = [get_clf45, get_clf40, get_clf43, get_clf44, get_clf46, get_clf41, get_clf42]
lstm_list = [10, 9]
frozen_list = [True]

xprint_init('%s.%s' % (submission_name, get_n_samples_str()), False)
auc_list = []
# Previously recorded runs, looked up below by classifier description string.
# NOTE(review): the `{}` argument is presumably the default when no summary exists - confirm.
completed_tests = load_json(run_summary_path, {})
xprint('run_summary_path=%s' % run_summary_path)
n_completed0 = len(completed_tests)

# Three passes over the whole sweep; pass `n_runs0` skips any configuration that
# already has more than `n_runs0` recorded runs.
for n_runs0 in range(3):
    print('n_completed0=%d n_runs0=%d' % (n_completed0, n_runs0))
    for get_clf in clf_list:
        # `lstm_type`, `frozen` and `predict_method` must keep these exact names:
        # the factories (e.g. get_clf46) read them as module globals.
        for lstm_type in lstm_list:
            for frozen in frozen_list:
                xprint('#' * 80)
                predict_method = PREDICT_METHODS_GOOD[0]
                # The string form of the classifier is the key into completed_tests.
                clf_str = str(get_clf())
                xprint(clf_str)
                runs = completed_tests.get(clf_str, [])
                if len(runs) > n_runs0:
                    xprint('skipping runs=%d n_runs0=%d' % (len(runs), n_runs0))
                    continue
Esempio n. 34
0
File: main.py Progetto: mingsxs/lky
def post_run():
    """Dump `resDataFrame` as the 'result' CSV and print a completion message."""
    utils.csv_dump(resDataFrame, ftype='result')

    done_message = os.linesep + 'Done!'
    utils.xprint(done_message)
Esempio n. 35
0
            ])
        except:
            raise
        try:
            DumpPrediction.process_command_line_args(
                ["-e", "1", "-n", "'demo'", "-t", "'res/dump'"])
        except ValueError, e:
            utils.xprint("""Exception correctly caught: "%s"...""" % e.message,
                         newline=True)
        try:
            DumpPrediction.process_command_line_args([
                "-f", "'[case]2018-01-18-20-53-10'", "-e", "invalid-epoch",
                "-n", "'demo'"
            ])
        except ValueError, e:
            utils.xprint("""Exception correctly caught: "%s"...""" % e.message,
                         newline=True)

        utils.xprint("Fine.", newline=True)

        utils.xprint("Test dumping ...", newline=True)
        try:
            DumpPrediction.dump_predictions('log/',
                                            'log/[case]2018-01-18-20-53-10', 0,
                                            'demo0', 'res/dump/demo')
            DumpPrediction.dump_predictions('log/',
                                            '[case]2018-01-18-20-53-10', 1,
                                            'demo1', 'res/dump/demo')
            DumpPrediction.dump_predictions('log/', '2018-01-18-20-53-10', 2,
                                            'demo2')
        except:
            raise
    return ClfSpacy(n_hidden=512, max_length=75,  # Shape
                    dropout=0.5, learn_rate=0.005,  # General NN config
                    epochs=epochs, batch_size=150, frozen=frozen,
                    lstm_type=lstm_type, predict_method=predict_method)




# Sweep definition: classifier factories, LSTM type ids and `frozen` flags to combine.
clf_list = [get_clf22, get_clf23, get_clf24, get_clf25]
lstm_list = [6, 7, 8, 9]
frozen_list = [True]

xprint_init('%s.%s' % (submission_name, get_n_samples_str()), False)
auc_list = []
# Previously recorded runs, looked up below by classifier description string.
# NOTE(review): the `{}` argument is presumably the default when no summary exists - confirm.
completed_tests = load_json(run_summary_path, {})
xprint('run_summary_path=%s' % run_summary_path)
n_completed0 = len(completed_tests)

# Three passes over the whole sweep; pass `n_runs0` skips any configuration that
# already has more than `n_runs0` recorded runs.
for n_runs0 in range(3):
    print('n_completed0=%d n_runs0=%d' % (n_completed0, n_runs0))
    # Here the LSTM type is the outer loop and the classifier factory the inner one.
    # NOTE(review): `lstm_type`, `frozen` and `predict_method` are presumably read as
    # module globals by the get_clf* factories - keep these names.
    for lstm_type in lstm_list:
        for get_clf in clf_list:
            for frozen in frozen_list:
                xprint('#' * 80)
                predict_method = PREDICT_METHODS_GOOD[0]
                # The string form of the classifier is the key into completed_tests.
                clf_str = str(get_clf())
                xprint(clf_str)
                runs = completed_tests.get(clf_str, [])
                if len(runs) > n_runs0:
                    xprint('skipping runs=%d n_runs0=%d' % (len(runs), n_runs0))
                    continue
Esempio n. 37
0
    def dump_predictions(rootpath, frompath, epoch, dumpname, topath=None):
        """
        Copy the predictions logged for one epoch into trace files.

        The log folder is resolved with `DumpPrediction.find_logpath` and then
        narrowed to the sub-folder named by the 'path_compare' config entry.
        Predictions are read from `train-epoch<N>.log` and `test-epoch<N>.log`
        and saved as `<dumpname>.train.trace` and `<dumpname>.test.trace`.
        When both are available, the contents of the two trace files are also
        written to `<dumpname>.full.trace`. When only one side is available a
        warning is issued for the other.

        :param rootpath: Root folder handed to `DumpPrediction.find_logpath`
        :param frompath: Log folder / the identifier of an execution
        :param epoch: <int> epoch entry
        :param dumpname: Filename stem for dumped files e.g. node id
        :param topath: Destination path for dumping
        :raises IOError: when neither training nor testing predictions are found
        """
        filer = utils.filer

        log_dir = DumpPrediction.find_logpath(rootpath, frompath)
        log_dir = filer.format_subpath(
            log_dir, subpath=utils.get_config('path_compare'), isfile=False)

        # Source log names are built first, then the destination trace names.
        train_log = 'train-epoch%d.log' % epoch
        test_log = 'test-epoch%d.log' % epoch

        train_trace = '%s.train.trace' % dumpname
        test_trace = '%s.test.trace' % dumpname
        full_trace = '%s.full.trace' % dumpname

        train_trace = filer.format_subpath(topath, train_trace)
        test_trace = filer.format_subpath(topath, test_trace)
        full_trace = filer.format_subpath(topath, full_trace)

        # --- training side ---
        path_train = filer.format_subpath(
            log_dir, subpath=train_log, isfile=True)
        pred_train = DumpPrediction.read_predictions_from_compare_file(
            path_train)
        has_train = pred_train is not None
        if has_train:
            DumpPrediction.save_triples_to_file(pred_train, train_trace)
            utils.xprint(
                "Training predictions in '%s' are dumped to '%s'." %
                (path_train, train_trace),
                newline=True)
        else:
            utils.warn("Cannot find file '%s' (for training epoch %d)." %
                       (path_train, epoch))

        # --- testing side ---
        path_test = filer.format_subpath(
            log_dir, subpath=test_log, isfile=True)
        pred_test = DumpPrediction.read_predictions_from_compare_file(
            path_test)

        if pred_test is None:
            if not has_train:
                # Both are unavailable
                raise IOError("Cannot find file '%s' & '%s'." %
                              (path_train, path_test))
            utils.warn("Cannot find file '%s' (for testing epoch %d)." %
                       (path_test, epoch))
            return

        DumpPrediction.save_triples_to_file(pred_test, test_trace)
        utils.xprint(
            "Testing predictions in '%s' are dumped to '%s'." %
            (path_test, test_trace),
            newline=True)

        if not has_train:
            return

        # Both are available
        filer.write(full_trace, filer.read(train_trace))
        filer.write(full_trace, filer.read(test_trace))
        utils.xprint(
            "Full predictions in '%s' & '%s' are dumped to '%s'." %
            (path_train, path_test, full_trace),
            newline=True)
    SpaCy deep_learning_keras.py solution to Kaggle Toxic Comment challenge
"""
from utils import xprint_init, xprint
from framework import Evaluator, set_random_seed, make_submission, set_n_samples
from clf_spacy import ClfSpacy


# Name passed to xprint_init() and make_submission() below.
submission_name = 'spacy_lstm10'
# True: build a submission; False: run Evaluator(n=3) on the classifier instead.
do_submission = True
# Number of epochs; read by get_clf() when it builds the classifier.
epochs = 6


def get_clf():
    """Build the ClfSpacy classifier used by this script.

    `epochs` is read from the module global at call time; every other setting
    is fixed here.
    """
    settings = dict(
        # Shape
        n_hidden=128,
        max_length=100,
        # General NN config
        dropout=0.5,
        learn_rate=0.001,
        epochs=epochs,
        batch_size=150,
        frozen=True,
        lstm_type=2,
    )
    return ClfSpacy(**settings)


# Script driver: set up xprint, print the classifier configuration, fix the seed,
# then either build a submission or evaluate the classifier.
xprint_init(submission_name, do_submission)
xprint('#' * 80)
xprint(get_clf())
# The seed is set after the configuration is printed and before any submission/evaluation.
set_random_seed(seed=1234)

if do_submission:
    # The factory itself (not an instance) is handed over.
    make_submission(get_clf, submission_name)
else:
    evaluator = Evaluator(n=3)
    # `ok` and `auc` are not used again in the visible part of this script.
    ok, auc = evaluator.evaluate(get_clf)
xprint('$' * 80)
def set_random_seed(seed):
    """Log `seed`, store it in `_random_seed[0]`, then call `seed_random()`.

    :param seed: seed value; it is formatted with `%d` for the log line
    """
    announcement = 'set_random_seed: seed=%d' % seed
    xprint(announcement)
    _random_seed[0] = seed
    seed_random()