Example no. 1
def test_vgg16():

    ds = DataSet(data_path='data/',
                 shuffle=True,
                 balance=True,
                 categorical=True,
                 padd=True,
                 combine=3)

    r = VGG16_net()

    epochs_count = 10

    history, net, clf = train(ds.X_train,
                              ds.y_train,
                              ds.X_test,
                              ds.y_test,
                              r,
                              epochs=epochs_count)

    #plot_training(history, net, epochs_count)

    if predict(ds.X_test, ds.y_test, net):
        full_ds = DataSet(data_path='data/', padd=True, combine=3)

        #submission(net, None, full_ds, network=True, trained=True)
        submmiss_clf(net, clf, full_ds)
Example no. 2
def train_test_model(log_dir, hparams: dict):
    dataset = DataSet(fraction=1.0)
    optimiser = getattr(tf.keras.optimizers, hparams['optimizer'])
    schedule = scheduler(hparams, dataset)

    model = SequentialCNN(input_shape=dataset.input_shape(),
                          output_shape=dataset.output_shape())

    model.compile(loss=tf.keras.losses.categorical_crossentropy,
                  optimizer=optimiser(learning_rate=hparams['learning_rate']),
                  metrics=['accuracy'])

    history = model.fit(
        dataset.data['train_X'],
        dataset.data['train_Y'],
        batch_size=hparams["batch_size"],
        epochs=250,
        verbose=False,
        validation_data=(dataset.data["valid_X"], dataset.data["valid_Y"]),
        callbacks=[
            EarlyStopping(monitor='val_loss',
                          mode='min',
                          verbose=1,
                          patience=hparams['patience']),
            schedule,
            tf.keras.callbacks.TensorBoard(log_dir=log_dir,
                                           histogram_freq=1),  # log metrics
            hp.KerasCallback(log_dir, hparams)  # log hparams
        ])
    print({key: value[-1] for key, value in history.history.items()})
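
A minimal, hypothetical driver loop for train_test_model above. The grid values, the key names, and the logs/hparam_search directory are illustrative assumptions, not part of the original script; the hparams keys simply match the ones the function reads.

import itertools
import os

# Illustrative hyperparameter grid; keys match those consumed by train_test_model above.
grid = {
    'optimizer':     ['Adam', 'SGD'],
    'scheduler':     ['plateau10', 'CyclicLR_triangular'],
    'learning_rate': [0.001, 0.01],
    'batch_size':    [128],
    'patience':      [10],
}

for i, values in enumerate(itertools.product(*grid.values())):
    hparams = dict(zip(grid.keys(), values))
    run_dir = os.path.join('logs', 'hparam_search', 'run-{}'.format(i))  # hypothetical path
    train_test_model(run_dir, hparams)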
Example no. 3
def main():
    ds = DataSet()
    with open(INPUT_DATA_FILE, "r", newline='', encoding="utf8") as csv_file:
        ds.extract_from_csv(csv_file)

    # X_train, X_test, y_train, y_test = train_test_split(ds.X, ds.y, test_size=0.3, random_state=1)
    clf = MLPClassifier(solver='lbfgs', max_iter=50, alpha=1e-6, hidden_layer_sizes=10000, random_state=1)
    # classifier = clf.fit(X_train, y_train)
    print(cross_val_score(clf, ds.X, ds.y, cv=10, n_jobs=-1))
Example no. 4
def execute_online_pipeline(
        input_file, system_name, features,
        training_start_date, validation_start_date, test_start_date):
    """Execute one independent run (the data is loaded again)."""
    utils.collect_garbage()

    time_label = (
        str(validation_start_date.date()) + ' to ' +
        str(test_start_date.date())
    )

    queuelogger.set_context(time_label, system_name)

    data = loading.load_df(input_file, featurelist.get_columns(features))

    # The revision ids computed here slightly differ from the values in the
    # file constants.py. However, both computations result in exactly the same
    # training and validation set. The reason for different revision ids is
    # that the corpus does not contain bot revisions while the revision ids in
    # the constants file include bot revisions.
    training_start_index = DataSet.get_index_for_date_from_df(data, training_start_date)
    validation_start_index = DataSet.get_index_for_date_from_df(data, validation_start_date)
    test_start_index = DataSet.get_index_for_date_from_df(data, test_start_date)

#     _logger.debug('Training start revisionId: %s' % str(data.loc[training_start_index]['revisionId']))
#     _logger.debug(data.loc[training_start_index-5:training_start_index+5,['revisionId', 'timestamp']])
#     _logger.debug('Validation start revisionId: %s' % str(data.loc[validation_start_index]['revisionId']))
#     _logger.debug(data.loc[validation_start_index-5:validation_start_index+5,['revisionId', 'timestamp']])
#     _logger.debug('Test start revisionId: %s' % str(data.loc[test_start_index]['revisionId']))
#     _logger.debug(data.loc[test_start_index-5:test_start_index+5,['revisionId', 'timestamp']])

    data = data[0: test_start_index]  # preprocessing transformation does not have to be applied to the whole data set
    fit_slice = slice(0, validation_start_index)

    data = preprocessing.fit_transform(
        time_label, system_name, data, features, fit_slice)

    training = data[training_start_index:validation_start_index]
    validation = data[validation_start_index:test_start_index]

    if validation.get_system_name() == 'WDVD':
        metrics = classification.bagging_and_multiple_instance(
            training, validation, print_results=False)
    else:
        metrics = classification.default_random_forest(
            training, validation, print_results=False)

    metrics = metrics.reorder_levels(['Dataset', 'System', 'Classifier'])

    metrics[('ALL', 'VANDALISM_FRACTION')] = validation.get_vandalism_fraction()

    _print_metrics(metrics)
Example no. 5
    def setUp(self):
        self.init_column_names = ["feat_001", "X-coord", "h,std,dev", "Grade"]
        self.init_features = [[0., 0.1, 3.], [0., 0.2, 0.], [0., 0.3, 0.5]]
        self.init_classes = ["G3", "G1", "G1"]

        self.expected_extracted_column_names = self.init_column_names
        self.expected_extracted_features = [[1.7, 3., 0.09], [-5., -1.12, 0.]]
        self.expected_extracted_classes = ["G2", "G3"]

        self.data_set_dir = path.join(path.dirname(__file__), "data_sets")

        self.data_set = DataSet(X=self.init_features,
                                y=self.init_classes,
                                col_names=self.init_column_names)
Example no. 6
def process_data(method, path='data/'):

    print('Starting the process')

    ds = DataSet(method, full=True)

    folder_path = path + str(method.__name__)

    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    print(ds.X_train.shape)
    print(ds.X_test.shape)
    print(ds.y_train.shape)

    np.save(os.path.join(folder_path, 'X_train.npy'), ds.X_train)
    np.save(os.path.join(folder_path, 'X_test.npy'), ds.X_test)

    y_train = ds.mapper.inverse_transform(ds.y_train)

    with open(os.path.join(folder_path, 'y_train.csv'), 'w') as file:
        file.write('id,scene_label\n')

        for n, label in enumerate(y_train):
            file.write('{:d},{:s}\n'.format(n, label))

    print('\nFiles have been saved into: {}'.format(folder_path))
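
As a quick sanity check, the artifacts written by process_data can be read back as sketched below. The mean_over_time folder name is an illustrative stand-in for method.__name__; the rest follows the layout produced above.

import os

import numpy as np
import pandas as pd

folder_path = 'data/mean_over_time'  # illustrative: path + method.__name__ as built above

X_train = np.load(os.path.join(folder_path, 'X_train.npy'))
X_test = np.load(os.path.join(folder_path, 'X_test.npy'))
y_train = pd.read_csv(os.path.join(folder_path, 'y_train.csv'))  # columns: id, scene_label

print(X_train.shape, X_test.shape, y_train.shape)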
Example no. 7
    def get_models_data(self):

        interval = np.linspace(1, .1, num=self.__interval_n, dtype=float)

        container = dict()
        for model in self.__models:
            container[str(model)] = ModelData(model, self.__test_n)

        for method in self.__methods:
            for inter in interval:
                train_size = round(inter * 0.8, 3)
                test_size = round(inter * 0.2, 3)

                ds = DataSet(method,
                             shuffle_data=True,
                             test_size=test_size,
                             train_size=train_size)
                results = evaluate(deepcopy(models),
                                   ds,
                                   n=self.__test_n,
                                   debug=False)

                for result in results:
                    container[result] += {method.__name__: results[result]}

        return container
Example no. 8
def scheduler(hparams: dict, dataset: DataSet):
    if hparams['scheduler'] == 'constant':
        return LearningRateScheduler(lambda epocs: hparams['learning_rate'],
                                     verbose=False)

    if hparams['scheduler'] == 'linear_decay':
        return LearningRateScheduler(
            lambda epocs: max(hparams['learning_rate'] *
                              (10. / (10. + epocs)), min_lr(hparams)),
            verbose=False)

    if (hparams['scheduler'].startswith('CyclicLR')
            or hparams['scheduler'] in ["triangular", "triangular2", "exp_range"]):
        # DOCS: https://www.datacamp.com/community/tutorials/cyclical-learning-neural-nets
        # CyclicLR_triangular, CyclicLR_triangular2, CyclicLR_exp_range
        mode = re.sub(r'^CyclicLR_', '', hparams['scheduler'])

        # step_size should be epoc multiple between 2 and 8, but multiple of 2 (= full up/down cycle)
        if hparams['patience'] <= 6:
            whole_cycles = 1  #  1/2   = 0.5  | 6/2    = 3
        elif hparams['patience'] <= 12:
            whole_cycles = 2  #  8/4   = 2    | 12/4   = 3
        elif hparams['patience'] <= 24:
            whole_cycles = 3  # 14/6   = 2.3  | 24/6   = 4
        elif hparams['patience'] <= 36:
            whole_cycles = 4  # 26/8   = 3.25 | 36/8   = 4.5
        elif hparams['patience'] <= 48:
            whole_cycles = 5  # 28/10  = 2.8  | 48/10  = 4.8
        elif hparams['patience'] <= 72:
            whole_cycles = 6  # 50/12  = 4.2  | 72/12  = 6
        elif hparams['patience'] <= 96:
            whole_cycles = 8  # 74/16  = 4.6  | 96/16  = 6
        else:
            whole_cycles = 12  # 100/24 = 4.2  | 192/24 = 8

        return CyclicLR(mode=mode,
                        step_size=dataset.epoc_size() * (hparams['patience'] /
                                                         (2.0 * whole_cycles)),
                        base_lr=min_lr(hparams),
                        max_lr=hparams['learning_rate'])

    if hparams['scheduler'].startswith('plateau'):
        factor = int((re.findall(r'\d+', hparams['scheduler']) +
                      [10])[0])  # plateau2      || plateau10 (default)
        if 'sqrt' in hparams['scheduler']:
            patience = math.sqrt(
                hparams['patience'])  # plateau2_sqrt || plateau10_sqrt
        else:
            patience = hparams['patience'] / 2.0

        return ReduceLROnPlateau(
            monitor='val_loss',
            factor=1 / factor,
            patience=math.floor(patience),
            # min_lr   = min_lr(hparams),
            verbose=False,
        )

    print("Unknown scheduler: ", hparams)
Example no. 9
def submission(model, method):
    """Trains the model with the full data set and saves the predicted labels to a submission file."""

    ds = DataSet(method, full=True)

    model.fit(ds.X_train, ds.y_train)
    y_pred = model.predict(ds.X_test)
    save_submission(str(model), ds.mapper.inverse_transform(y_pred))
Example no. 10
def get_splitting_indices(data, use_test_set):
    training_set_start = constants.TRAINING_SET_START

    if use_test_set:
        validation_set_start = constants.TEST_SET_START
        test_set_start = constants.TAIL_SET_START
    else:
        validation_set_start = constants.VALIDATION_SET_START
        test_set_start = constants.TEST_SET_START

    # transform revision id to index in data set
    training_set_start = DataSet.get_index_for_revision_id_from_df(
        data, training_set_start)
    validation_set_start = DataSet.get_index_for_revision_id_from_df(
        data, validation_set_start)
    test_set_start = DataSet.get_index_for_revision_id_from_df(
        data, test_set_start)

    return training_set_start, validation_set_start, test_set_start
Example no. 11
def main():
    ds = DataSet()
    with open(INPUT_DATA_FILE, "r", newline='', encoding="utf8") as csv_file:
        ds.extract_from_csv(csv_file)

    print("Ranking (descending)", ds.create_features_ranking(use_names=True))

    experiment_results = {}
    final_counter = Counter()

    for layer_size in HIDDEN_LAYER_SIZES:
        experiment_results[layer_size] = {}
        for n_features in range(1, ds.number_of_features, 1):
            result = run_experiment(ds.X, ds.y, hidden_layer_size=layer_size, n_features=n_features)
            experiment_results[layer_size][n_features] = result
            final_counter.update(result.counter)
            print_result(result, layer_size, n_features)

    print("\nNum of times features were selected: {}".format(final_counter))

    generate_plots(experiment_results, ds.number_of_features, ds.col_names, final_counter)
Example no. 12
def test_rnn():

    ds = DataSet(features.mfcc_spec,
                 data_path='data/',
                 shuffle=True,
                 balance=True,
                 categorical=True)

    r = RNN1()

    epochs_count = 1

    history, net, clf = train(ds.X_train,
                              ds.y_train,
                              ds.X_test,
                              ds.y_test,
                              r,
                              epochs=epochs_count)

    plot_training(history, net, epochs_count)

    if predict(ds.X_test, ds.y_test, net):
        ds_sub = DataSet(full=True)
        submission(net, None, ds_sub, network=True, trained=True)
Example no. 13
def build_dataset(df, y):
    _logger.debug('building dataset...')

    _logger.debug('slicing...')

    _logger.debug('meta...')
    n_meta = len(featurelist.get_meta_list())
    new_meta = df.iloc[:, 0:n_meta]

    _logger.debug('X...')
    new_X = df.iloc[:, n_meta:].values  # takes a looong time
    features = df.columns[n_meta:]

    _logger.debug('y...')
    new_Y = y.values

    utils.collect_garbage()

    _logger.debug('dataset...')
    new_data = DataSet()

    _logger.debug('set_meta...')
    new_data.set_meta(new_meta)

    _logger.debug('set_X...')
    new_data.set_X(new_X)

    _logger.debug('set_Y...')
    new_data.set_Y(new_Y)

    _logger.debug('set_features...')
    new_data.set_features(features)

    _logger.debug('building dataset...done.')

    return new_data
Example no. 14
    def compute_data_frame(data):
        _logger.debug("Splitting statistics...")
        training_set_start_index = 0  # compute statistics from start of dataset
        validation_set_start_index = \
            DataSet.get_index_for_revision_id_from_df(data, constants.VALIDATION_SET_START)
        test_set_start_index = \
            DataSet.get_index_for_revision_id_from_df(data, constants.TEST_SET_START)
        tail_set_start_index = \
            DataSet.get_index_for_revision_id_from_df(data, constants.TAIL_SET_START)

        training_set = data[
            training_set_start_index:validation_set_start_index]
        validation_set = data[validation_set_start_index:test_set_start_index]
        test_set = data[test_set_start_index:tail_set_start_index]

        result = []
        result.append(
            compute_splitting_statistics_row(training_set, 'Training'))
        result.append(
            compute_splitting_statistics_row(validation_set, 'Validation'))
        result.append(compute_splitting_statistics_row(test_set, 'Test'))

        result = pd.concat(result, axis=0)
        return result
Example no. 15
    def test_should_raise_error_on_feature_size_mismatch(self):
        with self.assertRaises(RuntimeError) as cm:
            DataSet(X=[[1], [1, 2]])
        with self.assertRaises(RuntimeError):
            DataSet(y=[1])
        with self.assertRaises(RuntimeError):
            DataSet(X=[[1], [1, 2]], y=[1])
        with self.assertRaises(RuntimeError):
            DataSet(X=[[1], [1, 2]], y=[1, 1])
        with self.assertRaises(RuntimeError):
            DataSet(X=[[1], [1, 2]], y=[1, 1, 1])
        with self.assertRaises(RuntimeError):
            DataSet(col_names=["a"])
        with self.assertRaises(RuntimeError):
            DataSet(X=[[1, 2], [1, 2]], y=[1, 1], col_names=["a"])
        with open(path.join(DATA_SETS_DIR, "mock_data_set_corrupted.csv"),
                  "r",
                  newline='',
                  encoding="utf8") as csv_file:
            with self.assertRaises(RuntimeError):
                self.data_set.extract_from_csv(csv_file)
Example no. 16
                print('{:s}: {:.6f}'.format(str(model), score))

        # Take the average over all the measurements
        mean_meta = dict()
        for key in meta:
            mean_meta[key] = np.mean(meta[key])

        results[str(model)] = mean_meta

    return results


if __name__ == '__main__':

    # Create the dataset
    ds = DataSet(features.mean_over_time, shuffle=True)

    # Add/Remove tested models here.
    models = [
        SVM_model(),
        SVM_model('linear'),
        SVM_model('poly'),
        LR_model(),
        KNN_model(),
        RFC_model(),
        LDA_model()
    ]

    print(evaluate(models, ds, debug=True))

    # add different path for data files:
Example no. 17
import errno
import os

import tensorflow as tf

from src.dataset import DataSet  # assumed import path, matching the companion scripts
from src.utils import argparser, logging

logfile = './logs/parallax-tf/logs.txt'
try:
    os.makedirs(os.path.dirname(logfile))
except OSError as exc:
    if exc.errno == errno.EEXIST and os.path.isdir(os.path.dirname(logfile)):
        pass
    else:
        raise

FLAGS = argparser()
FLAGS.is_training = False

train_dataset = DataSet(fpath=FLAGS.train_file,
                        seqlen=FLAGS.seq_len,
                        n_classes=FLAGS.num_classes,
                        need_shuffle=False)
test_dataset = DataSet(fpath=FLAGS.test_file,
                       seqlen=FLAGS.seq_len,
                       n_classes=FLAGS.num_classes,
                       need_shuffle=False)
FLAGS.charset_size = train_dataset.charset_size
FLAGS.sync = True

resource_info = os.path.abspath(
    os.path.join(os.path.dirname(__file__), '.', FLAGS.resource_info_file))

single_graph = tf.Graph()

with single_graph.as_default():
    ops, global_step = get_placeholders(FLAGS)
Example no. 18
def main(files):
    utils.print_system_info()
    utils.init_pandas()

    _logger.info("FILES=" + str(files))

    # Load feature file for some statistics
    features = featurelist.get_meta_list() + featurelist.get_label_list()
    df = loading.load_df(files, featurelist.get_columns(features))
    test_set_start = DataSet.get_index_for_revision_id_from_df(
        df, constants.TEST_SET_START)
    tail_set_start = DataSet.get_index_for_revision_id_from_df(
        df, constants.TAIL_SET_START)
    df = df[test_set_start:tail_set_start]
    data = DataSet()
    data.set_meta(df.iloc[:, :-1])
    data.set_Y(df.iloc[:, -1].astype(np.float32))
    data.set_X(np.zeros((len(data), 1)))
    _logger.debug("Length of data: " + str(len(data)))

    # Load scores
    scores = pd.DataFrame()
    scores[REVISION_ID] = data.get_revision_ids()
    scores.set_index(REVISION_ID, inplace=True)

    for team, score_file in files['teams'].items():
        team_scores = load_vandalism_scores(score_file)
        team_scores.set_index(REVISION_ID, inplace=True)
        scores[team] = team_scores[VANDALISM_SCORE]

    scores.dropna(inplace=True)
    if len(data) != len(scores):
        raise Exception(
            "number of scores does not fit test set size: " +
            "len(data)={0} but len(scores)={1}".format(len(data), len(scores)))

    _logger.debug("Length of scores: " + str(len(data)))

    # Evaluate teams
    meta_scores = compute_meta_scores(scores)
    scores = pd.concat([scores, meta_scores], axis=1)

    evaluate_teams(scores, data, save_scores=['META'])
    evaluate_teams_over_time(scores, data, EVALUATION_OVER_TIME_SUFFIX)

    scores, data = clean_data(scores, data)
    evaluate_teams(scores, data, suffix=EVALUATION_RESULTS_CLEANED_SUFFIX)
Example no. 19
import errno
import os

import tensorflow as tf

from src.dataset import DataSet
from src.utils import argparser  # assumed import path, matching the companion scripts

if __name__ == '__main__':
  FLAGS = argparser()
  FLAGS.is_training = False
  logfile = "./logs/tensorflow/log.txt"
  try:
    os.makedirs(os.path.dirname(logfile))
  except OSError as exc:
    if exc.errno == errno.EEXIST and os.path.isdir(os.path.dirname(logfile)):
      pass
    else:
      raise

  train_dataset = DataSet(fpath=FLAGS.train_file,
                          seqlen=FLAGS.seq_len,
                          n_classes=FLAGS.num_classes,
                          need_shuffle=True)
  test_dataset = DataSet(fpath=FLAGS.test_file,
                         seqlen=FLAGS.seq_len,
                         n_classes=FLAGS.num_classes,
                         need_shuffle=True)
  FLAGS.charset_size = train_dataset.charset_size

  ops, global_step = get_placeholders(FLAGS)
    
  seq = ops['data']
  label = ops['labels']
  logits, _ = inference(seq, FLAGS)

  tf.losses.softmax_cross_entropy(label, logits)
  loss = tf.losses.get_total_loss()
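
From here, a TF1-style script would typically attach an optimizer to the total loss and drive it in a session. The sketch below is an assumption-laden continuation: the optimizer choice, the learning rate, and the next_batch call on DataSet are hypothetical and only illustrate how the ops built above would be consumed.

# Hypothetical continuation (TF1 graph style); optimizer, learning rate, and the
# DataSet batching call are assumptions, not part of the original script.
train_op = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(loss, global_step=global_step)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  batch_seq, batch_label = train_dataset.next_batch(FLAGS.batch_size)  # assumed DataSet method
  _, loss_value = sess.run([train_op, loss],
                           feed_dict={seq: batch_seq, label: batch_label})
  print('step loss:', loss_value)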
Example no. 20
            )
        except KeyError:
            print('False kernel')
        self.name = 'SVM-{:s}'.format(kernel)

    def __str__(self):
        """Returns the name of the model"""
        return self.name


if __name__ == '__main__':

    import sys

    sys.path.append('.')

    from param_test import ParamTester

    from src.dataset import DataSet
    import src.feature_extraction as fe

    ds = DataSet(method=fe.mfcc_spec, shuffle=True)

    test = ParamTester(ds,
                       SVM_model('poly'),
                       iter_method={'C': 100},
                       debug=True)
    test.run()
    test.save_results()
    test.plot()
Example no. 21
        "Adagrad",  # Best with LR=0.1 + triangular (slow/best) or plateau2 (quick)
        "SGD",  # Best with LR=1   + triangular2

        ### learning_rate vs optimizer + scheduler=constant | needs learning_rate=0.1 | random until 16 epocs, then quickly converges
        "Ftrl",  # Only works with: LR=0.1 + plateau2/constant OR LR=1 + CyclicLR_triangular
    ]),
    "scheduler":
    hp.Discrete([
        # 'constant',
        # 'linear_decay',
        'plateau2',
        'plateau2_sqrt',
        'plateau10',
        'plateau10_sqrt',
        'CyclicLR_triangular',
        'CyclicLR_triangular2',
        'CyclicLR_exp_range'
    ]),
}

if __name__ == "__main__":
    dataset = DataSet(fraction=1.0)
    model = SequentialCNN(input_shape=dataset.input_shape(),
                          output_shape=dataset.output_shape())
    log_dir = "../../../logs/convergence_search"
    stats_history = hparam_search.hparam_search(hparam_options,
                                                model,
                                                dataset,
                                                log_root=log_dir,
                                                verbose=argv.verbose)
Example no. 22
def _compute_backpressure_statistics(data):
    # Restrict computation to test dataset
    test_set_start_index = \
        DataSet.get_index_for_revision_id_from_df(data, constants.TEST_SET_START)
    tail_set_start_index = \
        DataSet.get_index_for_revision_id_from_df(data, constants.TAIL_SET_START)
    data = data[test_set_start_index:tail_set_start_index]

    data = data[[
        REVISION_ID, ITEM_ID, USER_NAME, REVISION_ACTION, ROLLBACK_REVERTED
    ]]

    REVISION_ID_INDEX = 0  # noqa
    ITEM_ID_INDEX = 1
    USER_NAME_INDEX = 2
    REVISION_ACTION_INDEX = 3
    ROLLBACK_REVERTED_INDEX = 4  # noqa

    data = data.values

    result = np.full(len(data), np.nan)
    revealed = pd.DataFrame()

    for i in range(len(data)):
        user_name = data[i][USER_NAME_INDEX]
        item_id = data[i][ITEM_ID_INDEX]

        prev_rev = data[i]

        for j in range(i + 1, min(len(data), i + 16)):
            rev = data[j]

            if rev[ITEM_ID_INDEX] == item_id:
                # Rollback within same session (same item id and same user name)
                if rev[USER_NAME_INDEX] == user_name:
                    if rev[REVISION_ACTION_INDEX] == 'rollback':
                        result[i] = True
                        revealed = revealed.append(pd.Series(prev_rev),
                                                   ignore_index=True)
                        break
                # Rollback at beginning of next session
                else:
                    if rev[REVISION_ACTION_INDEX] == 'rollback':
                        result[i] = True
                        revealed = revealed.append(pd.Series(prev_rev),
                                                   ignore_index=True)
                        break
                    else:
                        result[i] = False
                        revealed = revealed.append(pd.Series(prev_rev),
                                                   ignore_index=True)
                        break

    n_revisions = result.size
    n_revealed_total = (~(np.isnan(result))).sum()
    n_revealed_regular = (result == True).sum()  # noqa
    n_revealed_vandalism = (result == False).sum()  # noqa

    _logger.info('n_revisions: ' + str(n_revisions))
    _logger.info('n_revealed_total: ' + str(n_revealed_total))
    _logger.info('n_revealed_vandalism: ' + str(n_revealed_vandalism))
    _logger.info('n_revealed_regular: ' + str(n_revealed_regular))
Example no. 23
def omit_holdout_df(df):
    """Omit the holdout dataframe."""
    tail_set_start_index = \
        DataSet.get_index_for_revision_id_from_df(df, constants.TAIL_SET_START)
    df = df[:tail_set_start_index]
    return df
Example no. 24
                     priors=None,
                     n_components=None,
                     store_covariance=False,
                     tol=1.0e-4)
        self.name = 'LDA'

    def __str__(self):
        return self.name


if __name__ == '__main__':

    import sys

    sys.path.append('.')

    from param_test import ParamTester
    from src.dataset import DataSet
    from src.feature_extraction import mean_over_time

    ds = DataSet(method=mean_over_time)

    solver_list = ['lsqr', 'eigen']

    test = ParamTester(ds,
                       LDA_model(),
                       iter_method={'shrinkage': 10},
                       debug=True)
    test.run()
    test.save_results()
    #test.plot()
Example no. 25
    def __init__(self, opts):
        self.dir = opts.dir
        self.report_every_steps = opts.train['report_every_steps']
        self.validation_every_steps = opts.train['validation_every_steps']
        self.checkpoint_every_steps = opts.train['checkpoint_every_steps']
        self.train_steps = opts.train['train_steps']
        self.vocab = Vocab(opts.cfg['vocab'])
        self.cuda = opts.cfg['cuda']
        self.n_steps_so_far = 0
        self.average_last_n = opts.train['average_last_n']
        self.steps = opts.train['steps']
        V = len(self.vocab)
        N = opts.cfg['num_layers']
        d_model = opts.cfg['hidden_size']
        d_ff = opts.cfg['feedforward_size']
        h = opts.cfg['num_heads']
        dropout = opts.cfg['dropout']
        factor = opts.cfg['factor']
        label_smoothing = opts.cfg['label_smoothing']
        warmup_steps = opts.cfg['warmup_steps']
        lrate = opts.cfg['learning_rate']
        beta1 = opts.cfg['beta1']
        beta2 = opts.cfg['beta2']
        eps = opts.cfg['eps']
        batch_size = opts.train['batch_size']
        max_length = opts.train['max_length']
        swap_bitext = opts.train['swap_bitext']
        self.sim_run = self.steps['sim']['run']
        p_uneven = self.steps['sim']['p_uneven']
        sim_pooling = self.steps['sim']['pooling']
        R = self.steps['sim']['R']
        align_scale = self.steps['sim']['align_scale']
        self.p_mask = self.steps['mlm']['p_mask']
        self.r_same = self.steps['mlm']['r_same']
        self.r_rand = self.steps['mlm']['r_rand']
        if 1.0 - self.r_same - self.r_rand <= 0.0:
            logging.error('r_mask={} <= zero'.format(1.0 - self.r_same -
                                                     self.r_rand))
            sys.exit()

        self.model = make_model(V,
                                N=N,
                                d_model=d_model,
                                d_ff=d_ff,
                                h=h,
                                dropout=dropout)
        if self.cuda:
            self.model.cuda()

        self.optimizer = NoamOpt(
            d_model, factor, warmup_steps,
            torch.optim.Adam(self.model.parameters(),
                             lr=lrate,
                             betas=(beta1, beta2),
                             eps=eps))

        if self.steps['sim']['run']:
            if self.steps['sim']['pooling'] == 'align':
                self.criterion = AlignSIM()
            else:
                self.criterion = CosineSIM()
        else:
            #self.criterion = CrossEntropy(padding_idx=self.vocab.idx_pad)
            self.criterion = LabelSmoothing(size=V,
                                            padding_idx=self.vocab.idx_pad,
                                            smoothing=label_smoothing)

        if self.cuda:
            self.criterion.cuda()

        self.load_checkpoint()  #loads if exists

        if self.sim_run:
            self.computeloss = ComputeLossSIM(self.criterion, sim_pooling, R,
                                              align_scale, self.optimizer)
        else:
            self.computeloss = ComputeLossMLM(self.model.generator,
                                              self.criterion, self.optimizer)
        token = OpenNMTTokenizer(**opts.cfg['token'])

        logging.info('read Train data')
        self.data_train = DataSet(self.steps,
                                  opts.train['train'],
                                  token,
                                  self.vocab,
                                  sim_run=self.sim_run,
                                  batch_size=batch_size[0],
                                  max_length=max_length,
                                  p_uneven=p_uneven,
                                  swap_bitext=swap_bitext,
                                  allow_shuffle=True,
                                  is_infinite=True)

        if 'valid' in opts.train:
            logging.info('read Valid data')
            self.data_valid = DataSet(self.steps,
                                      opts.train['valid'],
                                      token,
                                      self.vocab,
                                      sim_run=self.sim_run,
                                      batch_size=batch_size[1],
                                      max_length=max_length,
                                      p_uneven=p_uneven,
                                      swap_bitext=swap_bitext,
                                      allow_shuffle=True,
                                      is_infinite=False)
        else:
            self.data_valid = None
Example no. 26
# encoding=utf-8
"""
Created on April 18, 2016

@author: lenovo
"""

from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from src.dataset import DataSet

# Naive Bayes classifier objects
for classifier in [BernoulliNB(), MultinomialNB(), GaussianNB()]:
    print("classifier: [%s]" % type(classifier).__name__)
    # dataset object
    data = DataSet()

    # get the labeled training data
    train_X = data.get_train_data()
    train_Y = data.get_tag()

    # training
    print("start training")
    classifier.fit(train_X, train_Y)
    print("training done")

    # get the vectorized test data
    test_X = data.get_test_data()
    # prediction results
    print("start predicting")
    result = classifier.predict(test_X)
    print("prediction done")
Example no. 27
import multiprocessing
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'  # 0, 1, 2, 3  # Disable TensorFlow logging
os.chdir(os.path.dirname(os.path.abspath(__file__)))

import tensorflow.keras as keras
import time

from src.dataset import DataSet
from src.examples.tensorflow import FunctionalCNN, SequentialCNN, ClassCNN, ClassNN
from src.utils.csv import predict_to_csv

timer_start = time.time()

dataset = DataSet()
config = {
    "verbose":      False,
    "epochs":       12,
    "batch_size":   128,
    "input_shape":  dataset.input_shape(),
    "output_shape": dataset.output_shape(),
}
print("config", config)

# BUG: ClassCNN accuracy is only 36% compared to 75% for SequentialCNN / FunctionalCNN
# SequentialCNN   validation: | loss: 1.3756675141198293 | accuracy: 0.7430952
# FunctionalCNN   validation: | loss: 1.4285654685610816 | accuracy: 0.7835714
# ClassCNN        validation: | loss: 1.9851970995040167 | accuracy: 0.36214286
# ClassNN         validation: | loss: 2.302224604288737  | accuracy: 0.09059524
models = {
Example no. 28
class TestDataSet(TestCase):
    def setUp(self):
        self.init_column_names = ["feat_001", "X-coord", "h,std,dev", "Grade"]
        self.init_features = [[0., 0.1, 3.], [0., 0.2, 0.], [0., 0.3, 0.5]]
        self.init_classes = ["G3", "G1", "G1"]

        self.expected_extracted_column_names = self.init_column_names
        self.expected_extracted_features = [[1.7, 3., 0.09], [-5., -1.12, 0.]]
        self.expected_extracted_classes = ["G2", "G3"]

        self.data_set_dir = path.join(path.dirname(__file__), "data_sets")

        self.data_set = DataSet(X=self.init_features,
                                y=self.init_classes,
                                col_names=self.init_column_names)

    def check_extracted(self):
        self.assertListEqual(self.expected_extracted_column_names,
                             self.data_set.col_names)
        self.assertListEqual(self.expected_extracted_features, self.data_set.X)
        self.assertListEqual(self.expected_extracted_classes, self.data_set.y)

    def test_should_initialize_properly(self):
        self.assertListEqual(self.init_column_names, self.data_set.col_names)
        self.assertListEqual(self.init_features, self.data_set.X)
        self.assertListEqual(self.init_classes, self.data_set.y)

    def test_should_extract_features_and_classes_from_csv_with_header(self):
        with open(path.join(DATA_SETS_DIR, "mock_data_set_with_header.csv"),
                  "r",
                  newline='',
                  encoding="utf8") as csv_file:
            self.data_set.extract_from_csv(csv_file)
        self.check_extracted()

    def test_should_raise_error_on_feature_size_mismatch(self):
        with self.assertRaises(RuntimeError) as cm:
            DataSet(X=[[1], [1, 2]])
        with self.assertRaises(RuntimeError):
            DataSet(y=[1])
        with self.assertRaises(RuntimeError):
            DataSet(X=[[1], [1, 2]], y=[1])
        with self.assertRaises(RuntimeError):
            DataSet(X=[[1], [1, 2]], y=[1, 1])
        with self.assertRaises(RuntimeError):
            DataSet(X=[[1], [1, 2]], y=[1, 1, 1])
        with self.assertRaises(RuntimeError):
            DataSet(col_names=["a"])
        with self.assertRaises(RuntimeError):
            DataSet(X=[[1, 2], [1, 2]], y=[1, 1], col_names=["a"])
        with open(path.join(DATA_SETS_DIR, "mock_data_set_corrupted.csv"),
                  "r",
                  newline='',
                  encoding="utf8") as csv_file:
            with self.assertRaises(RuntimeError):
                self.data_set.extract_from_csv(csv_file)

    def test_should_return_number_of_features(self):
        self.assertEqual(len(self.init_features[0]),
                         self.data_set.number_of_features)

    def test_should_return_column_name(self):
        for index, element in enumerate(self.init_column_names):
            self.assertEqual(element, self.data_set.col_names[index])

    def test_should_create_ranking(self):
        ranking = [2, 1, 0]
        self.assertListEqual(
            self.data_set.create_features_ranking(use_names=False), ranking)