Example #1
# Assumed imports and constant for this snippet (not shown in the original):
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split as skl_train_test_split
# assumed location of daal4py's drop-in split:
from daal4py.sklearn.model_selection import train_test_split as d4p_train_test_split

RANDOM_STATE = 42  # placeholder; the original module-level constant is not shown


def test_results_similarity(n_samples):
    x, y = make_classification(n_samples=n_samples,
                               n_features=4,
                               random_state=RANDOM_STATE)
    d4p_res = d4p_train_test_split(x,
                                   y,
                                   test_size=n_samples // 2 - 1,
                                   train_size=n_samples // 2 - 1,
                                   random_state=RANDOM_STATE)
    skl_res = skl_train_test_split(x,
                                   y,
                                   test_size=n_samples // 2 - 1,
                                   train_size=n_samples // 2 - 1,
                                   random_state=RANDOM_STATE)

    assert len(d4p_res) == len(skl_res), \
        'train_test_splits have different output size'

    for i, _ in enumerate(d4p_res):
        assert np.all(d4p_res[i] == skl_res[i]), \
            'train_test_splits have different output'
Example #2
    # Assumed names for this excerpt: pyplot is matplotlib.pyplot,
    # skl_train_test_split is sklearn's train_test_split, and
    # tfk_regularizers is tensorflow.keras.regularizers.
    pyplot.plot(days[:past_len + 1], fiction_past)
    pyplot.plot(days[past_len + 1:], predictions)
    pyplot.legend([f'{fiction_name} data', 'Predictions'])
    pyplot.show()


if __name__ == '__main__':
    process_args()
    cases_data = load_the_data('../cases-data-2020-04-28.csv',
                               do_correct_known_typos=True)
    sequences = extract_unmodified_sequences_from_first_case(cases_data)
    variations = compute_sequence_variations(sequences,
                                             max_ratio_value=max_ratio_value)
    train_sequences, test_sequences = skl_train_test_split(
        variations,
        test_size=test_data_size,
        shuffle=True,
        random_state=random_seed)
    train_data_past, train_data_future = prepare_data_for_the_model(
        train_sequences, past_len, predictions_len, do_remove_zero_data=False)
    test_data_past, test_data_future = prepare_data_for_the_model(
        test_sequences, past_len, predictions_len, do_remove_zero_data=False)
    regularizer = None
    if regularizer_type == 'L1':
        regularizer = tfk_regularizers.l1(l=regularizer_amplitude)
    elif regularizer_type == 'L2':
        regularizer = tfk_regularizers.l2(l=regularizer_amplitude)
    lstm_model = create_and_prepare_model(
        input_len=past_len,
        nb_hidden_features=nb_hidden_features,
        predictions_len=predictions_len,
Example #3
def test_model_regressor(pd_assert_equal):
    # pd_assert_equal is a pytest fixture; get_preprocessor, get_dataset,
    # get_model and load_object_by_str come from the project under test.
    # Assumed imports: math, pandas as pd, sklearn.metrics as skl_metrics,
    # sklearn.linear_model.LinearRegression, and sklearn's train_test_split
    # aliased as skl_train_test_split.
    init_kwargs_settings = [
        {},
        {
            # note: LinearRegression's normalize kwarg was removed in
            # scikit-learn 1.2; this setting targets older versions.
            'normalize': True
        },
    ]
    fit_kwargs_settings = [{}]
    evaluate_kwargs_settings = [{
        'train_test_split_func': '[email protected]_selection',
        'train_test_split_func_kwargs': {
            'random_state': 0
        },
        'metrics': 'all',
    }]
    dataset_config = {
        'loader_config': {
            'name': '*****@*****.**',
            'kwargs': {
                'random_state': 0,
            },
        },
        'target_column': 'target',
    }
    preprocessor = get_preprocessor({
        'name': 'identify',
        'kwargs': {},
    })
    ffunc_for_predictor = ffunc_for_target = load_object_by_str(
        '*****@*****.**')
    ds1 = get_dataset(dataset_config)
    X1, y1 = ds1.get_predictor_target()

    for init_kwargs in init_kwargs_settings:
        for fit_kwargs in fit_kwargs_settings:
            for evaluate_kwargs in evaluate_kwargs_settings:
                mconfig = {
                    'name': 'SklearnLinearRegression',
                    'init_kwargs': init_kwargs,
                    'fit_kwargs': fit_kwargs,
                    'evaluate_kwargs': evaluate_kwargs,
                    'is_rebuild': False,
                }
                m = get_model(mconfig)
                morigin = LinearRegression(**init_kwargs)
                m.fit(X1, y1)
                morigin.fit(X1, y1, **fit_kwargs)

                # assertion
                pd_assert_equal(
                    pd.Series(m.predict(X1)).astype('float64'),
                    pd.Series(morigin.predict(X1)).astype('float64'))
                X_train, X_test, y_train, y_test = skl_train_test_split(
                    X1, y1, **evaluate_kwargs['train_test_split_func_kwargs'])
                rev1 = m.evaluate(X1, y1, preprocessor, ffunc_for_predictor,
                                  ffunc_for_target)
                if not rev1['cv']:
                    met = rev1['metrics'][0]
                    # Index the reported metrics by name (each name is assumed
                    # to appear exactly once, as in the original lookups).
                    metric_values = {o['name']: o['value'] for o in met}
                    mean_absolute_error = metric_values['mean_absolute_error']
                    mean_squared_error = metric_values['mean_squared_error']
                    median_absolute_error = metric_values['median_absolute_error']
                    r2_score = metric_values['r2_score']
                    explained_variance = metric_values['explained_variance']
                    morigin.fit(ffunc_for_predictor(X_train),
                                ffunc_for_target(y_train))
                    assert math.fabs(
                        mean_absolute_error - skl_metrics.mean_absolute_error(
                            y_test, morigin.predict(X_test))) < 0.00001
                    assert math.fabs(
                        mean_squared_error - skl_metrics.mean_squared_error(
                            y_test, morigin.predict(X_test))) < 0.00001
                    assert math.fabs(
                        median_absolute_error -
                        skl_metrics.median_absolute_error(
                            y_test, morigin.predict(X_test))) < 0.00001
                    assert math.fabs(r2_score - skl_metrics.r2_score(
                        y_test, morigin.predict(X_test))) < 0.00001
                    assert math.fabs(
                        explained_variance -
                        skl_metrics.explained_variance_score(
                            y_test, morigin.predict(X_test))) < 0.00001
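
The tolerance pattern used in these asserts, shown in isolation (the numbers are illustrative, not from the original test):

import math
from sklearn import metrics as skl_metrics

y_true = [1.0, 2.0, 3.0]
y_pred = [1.1, 1.9, 3.2]
reported_mae = 0.4 / 3  # value a model wrapper might report
assert math.fabs(
    reported_mae - skl_metrics.mean_absolute_error(y_true, y_pred)) < 0.00001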
Example #4
    def _train_test_split(self, data, labels):
        # Thin wrapper around sklearn's train_test_split: a shuffled
        # 80/20 split with a fixed seed for reproducibility.
        return skl_train_test_split(data,
                                    labels,
                                    shuffle=True,
                                    random_state=5,
                                    test_size=0.2)
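
For context, a minimal self-contained sketch of what this wrapper returns (the data here is illustrative):

import numpy as np
from sklearn.model_selection import train_test_split as skl_train_test_split

data = np.arange(20).reshape(10, 2)  # 10 samples, 2 features
labels = np.arange(10)
X_train, X_test, y_train, y_test = skl_train_test_split(
    data, labels, shuffle=True, random_state=5, test_size=0.2)
# With test_size=0.2, 2 of the 10 samples land in the test set.
assert len(X_test) == 2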
Example #5
  if cut is not None:
    data = data[:cut]
  # np.long was removed in NumPy 1.24; plain int gives the same result here.
  data['label'] = data['label'].apply(int)
  if do_convert_label_9_to_0:
    data['label'] = data['label'].apply(lambda x: 0 if x == 9 else x)
  else:
    data['label'] = data['label'].apply(lambda x: 2 if x == 9 else x)
  label_list = np.sort(pd.unique(data['label']))
  assert list(label_list) == ([0, 1] if do_convert_label_9_to_0 else [0, 1, 2])
  data_features = data['text'].apply(prepare_text)
  data_labels = data['label']
  del data
  (train_features_series,
   test_features_series,
   train_labels_series,
   test_labels_series) = skl_train_test_split(data_features,
                                              data_labels,
                                              test_size=test_size)
  train_features = tokenize_text_series(bert_model_name, train_features_series)
  test_features = tokenize_text_series(bert_model_name, test_features_series)
  train_labels = train_labels_series  # .apply(lambda r: torch.tensor(r).long())
  test_labels = test_labels_series  # .apply(lambda r: torch.tensor(r).long())
  del train_features_series
  del test_features_series
  del train_labels_series
  del test_labels_series
  # nb_classes is defined earlier in the original function (not shown here).
  return train_features, test_features, train_labels, test_labels, nb_classes
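
A minimal sketch of the four-way unpacking used above (the data is illustrative):

import pandas as pd
from sklearn.model_selection import train_test_split as skl_train_test_split

features = pd.Series(['a', 'b', 'c', 'd', 'e'])
labels = pd.Series([0, 1, 0, 1, 0])
(train_f, test_f, train_l, test_l) = skl_train_test_split(features,
                                                          labels,
                                                          test_size=0.2)
# train_test_split preserves pandas types, so each piece is still a Series.
assert isinstance(train_f, pd.Series)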
Example #6
# FASTTEXT


def save_fasttext_data(file_path, tweet_data):
  # Write one "__label__<label> <text>" line per tweet, the input format
  # expected by fastText's supervised trainer; the with-block ensures the
  # file is flushed and closed.
  with open(file_path, 'w') as fasttext_file:
    for row_index in range(len(tweet_data)):
      row = tweet_data.iloc[row_index]
      label = row['label']
      line = '__label__'
      line += 'no' if label < 0.5 else ('yes' if label < 1.5 else 'not_sure')
      line += ' '
      fasttext_file.write(line + row['prepared_text'] + '\n')

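# Each line produced by save_fasttext_data uses fastText's supervised input
# format, e.g. (illustrative): __label__yes some prepared tweet text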

tweet_train_data, tweet_test_data = skl_train_test_split(data_tweets,
                                                         test_size=0.2,
                                                         shuffle=True)

fasttext_train_file_path = fasttext_file_name + '-train.txt'
save_fasttext_data(fasttext_train_file_path, tweet_train_data)
fasttext_test_file_path = fasttext_file_name + '-test.txt'
save_fasttext_data(fasttext_test_file_path, tweet_test_data)



if do_try_fasttext_train:
  fasttext_model = fasttext.train_supervised(input=fasttext_train_file_path,
                                             lr=fasttext_learning_rate,
                                             epoch=fasttext_nb_epochs,
                                             wordNgrams=fasttext_n_gram_max)
  fasttext_model.save_model(fasttext_model_file_path)
  # very disappointing: