Esempio n. 1
0
def _replace_missing_string(table,
                            input_cols,
                            fill_method=None,
                            fill_string='',
                            limit=None,
                            downcast=None):
    # Validation : limit >= 1
    if limit is not None:
        validate(greater_than_or_equal_to(limit, 1, 'limit'))

    _table = table.copy()

    if input_cols is None or len(input_cols) == 0:
        _raw_input_cols = _table.columns
    else:
        _raw_input_cols = input_cols

    if fill_method == 'ffill' or fill_method == 'bfill':
        _out_table = _table.fillna(method=fill_method,
                                   limit=limit,
                                   downcast=downcast)
    else:
        _input_cols = [x for x in _raw_input_cols if table[x].dtype == object]
        _values = {x: fill_string for x in _input_cols}
        _out_table = _table.fillna(value=_values,
                                   limit=limit,
                                   downcast=downcast)

    return {'out_table': _out_table}
Esempio n. 2
0
def word2vec_similarity(model, **params):
    """Validate the keyword arguments and delegate to `_word2vec_similarity`.

    Runs the required-parameter check, fills in defaults taken from
    `_word2vec_similarity`, and requires `topn` to pass the
    ``greater_than_or_equal_to(..., 1, ...)`` check before the call.
    """
    check_required_parameters(_word2vec_similarity, params, ['model'])

    resolved = get_default_from_parameters_if_required(params, _word2vec_similarity)
    validate(greater_than_or_equal_to(resolved, 1, 'topn'))
    return _word2vec_similarity(model, **resolved)
Esempio n. 3
0
def _parse_kde_points(params):
    # Turn params['points'] into a sequence of float64 evaluation points.
    # Accepted spellings, tried in this order:
    #   1. a single value convertible by np.float64
    #   2. a comma-separated string, e.g. "1,2,3"
    #   3. a range string "<from> to <to> by <step>" (expanded with np.arange)
    # `except Exception` keeps the fall-through behaviour without swallowing
    # KeyboardInterrupt / SystemExit the way a bare `except:` does.
    try:
        return [np.float64(params['points'])]
    except Exception:
        pass

    try:
        return [np.float64(point) for point in params['points'].split(',')]
    except Exception:
        pass

    try:
        bounds = params['points'].split(' to ')
        _from = np.float64(bounds[0])
        to_and_step = bounds[1].split(' by ')
        _to = np.float64(to_and_step[0])
        _step = np.float64(to_and_step[1])
        return np.arange(_from, _to, _step)
    except Exception:
        raise BrighticsFunctionException.from_errors([{
            '0100':
            'Points is not of Array[Double] type.'
        }])


def kernel_density_estimation(table, group_by=None, **params):
    """Validate the arguments and run `_kernel_density_estimation`.

    After the required-parameter check and default filling, `bandwidth` must
    pass the ``greater_than(..., 0, ...)`` check. ``params['points']`` is
    then normalised by `_parse_kde_points`.

    Raises
    ------
    BrighticsFunctionException
        With code ``'0100'`` when `points` matches none of the accepted forms.

    Returns
    -------
    The result of `_kernel_density_estimation`, obtained through
    `_function_by_group` when `group_by` is given.
    """
    check_required_parameters(_kernel_density_estimation, params, ['table'])
    params = get_default_from_parameters_if_required(
        params, _kernel_density_estimation)
    param_validation_check = [greater_than(params, 0, 'bandwidth')]
    validate(*param_validation_check)

    params['points'] = _parse_kde_points(params)

    if group_by is not None:
        return _function_by_group(_kernel_density_estimation,
                                  table,
                                  group_by=group_by,
                                  **params)
    return _kernel_density_estimation(table, **params)
Esempio n. 4
0
def fasttext_similarity(table, model, **params):
    """Validate the keyword arguments and delegate to `_fasttext_similarity`.

    Runs the required-parameter check, fills in defaults taken from
    `_fasttext_similarity`, and requires `topn` to pass the
    ``greater_than_or_equal_to(..., 1, ...)`` check before the call.
    """
    check_required_parameters(_fasttext_similarity, params, ['table', 'model'])

    resolved = get_default_from_parameters_if_required(params, _fasttext_similarity)
    validate(greater_than_or_equal_to(resolved, 1, 'topn'))
    return _fasttext_similarity(table, model, **resolved)
Esempio n. 5
0
def _svm_classification_train(table, feature_cols, label_col, c=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True,
              probability=True, tol=1e-3, max_iter=-1, random_state=None, gamma_val=None):
    """Fit an `svm.SVC` classifier on `table` and wrap it in a model dict.

    Parameters
    ----------
    table : pandas.DataFrame
        Training data; it is copied before use.
    feature_cols : list
        Columns used as features.
    label_col : str
        Column holding the class labels; it must not be a continuous target.
    c, kernel, degree, gamma, coef0, shrinking, probability, tol, max_iter,
    random_state :
        Forwarded to `svm.SVC` (``c`` is passed as ``C``). `c` must be
        greater than 0.
    gamma_val : float or None
        Numeric gamma used when ``gamma == 'other'``. This is the convention
        of the public `svm_classification_train` wrapper, which always
        forwards this keyword. Ignored for any other `gamma`.

    Returns
    -------
    dict
        ``{'model': model_dict}`` where the model dict holds the fitted
        estimator (``'svc_model'``), the feature columns (``'features'``)
        and a markdown report (``'_repr_brtc_'``).
    """
    validate(greater_than(c, 0.0, 'c'))

    # 'other' is a placeholder meaning "use the explicit numeric gamma_val";
    # any other setting is handed to SVC unchanged.
    if gamma == 'other':
        if gamma_val is None:
            raise_runtime_error('''Gamma value is mandatory when gamma is other.''')
        _gamma = gamma_val
    else:
        _gamma = gamma

    _table = table.copy()

    _feature_cols = _table[feature_cols]
    _label_col = _table[label_col]

    if(sklearn_utils.multiclass.type_of_target(_label_col) == 'continuous'):
        raise_runtime_error('''Label Column should not be continuous.''')

    _svc = svm.SVC(C=c, kernel=kernel, degree=degree, gamma=_gamma, coef0=coef0, shrinking=shrinking,
              probability=probability, tol=tol, max_iter=max_iter, random_state=random_state)
    _svc_model = _svc.fit(_feature_cols, _label_col)

    # Report the estimator parameters together with the column selection.
    get_param = _svc.get_params()
    get_param['feature_cols'] = feature_cols
    get_param['label_col'] = label_col

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## SVM Classification Result
    | ### Parameters
    | {table_parameter} 
    """.format(table_parameter=dict2MD(get_param))))

    _model = _model_dict('svc_model')
    _model['svc_model'] = _svc_model
    _model['features'] = feature_cols
    _model['_repr_brtc_'] = rb.get()

    return {'model':_model}
Esempio n. 6
0
def _replace_missing_number(table, input_cols, fill_method=None, fill_value='value', fill_value_to=0.0, limit=None, downcast=None):
    # Validation : limit >= 1
    if limit is not None:
        validate(greater_than_or_equal_to(limit, 1, 'limit'))

    _table = table.copy()
    
    if input_cols is None or len(input_cols) == 0:
        _raw_input_cols = _table.columns
    else:
        _raw_input_cols = input_cols
    
    if fill_method == 'ffill' or fill_method == 'bfill':
        _out_table = _table
        _out_table[input_cols] = _table[input_cols].fillna(method=fill_method, limit=limit, downcast=downcast)
    else:
        _input_cols = [x for x in _raw_input_cols if np.issubdtype(table[x].dtype, np.number)]
        if fill_value == 'mean':
            _values = {x:_table[x].dtype.type(np.mean(_table[x].dropna())) for x in _input_cols}
        elif fill_value == 'median':
            _values = {x:_table[x].dtype.type(np.median(_table[x].dropna())) for x in _input_cols}
        elif fill_value == 'min':
            _values = {x:np.min(_table[x].dropna()) for x in _input_cols}
        elif fill_value == 'max':
            _values = {x:np.max(_table[x].dropna()) for x in _input_cols}
        else:
            _values = {x:fill_value_to for x in _input_cols}

        _out_table = _table.fillna(value=_values, limit=limit, downcast=downcast)
    return {'out_table':_out_table}
Esempio n. 7
0
def svm_classification_train(table, group_by=None, **params):
    """Validate the arguments and run `_svm_classification_train`.

    When ``params['gamma'] == 'other'`` an explicit positive `gamma_val` is
    required; for every other `gamma` setting `gamma_val` is reset to
    ``None``. `c`, `degree`, `tol` and `max_iter` are then validated with the
    project validation helpers.

    Raises
    ------
    BFE
        With code ``'0100'`` when `gamma` is ``'other'`` and `gamma_val` is
        missing, ``None``, or not greater than 0.

    Returns
    -------
    The result of `_svm_classification_train`, obtained through
    `_function_by_group` when `group_by` is given.
    """
    check_required_parameters(_svm_classification_train, params,
                              ['table', 'gamma_val'])
    params = get_default_from_parameters_if_required(
        params, _svm_classification_train)

    if params['gamma'] == 'other':
        # Treat an explicit None like a missing key so the caller gets the
        # "mandatory" error rather than a TypeError from `None <= 0`.
        gamma_val = params.get('gamma_val')
        if gamma_val is None:
            raise BFE.from_errors([{
                '0100':
                'Gamma value is mandatory when gamma is other'
            }])
        if gamma_val <= 0:
            raise BFE.from_errors([{
                '0100': 'Gamma value must be greater than 0'
            }])
    else:
        params['gamma_val'] = None

    param_validation_check = [
        over_to(params, 0.0, 1.0, 'c'),
        greater_than_or_equal_to(params, 0, 'degree'),
        greater_than(params, 0.0, 'tol'),
        greater_than_or_equal_to_or_equal_to(params, 1, -1, 'max_iter')
    ]
    validate(*param_validation_check)

    if group_by is not None:
        return _function_by_group(_svm_classification_train,
                                  table,
                                  group_by=group_by,
                                  **params)
    return _svm_classification_train(table, **params)
Esempio n. 8
0
def mlp_classification_train(table, group_by=None, **params):
    """Validate the arguments and run `_mlp_classification_train`.

    `learning_rate_init` and `tol` must each pass a
    ``greater_than(..., 0.0, ...)`` check. When `batch_size_auto` is falsy,
    `batch_size` is checked first: a falsy or non-int value triggers the
    ``require_param('batch_size')`` validation, and the value must also pass
    ``greater_than(..., 0, ...)``. With `group_by` the call is routed through
    `_function_by_group`.
    """
    check_required_parameters(_mlp_classification_train, params, ['table'])
    params = get_default_from_parameters_if_required(
        params, _mlp_classification_train)

    checks = []
    if not params['batch_size_auto']:
        # A manual batch size has to be a non-zero int before its range is
        # checked together with the remaining parameters.
        if not params['batch_size'] or not isinstance(params['batch_size'],
                                                      int):
            validate(require_param('batch_size'))
        checks.append(greater_than(params, 0, 'batch_size'))
    checks.append(greater_than(params, 0.0, 'learning_rate_init'))
    checks.append(greater_than(params, 0.0, 'tol'))
    validate(*checks)

    if group_by is None:
        return _mlp_classification_train(table, **params)
    return _function_by_group(_mlp_classification_train,
                              table,
                              group_by=group_by,
                              **params)
Esempio n. 9
0
def topic_name_extraction(table, **params):
    """Validate the keyword arguments and delegate to `_topic_name_extraction`.

    Runs the required-parameter check, fills in defaults taken from
    `_topic_name_extraction`, and requires `topn` to pass the
    ``greater_than_or_equal_to(..., 1, ...)`` check before the call.
    """
    check_required_parameters(_topic_name_extraction, params, ['table'])
    resolved = get_default_from_parameters_if_required(
        params, _topic_name_extraction)
    validate(greater_than_or_equal_to(resolved, 1, 'topn'))
    return _topic_name_extraction(table, **resolved)
def decision_tree_classification_train(table, group_by=None, **params):
    """Validate the arguments and run `_decision_tree_classification_train`.

    After the required-parameter check and default filling, each tree
    hyper-parameter is checked against its lower bound with
    ``greater_than_or_equal_to``. With `group_by` the call is routed through
    `_function_by_group`; otherwise the private function is called directly.
    """
    check_required_parameters(_decision_tree_classification_train, params,
                              ['table'])

    params = get_default_from_parameters_if_required(
        params, _decision_tree_classification_train)

    # Lower bounds, in the order the checks are reported.
    validate(
        greater_than_or_equal_to(params, 2, 'min_samples_split'),
        greater_than_or_equal_to(params, 1, 'min_samples_leaf'),
        greater_than_or_equal_to(params, 0.0, 'min_weight_fraction_leaf'),
        greater_than_or_equal_to(params, 0.0, 'min_impurity_decrease'),
        greater_than_or_equal_to(params, 1, 'max_depth'),
        greater_than_or_equal_to(params, 1, 'max_features'),
        greater_than_or_equal_to(params, 1, 'max_leaf_nodes'),
    )

    if group_by is None:
        return _decision_tree_classification_train(table, **params)
    return _function_by_group(_decision_tree_classification_train,
                              table,
                              group_by=group_by,
                              **params)
Esempio n. 11
0
def ngram(table, **params):
    """Validate the keyword arguments and delegate to `_ngram`.

    Runs the required-parameter check, fills in defaults taken from `_ngram`,
    and requires `n` to pass the ``greater_than_or_equal_to(..., 1, ...)``
    check before the call.
    """
    check_required_parameters(_ngram, params, ['table'])

    resolved = get_default_from_parameters_if_required(params, _ngram)
    validate(greater_than_or_equal_to(resolved, 1, 'n'))

    return _ngram(table, **resolved)
Esempio n. 12
0
def moving_average(table, group_by=None, **params):
    """Validate the arguments and run `_moving_average`, optionally per group.

    After the required-parameter check and default filling, `window_size`
    must pass the ``greater_than_or_equal_to(..., 1, ...)`` check. With
    `group_by` the call is routed through `_function_by_group`.
    """
    check_required_parameters(_moving_average, params, ['table'])
    params = get_default_from_parameters_if_required(params, _moving_average)
    validate(greater_than_or_equal_to(params, 1, 'window_size'))

    if group_by is None:
        return _moving_average(table, **params)
    return _function_by_group(_moving_average, table, group_by=group_by, **params)
Esempio n. 13
0
def replace_missing_string(table, group_by=None, **params):
    """Validate the arguments and run `_replace_missing_string`.

    After the required-parameter check and default filling, `limit` is
    checked with ``greater_than_or_equal_to(..., 1, ...)``. With `group_by`
    the call is routed through `_function_by_group`.
    """
    check_required_parameters(_replace_missing_string, params, ['table'])
    params = get_default_from_parameters_if_required(params, _replace_missing_string)
    validate(greater_than_or_equal_to(params, 1, 'limit'))

    if group_by is None:
        return _replace_missing_string(table, **params)
    return _function_by_group(_replace_missing_string, table, group_by=group_by, **params)
Esempio n. 14
0
def ngram(table, **params):  # to be deprecated
    """Validate the keyword arguments and delegate to `_ngram`.

    `n` is checked with ``from_to`` against the bounds 1 and the largest
    ``len()`` found among the entries of the `input_col` column.
    """
    check_required_parameters(_ngram, params, ['table'])
    params = get_default_from_parameters_if_required(params, _ngram)

    # Upper bound for `n`: length of the longest entry in the input column.
    entry_lengths = np.vectorize(len)(table[params["input_col"]])
    max_len = np.max(entry_lengths).item()

    validate(from_to(params, 1, max_len, 'n'))
    return _ngram(table, **params)
Esempio n. 15
0
def add_shift(table, group_by=None, **params):
    """Validate the arguments and run `_add_shift`, optionally per group.

    After the required-parameter check and default filling, `shift_list` is
    checked with ``all_elements_greater_than_or_equal_to(..., 0, ...)``. With
    `group_by` the call is routed through `_function_by_group`.
    """
    check_required_parameters(_add_shift, params, ['table'])
    params = get_default_from_parameters_if_required(params, _add_shift)
    validate(all_elements_greater_than_or_equal_to(params, 0, 'shift_list'))

    if group_by is None:
        return _add_shift(table, **params)
    return _function_by_group(_add_shift, table, group_by=group_by, **params)
Esempio n. 16
0
def paired_ttest(table, group_by=None, **params):
    """Validate the arguments and run `_paired_ttest`, optionally per group.

    After the required-parameter check and default filling,
    `confidence_level` is checked with ``from_to(..., 0, 1, ...)``. With
    `group_by` the call is routed through `_function_by_group`.
    """
    check_required_parameters(_paired_ttest, params, ['table'])
    params = get_default_from_parameters_if_required(params, _paired_ttest)
    validate(from_to(params, 0, 1, 'confidence_level'))

    if group_by is None:
        return _paired_ttest(table, **params)
    return _function_by_group(_paired_ttest, table, group_by=group_by, **params)
Esempio n. 17
0
def naive_bayes_train(table, group_by=None, **params):
    """Validate the arguments and run `_naive_bayes_train`, optionally per group.

    Defaults are filled in first and `alpha` is checked with
    ``greater_than(..., 0, ...)``; the required-parameter check runs only
    after that validation. With `group_by` the call is routed through
    `_function_by_group`.
    """
    params = get_default_from_parameters_if_required(params, _naive_bayes_train)
    validate(greater_than(params, 0, 'alpha'))

    # The required-parameter check deliberately stays after the value check.
    check_required_parameters(_naive_bayes_train, params, ['table'])

    if group_by is None:
        return _naive_bayes_train(table, **params)
    return _function_by_group(_naive_bayes_train, table, group_by=group_by, **params)
Esempio n. 18
0
def levenes_test(table, group_by=None, **params):
    """Validate the arguments and run `_levenes_test`, optionally per group.

    `proportiontocut` is checked with
    ``greater_than_or_equal_to(..., 0.0, ...)`` only when `center` equals
    ``'trimmed'``. With `group_by` the call is routed through
    `_function_by_group`.
    """
    check_required_parameters(_levenes_test, params, ['table'])
    params = get_default_from_parameters_if_required(params, _levenes_test)

    uses_trimmed_center = params['center'] == 'trimmed'
    if uses_trimmed_center:
        validate(greater_than_or_equal_to(params, 0.0, 'proportiontocut'))

    if group_by is None:
        return _levenes_test(table, **params)
    return _function_by_group(_levenes_test, table, group_by=group_by, **params)
Esempio n. 19
0
def timeseries_decomposition(table, group_by=None, **params):
    """Validate the arguments and run `_timeseries_decomposition`.

    After the required-parameter check and default filling, `frequency` is
    checked against a lower bound of 1 and `extrapolate_trend` against a
    lower bound of 0, both with ``greater_than_or_equal_to``. With `group_by`
    the call is routed through `_function_by_group`.
    """
    check_required_parameters(_timeseries_decomposition, params, ['table'])
    params = get_default_from_parameters_if_required(params, _timeseries_decomposition)
    validate(
        greater_than_or_equal_to(params, 1, 'frequency'),
        greater_than_or_equal_to(params, 0, 'extrapolate_trend'),
    )

    if group_by is None:
        return _timeseries_decomposition(table, **params)
    return _function_by_group(_timeseries_decomposition, table, group_by=group_by, **params)
Esempio n. 20
0
def two_sample_ttest_for_stacked_data(table, group_by=None, **params):
    """Validate the arguments and run `_two_sample_ttest_for_stacked_data`.

    Defaults are filled in first and `confi_level` is checked with
    ``from_to(..., 0, 1, ...)``; the required-parameter check runs only after
    that validation. With `group_by` the call is routed through
    `_function_by_group`.
    """
    params = get_default_from_parameters_if_required(params, _two_sample_ttest_for_stacked_data)
    validate(from_to(params, 0, 1, 'confi_level'))

    # The required-parameter check deliberately stays after the value check.
    check_required_parameters(_two_sample_ttest_for_stacked_data, params, ['table'])

    if group_by is None:
        return _two_sample_ttest_for_stacked_data(table, **params)
    return _function_by_group(_two_sample_ttest_for_stacked_data, table, group_by=group_by, **params)
Esempio n. 21
0
def knn_regression(train_table, test_table, **params):
    """Validate the keyword arguments and delegate to `_knn_regression`.

    After the required-parameter check and default filling, `k`, `leaf_size`
    and `p` must each pass a ``greater_than_or_equal_to(..., 1, ...)`` check.
    """
    check_required_parameters(_knn_regression, params, ['train_table', 'test_table'])

    resolved = get_default_from_parameters_if_required(params, _knn_regression)
    validate(
        greater_than_or_equal_to(resolved, 1, 'k'),
        greater_than_or_equal_to(resolved, 1, 'leaf_size'),
        greater_than_or_equal_to(resolved, 1, 'p'),
    )

    return _knn_regression(train_table, test_table, **resolved)
Esempio n. 22
0
def ewma(table, group_by=None, **params):
    """Validate the arguments and run `_ewma`, optionally per group.

    After the required-parameter check and default filling, `period_number`
    is checked with ``greater_than_or_equal_to(..., 1, ...)`` and
    `custom_ratio` with ``from_to(..., 0, 1, ...)``. With `group_by` the call
    is routed through `_function_by_group`.
    """
    check_required_parameters(_ewma, params, ['table'])
    params = get_default_from_parameters_if_required(params, _ewma)
    validate(
        greater_than_or_equal_to(params, 1, 'period_number'),
        from_to(params, 0, 1, 'custom_ratio'),
    )

    if group_by is None:
        return _ewma(table, **params)
    return _function_by_group(_ewma, table, group_by=group_by, **params)
Esempio n. 23
0
def outlier_detection_tukey_carling(table, group_by=None, **params):
    """Validate the arguments and run `_outlier_detection_tukey_carling`.

    After the required-parameter check and default filling, `multiplier` is
    checked with ``greater_than(..., 0.0, ...)``. With `group_by` the call is
    routed through `_function_by_group`.
    """
    check_required_parameters(_outlier_detection_tukey_carling, params, ['table'])
    params = get_default_from_parameters_if_required(params, _outlier_detection_tukey_carling)
    validate(greater_than(params, 0.0, 'multiplier'))

    if group_by is None:
        return _outlier_detection_tukey_carling(table, **params)
    return _function_by_group(_outlier_detection_tukey_carling, table, group_by=group_by, **params)
Esempio n. 24
0
def outlier_detection_lof(table, group_by=None, **params):
    """Validate the arguments and run `_outlier_detection_lof`.

    After the required-parameter check and default filling, `n_neighbors` is
    checked with ``greater_than_or_equal_to(..., 1, ...)``. With `group_by`
    the call is routed through `_function_by_group`.
    """
    check_required_parameters(_outlier_detection_lof, params, ['table'])
    params = get_default_from_parameters_if_required(params, _outlier_detection_lof)
    validate(greater_than_or_equal_to(params, 1, 'n_neighbors'))

    if group_by is None:
        return _outlier_detection_lof(table, **params)
    return _function_by_group(_outlier_detection_lof, table, group_by=group_by, **params)
Esempio n. 25
0
def tsne(table, group_by=None, **params):
    """Validate the arguments and run `_tsne`, optionally per group.

    `n_components` is checked with ``from_to`` against the bounds 1 and the
    number of entries in `input_cols`. With `group_by` the call is routed
    through `_function_by_group`.
    """
    check_required_parameters(_tsne, params, ['table'])
    params = get_default_from_parameters_if_required(params, _tsne)

    n_input_cols = len(params['input_cols'])
    validate(from_to(params, 1, n_input_cols, 'n_components'))

    if group_by is None:
        return _tsne(table, **params)
    return _function_by_group(_tsne, table, group_by=group_by, **params)
Esempio n. 26
0
def statistic_derivation(table, group_by=None, **params):
    """Validate the arguments and run `_statistic_derivation`.

    After the required-parameter check and default filling,
    `percentile_amounts` is checked with
    ``all_elements_from_to(..., 0, 100, ...)`` and `trimmed_mean_amounts`
    with ``all_elements_from_under(..., 0, 0.5, ...)``. With `group_by` the
    call is routed through `_function_by_group`.
    """
    check_required_parameters(_statistic_derivation, params, ['table'])
    params = get_default_from_parameters_if_required(params, _statistic_derivation)
    validate(
        all_elements_from_to(params, 0, 100, 'percentile_amounts'),
        all_elements_from_under(params, 0, 0.5, 'trimmed_mean_amounts'),
    )

    if group_by is None:
        return _statistic_derivation(table, **params)
    return _function_by_group(_statistic_derivation, table, group_by=group_by, **params)
Esempio n. 27
0
def doc_summarizer_eng(table, **params):
    """Validate the keyword arguments and delegate to `_doc_summarizer_eng`.

    After the required-parameter check and default filling, `ratio` is
    checked with ``greater_than(..., 0, ...)`` and `num_sentence` with
    ``greater_than_or_equal_to(..., 1, ...)``.
    """
    check_required_parameters(_doc_summarizer_eng, params, ['table'])
    resolved = get_default_from_parameters_if_required(
        params, _doc_summarizer_eng)
    validate(
        greater_than(resolved, 0, 'ratio'),
        greater_than_or_equal_to(resolved, 1, 'num_sentence'),
    )
    return _doc_summarizer_eng(table, **resolved)
Esempio n. 28
0
def _search(table,
            user_dict=pd.DataFrame(),
            input_cols=[],
            search_words=[],
            synonym_dict=[],
            main_operator='and'):

    if len(search_words) == 0:
        raise BrighticsFunctionException('0033', 'Search Words')

    for search_word in search_words:
        if search_word is None:
            raise BrighticsFunctionException('0033', 'Search Words')

    _table = table.copy()

    filter_list = []
    if len(input_cols) == 0:
        validate(require_param('input_cols'))
    for _list in product(input_cols, search_words):
        c, od = _list
        filter_list.append([c, od.strip('\'')])
    _out_table = _table

    filtered_set = set(_out_table.index)

    cond = np.full(len(_table), True).tolist()
    for _filter in filter_list:
        cond = (cond) & (_table[_filter[0]].str.contains(_filter[1]))
    _out_table = _table.loc[list(
        filtered_set.intersection(set(_table[cond].index)))]

    if len(user_dict.index) != 0:
        filter_list = []
        search_words = [
            user_dict['value'][i] for i, key in enumerate(user_dict['key'])
            if key in search_words
        ]
        print(search_words)
        for _list in product(input_cols, search_words):
            c, od = _list
            filter_list.append([c, od.strip('\'')])

        filtered_set = set()

        syno_cond = np.full(len(_table), False).tolist()
        for _filter in filter_list:
            syno_cond = (syno_cond) | (_table[_filter[0]].str.contains(
                _filter[1]))

        syno_cond = syno_cond | cond
        _out_table = _table.loc[list(
            filtered_set.union(set(_table[syno_cond].index)))]

    return {'out_table': _out_table}
Esempio n. 29
0
def discretize_quantile(table, group_by=None, **params):
    """Validate the arguments and run `_discretize_quantile`.

    After the required-parameter check and default filling, `num_of_buckets`
    is checked with ``greater_than_or_equal_to(..., 1, ...)``. With
    `group_by` the call is routed through `_function_by_group`.
    """
    check_required_parameters(_discretize_quantile, params, ['table'])

    params = get_default_from_parameters_if_required(params, _discretize_quantile)
    validate(greater_than_or_equal_to(params, 1, 'num_of_buckets'))

    if group_by is None:
        return _discretize_quantile(table, **params)
    return _function_by_group(_discretize_quantile, table, group_by=group_by, **params)
Esempio n. 30
0
def association_rule(table, group_by=None, **params):
    """Validate the arguments and run `_association_rule`, optionally per group.

    After the required-parameter check and default filling, `min_support`
    and `min_confidence` are each checked with ``from_to(..., 0, 1, ...)``.
    With `group_by` the call is routed through `_function_by_group`.
    """
    check_required_parameters(_association_rule, params, ['table'])
    params = get_default_from_parameters_if_required(params, _association_rule)
    validate(
        from_to(params, 0, 1, 'min_support'),
        from_to(params, 0, 1, 'min_confidence'),
    )

    if group_by is None:
        return _association_rule(table, **params)
    return _function_by_group(_association_rule, table, group_by=group_by, **params)