def _replace_missing_string(table, input_cols, fill_method=None, fill_string='', limit=None, downcast=None):
    # Validation : limit >= 1
    if limit is not None:
        validate(greater_than_or_equal_to(limit, 1, 'limit'))

    _table = table.copy()

    if input_cols is None or len(input_cols) == 0:
        _raw_input_cols = _table.columns
    else:
        _raw_input_cols = input_cols

    if fill_method == 'ffill' or fill_method == 'bfill':
        _out_table = _table.fillna(method=fill_method, limit=limit, downcast=downcast)
    else:
        _input_cols = [x for x in _raw_input_cols if table[x].dtype == object]
        _values = {x: fill_string for x in _input_cols}
        _out_table = _table.fillna(value=_values, limit=limit, downcast=downcast)

    return {'out_table': _out_table}

def word2vec_similarity(model, **params):
    check_required_parameters(_word2vec_similarity, params, ['model'])
    params = get_default_from_parameters_if_required(params, _word2vec_similarity)
    param_validation_check = [greater_than_or_equal_to(params, 1, 'topn')]
    validate(*param_validation_check)
    return _word2vec_similarity(model, **params)

def kernel_density_estimation(table, group_by=None, **params):
    check_required_parameters(_kernel_density_estimation, params, ['table'])
    params = get_default_from_parameters_if_required(params, _kernel_density_estimation)
    param_validation_check = [greater_than(params, 0, 'bandwidth')]
    validate(*param_validation_check)

    # 'points' may be a single number, a comma-separated list,
    # or a '<from> to <to> by <step>' range string.
    try:
        points = [np.float64(params['points'])]
    except Exception:
        try:
            points_str = params['points'].split(',')
            points = [np.float64(point) for point in points_str]
        except Exception:
            try:
                p0 = params['points'].split(' to ')
                _from = np.float64(p0[0])
                p1 = p0[1].split(' by ')
                _to = np.float64(p1[0])
                _step = np.float64(p1[1])
                points = np.arange(_from, _to, _step)
            except Exception:
                raise BrighticsFunctionException.from_errors(
                    [{'0100': 'Points is not of Array[Double] type.'}])
    params['points'] = points

    if group_by is not None:
        grouped_model = _function_by_group(_kernel_density_estimation, table, group_by=group_by, **params)
        return grouped_model
    else:
        return _kernel_density_estimation(table, **params)

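# Illustrative sketch (not part of the original module): the three 'points'
# formats accepted by the parsing above. The DataFrame, the 'input_col' keyword,
# and the column name are assumptions made only for this example.
#
#   df = pd.DataFrame({'value': [1.0, 1.5, 2.0, 2.5, 3.0]})
#   kernel_density_estimation(df, input_col='value', bandwidth=0.5, points='2.5')
#   kernel_density_estimation(df, input_col='value', bandwidth=0.5, points='1.0, 2.0, 3.0')
#   kernel_density_estimation(df, input_col='value', bandwidth=0.5, points='0 to 5 by 0.5')
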
def fasttext_similarity(table, model, **params):
    check_required_parameters(_fasttext_similarity, params, ['table', 'model'])
    params = get_default_from_parameters_if_required(params, _fasttext_similarity)
    param_validation_check = [greater_than_or_equal_to(params, 1, 'topn')]
    validate(*param_validation_check)
    return _fasttext_similarity(table, model, **params)

def _svm_classification_train(table, feature_cols, label_col, c=1.0, kernel='rbf', degree=3,
                              gamma='auto', coef0=0.0, shrinking=True, probability=True,
                              tol=1e-3, max_iter=-1, random_state=None):
    validate(greater_than(c, 0.0, 'c'))

    _table = table.copy()
    _feature_cols = _table[feature_cols]
    _label_col = _table[label_col]

    if sklearn_utils.multiclass.type_of_target(_label_col) == 'continuous':
        raise_runtime_error('''Label Column should not be continuous.''')

    _svc = svm.SVC(C=c, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0,
                   shrinking=shrinking, probability=probability, tol=tol,
                   max_iter=max_iter, random_state=random_state)
    _svc_model = _svc.fit(_feature_cols, _label_col)

    get_param = _svc.get_params()
    get_param['feature_cols'] = feature_cols
    get_param['label_col'] = label_col

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## SVM Classification Result
    | ### Parameters
    | {table_parameter}
    """.format(table_parameter=dict2MD(get_param))))

    _model = _model_dict('svc_model')
    _model['svc_model'] = _svc_model
    _model['features'] = feature_cols
    _model['_repr_brtc_'] = rb.get()

    return {'model': _model}

def _replace_missing_number(table, input_cols, fill_method=None, fill_value='value',
                            fill_value_to=0.0, limit=None, downcast=None):
    # Validation : limit >= 1
    if limit is not None:
        validate(greater_than_or_equal_to(limit, 1, 'limit'))

    _table = table.copy()

    if input_cols is None or len(input_cols) == 0:
        _raw_input_cols = _table.columns
    else:
        _raw_input_cols = input_cols

    if fill_method == 'ffill' or fill_method == 'bfill':
        _out_table = _table
        _out_table[_raw_input_cols] = _table[_raw_input_cols].fillna(method=fill_method, limit=limit, downcast=downcast)
    else:
        _input_cols = [x for x in _raw_input_cols if np.issubdtype(table[x].dtype, np.number)]
        if fill_value == 'mean':
            _values = {x: _table[x].dtype.type(np.mean(_table[x].dropna())) for x in _input_cols}
        elif fill_value == 'median':
            _values = {x: _table[x].dtype.type(np.median(_table[x].dropna())) for x in _input_cols}
        elif fill_value == 'min':
            _values = {x: np.min(_table[x].dropna()) for x in _input_cols}
        elif fill_value == 'max':
            _values = {x: np.max(_table[x].dropna()) for x in _input_cols}
        else:
            _values = {x: fill_value_to for x in _input_cols}
        _out_table = _table.fillna(value=_values, limit=limit, downcast=downcast)

    return {'out_table': _out_table}

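# Illustrative sketch (not part of the original module): filling numeric NaNs with
# the column mean via _replace_missing_number. The DataFrame and column name are
# hypothetical; 'input_cols' and 'fill_value' come from the signature above.
#
#   df = pd.DataFrame({'x': [1.0, np.nan, 3.0]})
#   filled = _replace_missing_number(df, input_cols=['x'], fill_value='mean')['out_table']
#   # filled['x'] -> [1.0, 2.0, 3.0]
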
def svm_classification_train(table, group_by=None, **params):
    check_required_parameters(_svm_classification_train, params, ['table', 'gamma_val'])
    params = get_default_from_parameters_if_required(params, _svm_classification_train)

    if params['gamma'] == 'other':
        if 'gamma_val' not in params:
            raise BFE.from_errors([{'0100': 'Gamma value is mandatory when gamma is other'}])
        if params['gamma_val'] <= 0:
            raise BFE.from_errors([{'0100': 'Gamma value must be greater than 0'}])
    else:
        params['gamma_val'] = None

    param_validation_check = [over_to(params, 0.0, 1.0, 'c'),
                              greater_than_or_equal_to(params, 0, 'degree'),
                              greater_than(params, 0.0, 'tol'),
                              greater_than_or_equal_to_or_equal_to(params, 1, -1, 'max_iter')]
    validate(*param_validation_check)

    if group_by is not None:
        grouped_model = _function_by_group(_svm_classification_train, table, group_by=group_by, **params)
        return grouped_model
    else:
        return _svm_classification_train(table, **params)

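# Illustrative sketch (not part of the original module): when gamma is 'other', a
# positive 'gamma_val' must accompany it or validation raises. The table variable
# and column names are hypothetical; the keyword names mirror the checks above.
#
#   svm_classification_train(train_df, feature_cols=['f1', 'f2'], label_col='label',
#                            c=0.5, gamma='other', gamma_val=0.1)
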
def mlp_classification_train(table, group_by=None, **params):
    check_required_parameters(_mlp_classification_train, params, ['table'])
    params = get_default_from_parameters_if_required(params, _mlp_classification_train)

    if params['batch_size_auto']:
        param_validation_check = [greater_than(params, 0.0, 'learning_rate_init'),
                                  greater_than(params, 0.0, 'tol')]
    else:
        if not params['batch_size'] or not isinstance(params['batch_size'], int):
            param_validation_check = [require_param('batch_size')]
            validate(*param_validation_check)
        param_validation_check = [greater_than(params, 0, 'batch_size'),
                                  greater_than(params, 0.0, 'learning_rate_init'),
                                  greater_than(params, 0.0, 'tol')]
    validate(*param_validation_check)

    if group_by is not None:
        grouped_model = _function_by_group(_mlp_classification_train, table, group_by=group_by, **params)
        return grouped_model
    else:
        return _mlp_classification_train(table, **params)

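# Illustrative sketch (not part of the original module): with batch_size_auto=False
# a positive integer 'batch_size' is required, otherwise require_param fails. The
# table variable and column names are hypothetical.
#
#   mlp_classification_train(train_df, feature_cols=['f1', 'f2'], label_col='label',
#                            batch_size_auto=False, batch_size=32,
#                            learning_rate_init=0.001, tol=1e-4)
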
def topic_name_extraction(table, **params):
    check_required_parameters(_topic_name_extraction, params, ['table'])
    params = get_default_from_parameters_if_required(params, _topic_name_extraction)
    param_validation_check = [greater_than_or_equal_to(params, 1, 'topn')]
    validate(*param_validation_check)
    return _topic_name_extraction(table, **params)

def decision_tree_classification_train(table, group_by=None, **params):
    check_required_parameters(_decision_tree_classification_train, params, ['table'])
    params = get_default_from_parameters_if_required(params, _decision_tree_classification_train)
    param_validation_check = [greater_than_or_equal_to(params, 2, 'min_samples_split'),
                              greater_than_or_equal_to(params, 1, 'min_samples_leaf'),
                              greater_than_or_equal_to(params, 0.0, 'min_weight_fraction_leaf'),
                              greater_than_or_equal_to(params, 0.0, 'min_impurity_decrease'),
                              greater_than_or_equal_to(params, 1, 'max_depth'),
                              greater_than_or_equal_to(params, 1, 'max_features'),
                              greater_than_or_equal_to(params, 1, 'max_leaf_nodes')]
    validate(*param_validation_check)
    if group_by is not None:
        grouped_model = _function_by_group(_decision_tree_classification_train, table, group_by=group_by, **params)
        return grouped_model
    else:
        return _decision_tree_classification_train(table, **params)

def ngram(table, **params):
    check_required_parameters(_ngram, params, ['table'])
    params = get_default_from_parameters_if_required(params, _ngram)
    param_validation_check = [greater_than_or_equal_to(params, 1, 'n')]
    validate(*param_validation_check)
    return _ngram(table, **params)

def moving_average(table, group_by=None, **params):
    check_required_parameters(_moving_average, params, ['table'])
    params = get_default_from_parameters_if_required(params, _moving_average)
    param_validation_check = [greater_than_or_equal_to(params, 1, 'window_size')]
    validate(*param_validation_check)
    if group_by is not None:
        return _function_by_group(_moving_average, table, group_by=group_by, **params)
    else:
        return _moving_average(table, **params)

def replace_missing_string(table, group_by=None, **params):
    check_required_parameters(_replace_missing_string, params, ['table'])
    params = get_default_from_parameters_if_required(params, _replace_missing_string)
    param_validation_check = [greater_than_or_equal_to(params, 1, 'limit')]
    validate(*param_validation_check)
    if group_by is not None:
        return _function_by_group(_replace_missing_string, table, group_by=group_by, **params)
    else:
        return _replace_missing_string(table, **params)

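# Illustrative sketch (not part of the original module): replacing missing strings
# with a placeholder. The DataFrame and column name are hypothetical; 'input_cols'
# and 'fill_string' come from _replace_missing_string above.
#
#   df = pd.DataFrame({'text': ['a', None, 'b']})
#   out = replace_missing_string(df, input_cols=['text'], fill_string='N/A')['out_table']
#   # out['text'] -> ['a', 'N/A', 'b']
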
def ngram(table, **params):  # to be deprecated
    check_required_parameters(_ngram, params, ['table'])
    params = get_default_from_parameters_if_required(params, _ngram)
    max_len = np.max(np.vectorize(len)(table[params["input_col"]])).item()
    param_validation_check = [from_to(params, 1, max_len, 'n')]
    validate(*param_validation_check)
    return _ngram(table, **params)

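# Illustrative sketch (not part of the original module): in this variant 'n' is
# validated against the longest string in the input column. The DataFrame and the
# 'input_col' value below are hypothetical.
#
#   df = pd.DataFrame({'text': ['abcd', 'efg']})
#   ngram(df, input_col='text', n=3)   # passes: 1 <= 3 <= 4
#   ngram(df, input_col='text', n=5)   # rejected by from_to(1, 4, 'n')
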
def add_shift(table, group_by=None, **params):
    check_required_parameters(_add_shift, params, ['table'])
    params = get_default_from_parameters_if_required(params, _add_shift)
    param_validation_check = [all_elements_greater_than_or_equal_to(params, 0, 'shift_list')]
    validate(*param_validation_check)
    if group_by is not None:
        return _function_by_group(_add_shift, table, group_by=group_by, **params)
    else:
        return _add_shift(table, **params)

def paired_ttest(table, group_by=None, **params):
    check_required_parameters(_paired_ttest, params, ['table'])
    params = get_default_from_parameters_if_required(params, _paired_ttest)
    param_validation_check = [from_to(params, 0, 1, 'confidence_level')]
    validate(*param_validation_check)
    if group_by is not None:
        return _function_by_group(_paired_ttest, table, group_by=group_by, **params)
    else:
        return _paired_ttest(table, **params)

def naive_bayes_train(table, group_by=None, **params):
    params = get_default_from_parameters_if_required(params, _naive_bayes_train)
    param_validation_check = [greater_than(params, 0, 'alpha')]
    validate(*param_validation_check)
    check_required_parameters(_naive_bayes_train, params, ['table'])
    if group_by is not None:
        return _function_by_group(_naive_bayes_train, table, group_by=group_by, **params)
    else:
        return _naive_bayes_train(table, **params)

def levenes_test(table, group_by=None, **params):
    check_required_parameters(_levenes_test, params, ['table'])
    params = get_default_from_parameters_if_required(params, _levenes_test)
    if params['center'] == 'trimmed':
        param_validation_check = [greater_than_or_equal_to(params, 0.0, 'proportiontocut')]
        validate(*param_validation_check)
    if group_by is not None:
        return _function_by_group(_levenes_test, table, group_by=group_by, **params)
    else:
        return _levenes_test(table, **params)

def timeseries_decomposition(table, group_by=None, **params):
    check_required_parameters(_timeseries_decomposition, params, ['table'])
    params = get_default_from_parameters_if_required(params, _timeseries_decomposition)
    param_validation_check = [greater_than_or_equal_to(params, 1, 'frequency'),
                              greater_than_or_equal_to(params, 0, 'extrapolate_trend')]
    validate(*param_validation_check)
    if group_by is not None:
        return _function_by_group(_timeseries_decomposition, table, group_by=group_by, **params)
    else:
        return _timeseries_decomposition(table, **params)

def two_sample_ttest_for_stacked_data(table, group_by=None, **params):
    params = get_default_from_parameters_if_required(params, _two_sample_ttest_for_stacked_data)
    param_validation_check = [from_to(params, 0, 1, 'confi_level')]
    validate(*param_validation_check)
    check_required_parameters(_two_sample_ttest_for_stacked_data, params, ['table'])
    if group_by is not None:
        return _function_by_group(_two_sample_ttest_for_stacked_data, table, group_by=group_by, **params)
    else:
        return _two_sample_ttest_for_stacked_data(table, **params)

def knn_regression(train_table, test_table, **params):
    check_required_parameters(_knn_regression, params, ['train_table', 'test_table'])
    params = get_default_from_parameters_if_required(params, _knn_regression)
    param_validation_check = [greater_than_or_equal_to(params, 1, 'k'),
                              greater_than_or_equal_to(params, 1, 'leaf_size'),
                              greater_than_or_equal_to(params, 1, 'p')]
    validate(*param_validation_check)
    return _knn_regression(train_table, test_table, **params)

def ewma(table, group_by=None, **params):
    check_required_parameters(_ewma, params, ['table'])
    params = get_default_from_parameters_if_required(params, _ewma)
    param_validation_check = [greater_than_or_equal_to(params, 1, 'period_number'),
                              from_to(params, 0, 1, 'custom_ratio')]
    validate(*param_validation_check)
    if group_by is not None:
        return _function_by_group(_ewma, table, group_by=group_by, **params)
    else:
        return _ewma(table, **params)

def outlier_detection_tukey_carling(table, group_by=None, **params):
    check_required_parameters(_outlier_detection_tukey_carling, params, ['table'])
    params = get_default_from_parameters_if_required(params, _outlier_detection_tukey_carling)
    param_validation_check = [greater_than(params, 0.0, 'multiplier')]
    validate(*param_validation_check)
    if group_by is not None:
        return _function_by_group(_outlier_detection_tukey_carling, table, group_by=group_by, **params)
    else:
        return _outlier_detection_tukey_carling(table, **params)

def outlier_detection_lof(table, group_by=None, **params):
    check_required_parameters(_outlier_detection_lof, params, ['table'])
    params = get_default_from_parameters_if_required(params, _outlier_detection_lof)
    param_validation_check = [greater_than_or_equal_to(params, 1, 'n_neighbors')]
    validate(*param_validation_check)
    if group_by is not None:
        return _function_by_group(_outlier_detection_lof, table, group_by=group_by, **params)
    else:
        return _outlier_detection_lof(table, **params)

def tsne(table, group_by=None, **params):
    check_required_parameters(_tsne, params, ['table'])
    params = get_default_from_parameters_if_required(params, _tsne)
    param_validation_check = [from_to(params, 1, len(params['input_cols']), 'n_components')]
    validate(*param_validation_check)
    if group_by is not None:
        grouped_model = _function_by_group(_tsne, table, group_by=group_by, **params)
        return grouped_model
    else:
        return _tsne(table, **params)

def statistic_derivation(table, group_by=None, **params):
    check_required_parameters(_statistic_derivation, params, ['table'])
    params = get_default_from_parameters_if_required(params, _statistic_derivation)
    param_validation_check = [all_elements_from_to(params, 0, 100, 'percentile_amounts'),
                              all_elements_from_under(params, 0, 0.5, 'trimmed_mean_amounts')]
    validate(*param_validation_check)
    if group_by is not None:
        return _function_by_group(_statistic_derivation, table, group_by=group_by, **params)
    else:
        return _statistic_derivation(table, **params)

def doc_summarizer_eng(table, **params):
    check_required_parameters(_doc_summarizer_eng, params, ['table'])
    params = get_default_from_parameters_if_required(params, _doc_summarizer_eng)
    param_validation_check = [greater_than(params, 0, 'ratio'),
                              greater_than_or_equal_to(params, 1, 'num_sentence')]
    validate(*param_validation_check)
    return _doc_summarizer_eng(table, **params)

def _search(table, user_dict=pd.DataFrame(), input_cols=[], search_words=[], synonym_dict=[], main_operator='and'):
    if len(search_words) == 0:
        raise BrighticsFunctionException('0033', 'Search Words')
    for search_word in search_words:
        if search_word is None:
            raise BrighticsFunctionException('0033', 'Search Words')

    _table = table.copy()

    filter_list = []
    if len(input_cols) == 0:
        validate(require_param('input_cols'))
    for _list in product(input_cols, search_words):
        c, od = _list
        filter_list.append([c, od.strip('\'')])

    _out_table = _table
    filtered_set = set(_out_table.index)

    # Keep only rows whose input columns contain every search word.
    cond = np.full(len(_table), True).tolist()
    for _filter in filter_list:
        cond = (cond) & (_table[_filter[0]].str.contains(_filter[1]))
    _out_table = _table.loc[list(filtered_set.intersection(set(_table[cond].index)))]

    if len(user_dict.index) != 0:
        # Expand the query with synonyms from the user dictionary and OR them
        # into the original condition.
        filter_list = []
        search_words = [user_dict['value'][i] for i, key in enumerate(user_dict['key']) if key in search_words]
        for _list in product(input_cols, search_words):
            c, od = _list
            filter_list.append([c, od.strip('\'')])
        filtered_set = set()
        syno_cond = np.full(len(_table), False).tolist()
        for _filter in filter_list:
            syno_cond = (syno_cond) | (_table[_filter[0]].str.contains(_filter[1]))
        syno_cond = syno_cond | cond
        _out_table = _table.loc[list(filtered_set.union(set(_table[syno_cond].index)))]

    return {'out_table': _out_table}

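# Illustrative sketch (not part of the original module): _search keeps rows whose
# 'input_cols' contain every search word, then widens the match with synonyms from
# 'user_dict' (a key/value DataFrame). All data below is hypothetical.
#
#   docs = pd.DataFrame({'text': ['red apple pie', 'green apple', 'red berry']})
#   syn = pd.DataFrame({'key': ['apple'], 'value': ['berry']})
#   _search(docs, user_dict=syn, input_cols=['text'], search_words=['red', 'apple'])
#   # matches row 0 ('red' and 'apple') plus row 2 via the synonym 'berry'
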
def discretize_quantile(table, group_by=None, **params):
    check_required_parameters(_discretize_quantile, params, ['table'])
    params = get_default_from_parameters_if_required(params, _discretize_quantile)
    param_validation_check = [greater_than_or_equal_to(params, 1, 'num_of_buckets')]
    validate(*param_validation_check)
    if group_by is not None:
        return _function_by_group(_discretize_quantile, table, group_by=group_by, **params)
    else:
        return _discretize_quantile(table, **params)

def association_rule(table, group_by=None, **params):
    check_required_parameters(_association_rule, params, ['table'])
    params = get_default_from_parameters_if_required(params, _association_rule)
    param_validation_check = [from_to(params, 0, 1, 'min_support'),
                              from_to(params, 0, 1, 'min_confidence')]
    validate(*param_validation_check)
    if group_by is not None:
        return _function_by_group(_association_rule, table, group_by=group_by, **params)
    else:
        return _association_rule(table, **params)